Diffstat (limited to 'fs')
878 files changed, 35175 insertions, 51168 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index de009a33e0e2..f84412290a30 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -131,10 +131,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); - } else { - if (dentry->d_inode) - ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); } + if (!ret && dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); return ret; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 281a1ed03a04..77e9c4387c1d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -659,21 +659,6 @@ static void v9fs_destroy_inode_cache(void) kmem_cache_destroy(v9fs_inode_cache); } -static int v9fs_cache_register(void) -{ - int ret; - - ret = v9fs_init_inode_cache(); - if (ret < 0) - return ret; - return ret; -} - -static void v9fs_cache_unregister(void) -{ - v9fs_destroy_inode_cache(); -} - /** * init_v9fs - Initialize module * @@ -686,7 +671,7 @@ static int __init init_v9fs(void) pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ - err = v9fs_cache_register(); + err = v9fs_init_inode_cache(); if (err < 0) { pr_err("Failed to register v9fs for caching\n"); return err; @@ -709,7 +694,7 @@ out_sysfs_cleanup: v9fs_sysfs_cleanup(); out_cache: - v9fs_cache_unregister(); + v9fs_destroy_inode_cache(); return err; } @@ -722,7 +707,7 @@ out_cache: static void __exit exit_v9fs(void) { v9fs_sysfs_cleanup(); - v9fs_cache_unregister(); + v9fs_destroy_inode_cache(); unregister_filesystem(&v9fs_fs_type); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1775fcc7f0e8..698c43dd5dc8 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -179,14 +179,16 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, - bool new); +extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern const struct netfs_request_ops v9fs_req_ops; -extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb, - struct p9_fid *fid, bool new); +extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, + struct p9_fid *fid, + struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 @@ -225,12 +227,30 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb, bool new) + struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_fid_iget_dotl(sb, fid, new); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else - return v9fs_fid_iget(sb, fid, new); + return v9fs_inode_from_fid(v9ses, fid, sb, 0); +} + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) 
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 7923c3c347cb..d3aefbec4de6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -42,7 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev); + struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); #if (BITS_PER_LONG == 32) #define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32))) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c75233235..32619d146cbc 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -79,11 +81,13 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - - if (!err) + if (!err && total) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } - netfs_read_subreq_terminated(subreq, err, false); + subreq->error = err; + netfs_read_subreq_terminated(subreq); } /** diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index fd72fc38c8f5..3e68521f4e2f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -256,12 +256,9 @@ void v9fs_set_netfs_context(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev) + struct inode *inode, umode_t mode, dev_t rdev) { int err = 0; - struct v9fs_inode *v9inode = V9FS_I(inode); - - memcpy(&v9inode->qid, qid, sizeof(struct p9_qid)); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; @@ -365,59 +362,105 @@ void v9fs_evict_inode(struct inode *inode) clear_inode(inode); } -struct inode * -v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new) +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + dev_t rdev; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st, &rdev); + /* don't match inode of different type */ + if (inode_wrong_type(inode, umode)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; + return 1; +} + +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + +static struct inode *v9fs_qid_iget(struct super_block *sb, + struct p9_qid *qid, + struct p9_wstat *st, + int new) { dev_t rdev; int retval; umode_t umode; struct inode *inode; - struct p9_wstat *st; struct 
v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *inode, void *data); - inode = iget_locked(sb, QID2INO(&fid->qid)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { - if (!new) { - goto done; - } else { - p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n", - inode->i_ino); - iput(inode); - remove_inode_hash(inode); - inode = iget_locked(sb, QID2INO(&fid->qid)); - WARN_ON(!(inode->i_state & I_NEW)); - } - } + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; + inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto error; - } - + inode->i_ino = QID2INO(qid); umode = p9mode2unixmode(v9ses, st, &rdev); - retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev); - v9fs_stat2inode(st, inode, sb, 0); - p9stat_free(st); - kfree(st); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; + v9fs_stat2inode(st, inode, sb, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); -done: return inode; error: iget_failed(inode); return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_wstat *st; + struct inode *inode = NULL; + + st = p9_client_stat(fid); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget(sb, &st->qid, st, new); + p9stat_free(st); + kfree(st); + return inode; } /** @@ -449,15 +492,8 @@ static int v9fs_at_to_dotl_flags(int flags) */ static void v9fs_dec_count(struct inode *inode) { - if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) { - if (inode->i_nlink) { - drop_nlink(inode); - } else { - p9_debug(P9_DEBUG_VFS, - "WARNING: unexpected i_nlink zero %d inode %ld\n", - inode->i_nlink, inode->i_ino); - } - } + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } /** @@ -508,9 +544,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) } else v9fs_dec_count(inode); - if (inode->i_nlink <= 0) /* no more refs unhash it */ - remove_inode_hash(inode); - v9fs_invalidate_inode_attr(inode); v9fs_invalidate_inode_attr(dir); @@ -576,7 +609,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, /* * instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, @@ -704,8 +737,10 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; else if (IS_ERR(fid)) inode = ERR_CAST(fid); + else if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index c61b97bd13b9..143ac03b7425 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -52,50 +52,80 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode 
*dir_inode) return current_fsgid(); } +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + /* don't match inode of different type */ + if (inode_wrong_type(inode, st->st_mode)) + return 0; -struct inode * -v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new) + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; + return 1; +} + +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + +static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, + struct p9_qid *qid, + struct p9_fid *fid, + struct p9_stat_dotl *st, + int new) { int retval; struct inode *inode; - struct p9_stat_dotl *st; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *inode, void *data); - inode = iget_locked(sb, QID2INO(&fid->qid)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { - if (!new) { - goto done; - } else { /* deal with race condition in inode number reuse */ - p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n", - inode->i_ino); - iput(inode); - remove_inode_hash(inode); - inode = iget_locked(sb, QID2INO(&fid->qid)); - WARN_ON(!(inode->i_state & I_NEW)); - } - } + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; + inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. 
*/ - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto error; - } - - retval = v9fs_init_inode(v9ses, inode, &fid->qid, + inode->i_ino = QID2INO(qid); + retval = v9fs_init_inode(v9ses, inode, st->st_mode, new_decode_dev(st->st_rdev)); - v9fs_stat2inode_dotl(st, inode, 0); - kfree(st); if (retval) goto error; + v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); @@ -103,11 +133,27 @@ v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new) goto error; unlock_new_inode(inode); -done: return inode; error: iget_failed(inode); return ERR_PTR(retval); + +} + +struct inode * +v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb, int new) +{ + struct p9_stat_dotl *st; + struct inode *inode = NULL; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) + return ERR_CAST(st); + + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); + kfree(st); + return inode; } struct dotl_openflag_map { @@ -259,7 +305,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -309,6 +355,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, umode_t omode) { int err; + struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; @@ -318,6 +365,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); + v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) @@ -352,7 +400,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -749,6 +797,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; umode_t mode; + struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; @@ -758,6 +807,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); + v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); @@ -788,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, err); goto error; } - inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f52fdf42945c..489db161abc9 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -139,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb); if 
(IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; diff --git a/fs/Kconfig b/fs/Kconfig index 949895cff872..64d420e3c475 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,7 +43,6 @@ config FS_MBCACHE default y if EXT4_FS=y default m if EXT2_FS_XATTR || EXT4_FS -source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" source "fs/xfs/Kconfig" @@ -388,7 +387,7 @@ config NFS_COMMON config NFS_COMMON_LOCALIO_SUPPORT tristate - default n + depends on NFS_LOCALIO default y if NFSD=y || NFS_FS=y default m if NFSD=m && NFS_FS=m select SUNRPC diff --git a/fs/Makefile b/fs/Makefile index 61679fd587b7..15df0a923d3a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_NETFS_SUPPORT) += netfs/ -obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ # We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the # ext2 driver, which doesn't know about journalling! Explicitly request ext2 diff --git a/fs/adfs/super.c b/fs/adfs/super.c index f0b999a4961b..017c48a80203 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -6,7 +6,8 @@ */ #include <linux/module.h> #include <linux/init.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix}; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_ownmask, "ownmask=%o"}, - {Opt_othmask, "othmask=%o"}, - {Opt_ftsuffix, "ftsuffix=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec adfs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("ownmask", Opt_ownmask), + fsparam_u32oct ("othmask", Opt_othmask), + fsparam_u32 ("ftsuffix", Opt_ftsuffix), + {} }; -static int parse_options(struct super_block *sb, struct adfs_sb_info *asb, - char *options) +static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - int option; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(asb->s_uid)) - return -EINVAL; - break; - case Opt_gid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(asb->s_gid)) - return -EINVAL; - break; - case Opt_ownmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_owner_mask = option; - break; - case Opt_othmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_other_mask = option; - break; - case Opt_ftsuffix: - if (match_int(args, &option)) - return -EINVAL; - asb->s_ftsuffix = option; - break; - default: - adfs_msg(sb, KERN_ERR, - "unrecognised mount option \"%s\" or missing value", - p); - return -EINVAL; - } + struct adfs_sb_info *asb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, adfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + asb->s_uid = result.uid; + break; + case 
Opt_gid: + asb->s_gid = result.gid; + break; + case Opt_ownmask: + asb->s_owner_mask = result.uint_32; + break; + case Opt_othmask: + asb->s_other_mask = result.uint_32; + break; + case Opt_ftsuffix: + asb->s_ftsuffix = result.uint_32; + break; + default: + return -EINVAL; } return 0; } -static int adfs_remount(struct super_block *sb, int *flags, char *data) +static int adfs_reconfigure(struct fs_context *fc) { - struct adfs_sb_info temp_asb; - int ret; + struct adfs_sb_info *new_asb = fc->s_fs_info; + struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb); - sync_filesystem(sb); - *flags |= ADFS_SB_FLAGS; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= ADFS_SB_FLAGS; - temp_asb = *ADFS_SB(sb); - ret = parse_options(sb, &temp_asb, data); - if (ret == 0) - *ADFS_SB(sb) = temp_asb; + /* Structure copy newly parsed options */ + *asb = *new_asb; - return ret; + return 0; } static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = { .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, - .remount_fs = adfs_remount, .show_options = adfs_show_options, }; @@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, return 0; } -static int adfs_fill_super(struct super_block *sb, void *data, int silent) +static int adfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct adfs_discrecord *dr; struct object_info root_obj; - struct adfs_sb_info *asb; + struct adfs_sb_info *asb = sb->s_fs_info; struct inode *root; int ret = -EINVAL; + int silent = fc->sb_flags & SB_SILENT; sb->s_flags |= ADFS_SB_FLAGS; - asb = kzalloc(sizeof(*asb), GFP_KERNEL); - if (!asb) - return -ENOMEM; - sb->s_fs_info = asb; sb->s_magic = ADFS_SUPER_MAGIC; sb->s_time_gran = 10000000; - /* set default options */ - asb->s_uid = GLOBAL_ROOT_UID; - asb->s_gid = GLOBAL_ROOT_GID; - asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; - asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; - asb->s_ftsuffix = 0; - - if (parse_options(sb, asb, data)) - goto error; - /* Try to probe the filesystem boot block */ ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); if (ret == -EILSEQ) @@ -453,18 +414,61 @@ error: return ret; } -static struct dentry *adfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int adfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, adfs_fill_super); +} + +static void adfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); + struct adfs_context *asb = fc->s_fs_info; + + kfree(asb); +} + +static const struct fs_context_operations adfs_context_ops = { + .parse_param = adfs_parse_param, + .get_tree = adfs_get_tree, + .reconfigure = adfs_reconfigure, + .free = adfs_free_fc, +}; + +static int adfs_init_fs_context(struct fs_context *fc) +{ + struct adfs_sb_info *asb; + + asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL); + if (!asb) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct adfs_sb_info *old_asb = ADFS_SB(sb); + + /* structure copy existing options before parsing */ + *asb = *old_asb; + } else { + /* set default options */ + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + } + + fc->ops = &adfs_context_ops; + fc->s_fs_info = asb; + + return 0; } 
static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", - .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = adfs_init_fs_context, + .parameters = adfs_param_spec, }; MODULE_ALIAS_FS("adfs"); diff --git a/fs/affs/super.c b/fs/affs/super.c index 3c5821339609..2fa40337776d 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -14,7 +14,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/statfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/cred.h> @@ -27,7 +28,6 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_show_options(struct seq_file *m, struct dentry *root); -static int affs_remount (struct super_block *sb, int *flags, char *data); static void affs_commit_super(struct super_block *sb, int wait) @@ -155,140 +155,114 @@ static const struct super_operations affs_sops = { .put_super = affs_put_super, .sync_fs = affs_sync_fs, .statfs = affs_statfs, - .remount_fs = affs_remount, .show_options = affs_show_options, }; enum { Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, - Opt_verbose, Opt_volume, Opt_ignore, Opt_err, + Opt_verbose, Opt_volume, Opt_ignore, }; -static const match_table_t tokens = { - {Opt_bs, "bs=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_mufs, "mufs"}, - {Opt_notruncate, "nofilenametruncate"}, - {Opt_prefix, "prefix=%s"}, - {Opt_protect, "protect"}, - {Opt_reserved, "reserved=%u"}, - {Opt_root, "root=%u"}, - {Opt_setgid, "setgid=%u"}, - {Opt_setuid, "setuid=%u"}, - {Opt_verbose, "verbose"}, - {Opt_volume, "volume=%s"}, - {Opt_ignore, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, - {Opt_err, NULL}, +struct affs_context { + kuid_t uid; /* uid to override */ + kgid_t gid; /* gid to override */ + unsigned int mode; /* mode to override */ + unsigned int reserved; /* Number of reserved blocks */ + int root_block; /* FFS root block number */ + int blocksize; /* Initial device blksize */ + char *prefix; /* Prefix for volumes and assigns */ + char volume[32]; /* Vol. 
prefix for absolute symlinks */ + unsigned long mount_flags; /* Options */ }; -static int -parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, - int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +static const struct fs_parameter_spec affs_param_spec[] = { + fsparam_u32 ("bs", Opt_bs), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("mufs", Opt_mufs), + fsparam_flag ("nofilenametruncate", Opt_notruncate), + fsparam_string ("prefix", Opt_prefix), + fsparam_flag ("protect", Opt_protect), + fsparam_u32 ("reserved", Opt_reserved), + fsparam_u32 ("root", Opt_root), + fsparam_gid ("setgid", Opt_setgid), + fsparam_uid ("setuid", Opt_setuid), + fsparam_flag ("verbose", Opt_verbose), + fsparam_string ("volume", Opt_volume), + fsparam_flag ("grpquota", Opt_ignore), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("quota", Opt_ignore), + fsparam_flag ("usrquota", Opt_ignore), + {}, +}; + +static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - - /* Fill in defaults */ - - *uid = current_uid(); - *gid = current_gid(); - *reserved = 2; - *root = -1; - *blocksize = -1; - volume[0] = ':'; - volume[1] = 0; - *mount_opts = 0; - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token, n, option; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_bs: - if (match_int(&args[0], &n)) - return 0; - if (n != 512 && n != 1024 && n != 2048 - && n != 4096) { - pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); - return 0; - } - *blocksize = n; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - *mode = option & 0777; - affs_set_opt(*mount_opts, SF_SETMODE); - break; - case Opt_mufs: - affs_set_opt(*mount_opts, SF_MUFS); - break; - case Opt_notruncate: - affs_set_opt(*mount_opts, SF_NO_TRUNCATE); - break; - case Opt_prefix: - kfree(*prefix); - *prefix = match_strdup(&args[0]); - if (!*prefix) - return 0; - affs_set_opt(*mount_opts, SF_PREFIX); - break; - case Opt_protect: - affs_set_opt(*mount_opts, SF_IMMUTABLE); - break; - case Opt_reserved: - if (match_int(&args[0], reserved)) - return 0; - break; - case Opt_root: - if (match_int(&args[0], root)) - return 0; - break; - case Opt_setgid: - if (match_int(&args[0], &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - affs_set_opt(*mount_opts, SF_SETGID); - break; - case Opt_setuid: - if (match_int(&args[0], &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - affs_set_opt(*mount_opts, SF_SETUID); - break; - case Opt_verbose: - affs_set_opt(*mount_opts, SF_VERBOSE); - break; - case Opt_volume: { - char *vol = match_strdup(&args[0]); - if (!vol) - return 0; - strscpy(volume, vol, 32); - kfree(vol); - break; - } - case Opt_ignore: - /* Silently ignore the quota options */ - break; - default: - pr_warn("Unrecognized mount option \"%s\" or missing value\n", - p); - return 0; + struct affs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int n; + int opt; + + opt = fs_parse(fc, affs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_bs: + n = result.uint_32; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return -EINVAL; } + ctx->blocksize = n; + break; + case Opt_mode: + ctx->mode 
= result.uint_32 & 0777; + affs_set_opt(ctx->mount_flags, SF_SETMODE); + break; + case Opt_mufs: + affs_set_opt(ctx->mount_flags, SF_MUFS); + break; + case Opt_notruncate: + affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE); + break; + case Opt_prefix: + kfree(ctx->prefix); + ctx->prefix = param->string; + param->string = NULL; + affs_set_opt(ctx->mount_flags, SF_PREFIX); + break; + case Opt_protect: + affs_set_opt(ctx->mount_flags, SF_IMMUTABLE); + break; + case Opt_reserved: + ctx->reserved = result.uint_32; + break; + case Opt_root: + ctx->root_block = result.uint_32; + break; + case Opt_setgid: + ctx->gid = result.gid; + affs_set_opt(ctx->mount_flags, SF_SETGID); + break; + case Opt_setuid: + ctx->uid = result.uid; + affs_set_opt(ctx->mount_flags, SF_SETUID); + break; + case Opt_verbose: + affs_set_opt(ctx->mount_flags, SF_VERBOSE); + break; + case Opt_volume: + strscpy(ctx->volume, param->string, 32); + break; + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + return -EINVAL; } - return 1; + return 0; } static int affs_show_options(struct seq_file *m, struct dentry *root) @@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root) * hopefully have the guts to do so. Until then: sorry for the mess. */ -static int affs_fill_super(struct super_block *sb, void *data, int silent) +static int affs_fill_super(struct super_block *sb, struct fs_context *fc) { struct affs_sb_info *sbi; + struct affs_context *ctx = fc->fs_private; struct buffer_head *root_bh = NULL; struct buffer_head *boot_bh; struct inode *root_inode = NULL; - s32 root_block; + int silent = fc->sb_flags & SB_SILENT; int size, blocksize; u32 chksum; int num_bm; int i, j; - kuid_t uid; - kgid_t gid; - int reserved; - unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; int ret; - pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); - sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; sb->s_flags |= SB_NODIRATIME; @@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); - if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, - &blocksize,&sbi->s_prefix, - sbi->s_volume, &mount_flags)) { - pr_err("Error parsing options\n"); - return -EINVAL; - } - /* N.B. after this point s_prefix must be released */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; + sbi->s_reserved = ctx->reserved; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; + memcpy(sbi->s_volume, ctx->volume, 32); - sbi->s_flags = mount_flags; - sbi->s_mode = i; - sbi->s_uid = uid; - sbi->s_gid = gid; - sbi->s_reserved= reserved; + /* N.B. after this point s_prefix must be released */ /* Get the size of the device in 512-byte blocks. 
* If we later see that the partition uses bigger @@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) i = bdev_logical_block_size(sb->s_bdev); j = PAGE_SIZE; + blocksize = ctx->blocksize; if (blocksize > 0) { i = j = blocksize; size = size / (blocksize / 512); } for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { - sbi->s_root_block = root_block; - if (root_block < 0) - sbi->s_root_block = (reserved + size - 1) / 2; + sbi->s_root_block = ctx->root_block; + if (ctx->root_block < 0) + sbi->s_root_block = (ctx->reserved + size - 1) / 2; pr_debug("setting blocksize to %d\n", blocksize); affs_set_blocksize(sb, blocksize); sbi->s_partition_size = size; @@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) "size=%d, reserved=%d\n", sb->s_id, sbi->s_root_block + num_bm, - blocksize, size, reserved); + ctx->blocksize, size, ctx->reserved); root_bh = affs_bread(sb, sbi->s_root_block + num_bm); if (!root_bh) continue; @@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) got_root: /* Keep super block in cache */ sbi->s_root_bh = root_bh; - root_block = sbi->s_root_block; + ctx->root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); @@ -506,7 +473,7 @@ got_root: return -EINVAL; } - if (affs_test_opt(mount_flags, SF_VERBOSE)) { + if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 31 : len, @@ -528,7 +495,7 @@ got_root: /* set up enough so that it can read an inode */ - root_inode = affs_iget(sb, root_block); + root_inode = affs_iget(sb, ctx->root_block); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -548,56 +515,43 @@ got_root: return 0; } -static int -affs_remount(struct super_block *sb, int *flags, char *data) +static int affs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + struct affs_context *ctx = fc->fs_private; struct affs_sb_info *sbi = AFFS_SB(sb); - int blocksize; - kuid_t uid; - kgid_t gid; - int mode; - int reserved; - int root_block; - unsigned long mount_flags; int res = 0; - char volume[32]; - char *prefix = NULL; - - pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= SB_NODIRATIME; - - memcpy(volume, sbi->s_volume, 32); - if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &prefix, volume, - &mount_flags)) { - kfree(prefix); - return -EINVAL; - } + fc->sb_flags |= SB_NODIRATIME; flush_delayed_work(&sbi->sb_work); - sbi->s_flags = mount_flags; - sbi->s_mode = mode; - sbi->s_uid = uid; - sbi->s_gid = gid; + /* + * NB: Historically, only mount_flags, mode, uid, gic, prefix, + * and volume are accepted during remount. 
+ */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; /* protect against readers */ spin_lock(&sbi->symlink_lock); - if (prefix) { + if (ctx->prefix) { kfree(sbi->s_prefix); - sbi->s_prefix = prefix; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; } - memcpy(sbi->s_volume, volume, 32); + memcpy(sbi->s_volume, ctx->volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & SB_RDONLY) + if (fc->sb_flags & SB_RDONLY) affs_free_bitmap(sb); else - res = affs_init_bitmap(sb, flags); + res = affs_init_bitmap(sb, &fc->sb_flags); return res; } @@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry *affs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int affs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); + return get_tree_bdev(fc, affs_fill_super); } static void affs_kill_sb(struct super_block *sb) @@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb) } } +static void affs_free_fc(struct fs_context *fc) +{ + struct affs_context *ctx = fc->fs_private; + + kfree(ctx->prefix); + kfree(ctx); +} + +static const struct fs_context_operations affs_context_ops = { + .parse_param = affs_parse_param, + .get_tree = affs_get_tree, + .reconfigure = affs_reconfigure, + .free = affs_free_fc, +}; + +static int affs_init_fs_context(struct fs_context *fc) +{ + struct affs_context *ctx; + + ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct affs_sb_info *sbi = AFFS_SB(sb); + + /* + * NB: historically, no options other than volume were + * preserved across a remount unless they were explicitly + * passed in. 
+ */ + memcpy(ctx->volume, sbi->s_volume, 32); + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->reserved = 2; + ctx->root_block = -1; + ctx->blocksize = -1; + ctx->volume[0] = ':'; + } + + fc->ops = &affs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", - .mount = affs_mount, .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = affs_init_fs_context, + .parameters = affs_param_spec, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/afs/Makefile b/fs/afs/Makefile index dcdc0f1bb76f..5efd7e13b304 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -11,6 +11,7 @@ kafs-y := \ cmservice.o \ dir.o \ dir_edit.o \ + dir_search.o \ dir_silly.o \ dynroot.o \ file.o \ diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c index a189ff8a5034..c0384201b8fe 100644 --- a/fs/afs/addr_prefs.c +++ b/fs/afs/addr_prefs.c @@ -413,8 +413,10 @@ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) do { argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); - if (argc < 0) - return argc; + if (argc < 0) { + ret = argc; + goto done; + } if (argc < 2) goto inval; diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b488072aee87..ec3db00bd081 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -10,7 +10,7 @@ #include <linux/in.h> -#define AFS_MAXCELLNAME 256 /* Maximum length of a cell name */ +#define AFS_MAXCELLNAME 253 /* Maximum length of a cell name (DNS limited) */ #define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ #define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ #define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index a06296c8827d..b835e25a2c02 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -13,6 +13,7 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ #define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ +#define YFS_VL_MAXCELLNAME 256 /* Maximum length of a cell name in YFS protocol */ enum AFSVL_Operations { VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 99b2c8172021..69e1dd55b160 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -41,7 +41,7 @@ static void afs_volume_init_callback(struct afs_volume *volume) list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); queue_work(system_unbound_wq, &vnode->cb_work); } } @@ -79,7 +79,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas _enter(""); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) { + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) { vnode->cb_break++; vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index caa09875f520..cee42646736c 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -146,18 +146,20 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(namelen + 1, GFP_KERNEL); + cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL); if (!cell->name) { 
kfree(cell); return ERR_PTR(-ENOMEM); } - cell->net = net; + cell->name[0] = '.'; + cell->name++; cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); cell->name[i] = 0; + cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); INIT_WORK(&cell->manager, afs_manage_cell_work); @@ -211,7 +213,7 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: - kfree(cell->name); + kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -365,6 +367,14 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) len = cp - rootcell; } + if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.') + return -EINVAL; + if (memchr(rootcell, '/', len)) + return -EINVAL; + cp = strstr(rootcell, ".."); + if (cp && cp < rootcell + len) + return -EINVAL; + /* allocate a cell record for the root cell */ new_root = afs_lookup_cell(net, rootcell, len, vllist, false); if (IS_ERR(new_root)) { @@ -502,7 +512,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); - kfree(cell->name); + kfree(cell->name - 1); kfree(cell); afs_dec_cells_outstanding(net); @@ -710,7 +720,8 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); - hlist_del_rcu(&cell->proc_link); + if (!hlist_unhashed(&cell->proc_link)) + hlist_del_rcu(&cell->proc_link); afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index f8622ed72e08..a843c36fc471 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -12,6 +12,8 @@ #include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/iversion.h> +#include <linux/iov_iter.h> #include <linux/task_io_accounting_ops.h> #include "internal.h" #include "afs_fs.h" @@ -41,15 +43,6 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length); - -static bool afs_dir_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - BUG(); /* This should never happen. 
*/ -} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -74,10 +67,7 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .dirty_folio = afs_dir_dirty_folio, - .release_folio = afs_dir_release_folio, - .invalidate_folio = afs_dir_invalidate_folio, - .migrate_folio = filemap_migrate_folio, + .writepages = afs_single_writepages, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -98,152 +88,124 @@ struct afs_lookup_one_cookie { struct afs_lookup_cookie { struct dir_context ctx; struct qstr name; - bool found; - bool one_only; unsigned short nr_fids; struct afs_fid fids[50]; }; +static void afs_dir_unuse_cookie(struct afs_vnode *dvnode, int ret) +{ + if (ret == 0) { + struct afs_vnode_cache_aux aux; + loff_t i_size = i_size_read(&dvnode->netfs.inode); + + afs_set_cache_aux(dvnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(dvnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + } +} + /* - * Drop the refs that we're holding on the folios we were reading into. We've - * got refs on the first nr_pages pages. + * Iterate through a kmapped directory segment, dumping a summary of + * the contents. */ -static void afs_dir_read_cleanup(struct afs_read *req) +static size_t afs_dir_dump_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct address_space *mapping = req->vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; + do { + union afs_xdr_dir_block *block = iter_base; - XA_STATE(xas, &mapping->i_pages, 0); + pr_warn("[%05zx] %32phN\n", progress, block); + iter_base += AFS_DIR_BLOCK_SIZE; + progress += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - if (unlikely(!req->nr_pages)) - return; + return len; +} - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio->mapping, ==, mapping); +/* + * Dump the contents of a directory. + */ +static void afs_dir_dump(struct afs_vnode *dvnode) +{ + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); - folio_put(folio); - } + pr_warn("DIR %llx:%llx is=%llx\n", + dvnode->fid.vid, dvnode->fid.vnode, i_size); - rcu_read_unlock(); + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iterate_folioq(&iter, iov_iter_count(&iter), NULL, NULL, + afs_dir_dump_step); } /* * check that a directory folio is valid */ -static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, - loff_t i_size) +static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, + union afs_xdr_dir_block *block) { - union afs_xdr_dir_block *block; - size_t offset, size; - loff_t pos; + if (block->hdr.magic != AFS_DIR_MAGIC) { + pr_warn("%s(%lx): [%zx] bad magic %04x\n", + __func__, dvnode->netfs.inode.i_ino, + progress, ntohs(block->hdr.magic)); + trace_afs_dir_check_failed(dvnode, progress); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); + return false; + } - /* Determine how many magic numbers there should be in this folio, but - * we must take care because the directory may change size under us. + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the folio + * *should* be NUL-terminated anyway. 
*/ - pos = folio_pos(folio); - if (i_size <= pos) - goto checked; - - size = min_t(loff_t, folio_size(folio), i_size - pos); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - if (block->hdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->netfs.inode.i_ino, - pos, offset, size, ntohs(block->hdr.magic)); - trace_afs_dir_check_failed(dvnode, pos + offset, i_size); - kunmap_local(block); - trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); - goto error; - } - - /* Make sure each block is NUL terminated so we can reasonably - * use string functions on it. The filenames in the folio - * *should* be NUL-terminated anyway. - */ - ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; - - kunmap_local(block); - } -checked: + ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; afs_stat_v(dvnode, n_read_dir); return true; - -error: - return false; } /* - * Dump the contents of a directory. + * Iterate through a kmapped directory segment, checking the content. */ -static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) +static size_t afs_dir_check_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - size_t offset, size; - - XA_STATE(xas, &mapping->i_pages, 0); - - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", - dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len); - pr_warn("DIR %llx %x %zx %zx\n", - req->pos, req->nr_pages, - req->iter->iov_offset, iov_iter_count(req->iter)); - - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; + struct afs_vnode *dvnode = priv; - BUG_ON(folio->mapping != mapping); + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) + return len; - size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio->index + offset, block); - kunmap_local(block); - } - } + do { + if (!afs_dir_check_block(dvnode, progress, iter_base)) + break; + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); + + return len; } /* - * Check all the blocks in a directory. All the folios are held pinned. + * Check all the blocks in a directory. 
*/ -static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +static int afs_dir_check(struct afs_vnode *dvnode) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - int ret = 0; - - XA_STATE(xas, &mapping->i_pages, 0); + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); + size_t checked = 0; - if (unlikely(!req->nr_pages)) + if (unlikely(!i_size)) return 0; - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - - BUG_ON(folio->mapping != mapping); - - if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { - afs_dir_dump(dvnode, req); - ret = -EIO; - break; - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + checked = iterate_folioq(&iter, iov_iter_count(&iter), dvnode, NULL, + afs_dir_check_step); + if (checked != i_size) { + afs_dir_dump(dvnode); + return -EIO; } - - rcu_read_unlock(); - return ret; + return 0; } /* @@ -263,134 +225,140 @@ static int afs_dir_open(struct inode *inode, struct file *file) } /* - * Read the directory into the pagecache in one go, scrubbing the previous - * contents. The list of folios is returned, pinning them so that they don't - * get reclaimed during the iteration. + * Read a file in a single download. */ -static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) - __acquires(&dvnode->validate_lock) +static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct afs_read *req; + struct iov_iter iter; + ssize_t ret; loff_t i_size; - int nr_pages, i; - int ret; - loff_t remote_size = 0; - - _enter(""); + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - refcount_set(&req->usage, 1); - req->vnode = dvnode; - req->key = key_get(key); - req->cleanup = afs_dir_read_cleanup; - -expand: i_size = i_size_read(&dvnode->netfs.inode); - if (i_size < remote_size) - i_size = remote_size; - if (i_size < 2048) { - ret = afs_bad(dvnode, afs_file_error_dir_small); - goto error; - } - if (i_size > 2048 * 1024) { - trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - ret = -EFBIG; - goto error; + if (is_dir) { + if (i_size < AFS_DIR_BLOCK_SIZE) + return afs_bad(dvnode, afs_file_error_dir_small); + if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + } else { + if (i_size > AFSPATHMAX) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } } - _enter("%llu", i_size); + /* Expand the storage. TODO: Shrink the storage too. 
*/ + if (dvnode->directory_size < i_size) { + size_t cur_size = dvnode->directory_size; - nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + ret = netfs_alloc_folioq_buffer(NULL, + &dvnode->directory, &cur_size, i_size, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + return ret; + } - req->actual_len = i_size; /* May change */ - req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ - req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, - 0, i_size); - req->iter = &req->def_iter; + iov_iter_folio_queue(&iter, ITER_DEST, dvnode->directory, 0, 0, dvnode->directory_size); - /* Fill in any gaps that we might find where the memory reclaimer has - * been at work and pin all the folios. If there are any gaps, we will - * need to reread the entire directory contents. + /* AFS requires us to perform the read of a directory synchronously as + * a single unit to avoid issues with the directory contents being + * changed between reads. */ - i = req->nr_pages; - while (i < nr_pages) { - struct folio *folio; - - folio = filemap_get_folio(mapping, i); - if (IS_ERR(folio)) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); - folio = __filemap_get_folio(mapping, - i, FGP_LOCK | FGP_CREAT, - mapping->gfp_mask); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto error; - } - folio_attach_private(folio, (void *)1); - folio_unlock(folio); + ret = netfs_read_single(&dvnode->netfs.inode, file, &iter); + if (ret >= 0) { + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size > ret) { + /* The content has grown, so we need to expand the + * buffer. + */ + ret = -ESTALE; + } else if (is_dir) { + int ret2 = afs_dir_check(dvnode); + + if (ret2 < 0) + ret = ret2; + } else if (i_size < folioq_folio_size(dvnode->directory, 0)) { + /* NUL-terminate a symlink. */ + char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0); + + symlink[i_size] = 0; + kunmap_local(symlink); } - - req->nr_pages += folio_nr_pages(folio); - i += folio_nr_pages(folio); } - /* If we're going to reload, we need to lock all the pages to prevent - * races. - */ + return ret; +} + +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) +{ + ssize_t ret; + + fscache_use_cookie(afs_vnode_cache(dvnode), false); + ret = afs_do_read_single(dvnode, file); + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + return ret; +} + +/* + * Read the directory into a folio_queue buffer in one go, scrubbing the + * previous contents. We return -ESTALE if the caller needs to call us again. + */ +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock) +{ + ssize_t ret; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + ret = -ERESTARTSYS; if (down_read_killable(&dvnode->validate_lock) < 0) goto error; - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - goto success; + /* We only need to reread the data if it became invalid - or if we + * haven't read it yet. 
+ */ + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + ret = i_size; + goto valid; + } up_read(&dvnode->validate_lock); if (down_write_killable(&dvnode->validate_lock) < 0) goto error; - if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, req); - if (ret < 0) - goto error_unlock; - - task_io_account_read(PAGE_SIZE * req->nr_pages); - - if (req->len < req->file_size) { - /* The content has grown, so we need to expand the - * buffer. - */ - up_write(&dvnode->validate_lock); - remote_size = req->file_size; - goto expand; - } + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_invalidate_cache(dvnode, 0); - /* Validate the data we just read. */ - ret = afs_dir_check(dvnode, req); + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) || + !test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + trace_afs_reload_dir(dvnode); + ret = afs_read_single(dvnode, file); if (ret < 0) goto error_unlock; // TODO: Trim excess pages set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); + } else { + ret = i_size; } downgrade_write(&dvnode->validate_lock); -success: - return req; +valid: + return ret; error_unlock: up_write(&dvnode->validate_lock); error: - afs_put_read(req); - _leave(" = %d", ret); - return ERR_PTR(ret); + _leave(" = %zd", ret); + return ret; } /* @@ -398,79 +366,69 @@ error: */ static int afs_dir_iterate_block(struct afs_vnode *dvnode, struct dir_context *ctx, - union afs_xdr_dir_block *block, - unsigned blkoff) + union afs_xdr_dir_block *block) { union afs_xdr_dirent *dire; - unsigned offset, next, curr, nr_slots; + unsigned int blknum, base, hdr, pos, next, nr_slots; size_t nlen; int tmp; - _enter("%llx,%x", ctx->pos, blkoff); + blknum = ctx->pos / AFS_DIR_BLOCK_SIZE; + base = blknum * AFS_DIR_SLOTS_PER_BLOCK; + hdr = (blknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + pos = DIV_ROUND_UP(ctx->pos, AFS_DIR_DIRENT_SIZE) - base; - curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); + _enter("%llx,%x", ctx->pos, blknum); /* walk through the block, an entry at a time */ - for (offset = (blkoff == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); - offset < AFS_DIR_SLOTS_PER_BLOCK; - offset = next - ) { + for (unsigned int slot = hdr; slot < AFS_DIR_SLOTS_PER_BLOCK; slot = next) { /* skip entries marked unused in the bitmap */ - if (!(block->hdr.bitmap[offset / 8] & - (1 << (offset % 8)))) { - _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_xdr_dir_block), offset); - next = offset + 1; - if (offset >= curr) - ctx->pos = blkoff + - next * sizeof(union afs_xdr_dirent); + if (!(block->hdr.bitmap[slot / 8] & + (1 << (slot % 8)))) { + _debug("ENT[%x]: Unused", base + slot); + next = slot + 1; + if (next >= pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } /* got a valid entry */ - dire = &block->dirents[offset]; + dire = &block->dirents[slot]; nlen = strnlen(dire->u.name, - sizeof(*block) - - offset * sizeof(union afs_xdr_dirent)); + (unsigned long)(block + 1) - (unsigned long)dire->u.name - 1); if (nlen > AFSNAMEMAX - 1) { - _debug("ENT[%zu]: name too long (len %u/%zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, nlen); + _debug("ENT[%x]: Name too long (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_name_too_long); } - _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_xdr_dir_block), offset, - (offset < curr ? 
"skip" : "fill"), + _debug("ENT[%x]: %s %zx \"%s\"", + base + slot, (slot < pos ? "skip" : "fill"), nlen, dire->u.name); nr_slots = afs_dir_calc_slots(nlen); - next = offset + nr_slots; + next = slot + nr_slots; if (next > AFS_DIR_SLOTS_PER_BLOCK) { - _debug("ENT[%zu.%u]:" - " %u extends beyond end dir block" - " (len %zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, next, nlen); + _debug("ENT[%x]: extends beyond end dir block (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_over_end); } /* Check that the name-extension dirents are all allocated */ for (tmp = 1; tmp < nr_slots; tmp++) { - unsigned int ix = offset + tmp; - if (!(block->hdr.bitmap[ix / 8] & (1 << (ix % 8)))) { - _debug("ENT[%zu.u]:" - " %u unmarked extension (%u/%u)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, tmp, nr_slots); + unsigned int xslot = slot + tmp; + + if (!(block->hdr.bitmap[xslot / 8] & (1 << (xslot % 8)))) { + _debug("ENT[%x]: Unmarked extension (%x/%x)", + base + slot, tmp, nr_slots); return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } } /* skip if starts before the current position */ - if (offset < curr) { - if (next > curr) - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + if (slot < pos) { + if (next > pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } @@ -484,75 +442,110 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, return 0; } - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); return 1; } +struct afs_dir_iteration_ctx { + struct dir_context *dir_ctx; + int error; +}; + /* - * iterate through the data blob that lists the contents of an AFS directory + * Iterate through a kmapped directory segment. */ -static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, - struct key *key, afs_dataversion_t *_dir_version) +static size_t afs_dir_iterate_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct afs_vnode *dvnode = AFS_FS_I(dir); - union afs_xdr_dir_block *dblock; - struct afs_read *req; - struct folio *folio; - unsigned offset, size; + struct afs_dir_iteration_ctx *ctx = priv2; + struct afs_vnode *dvnode = priv; int ret; - _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); - - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) { + pr_err("Mis-iteration prog=%zx len=%zx\n", + progress % AFS_DIR_BLOCK_SIZE, + len % AFS_DIR_BLOCK_SIZE); + return len; } - req = afs_read_dir(dvnode, key); - if (IS_ERR(req)) - return PTR_ERR(req); - *_dir_version = req->data_version; + do { + ret = afs_dir_iterate_block(dvnode, ctx->dir_ctx, iter_base); + if (ret != 1) + break; - /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_xdr_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); + ctx->dir_ctx->pos = round_up(ctx->dir_ctx->pos, AFS_DIR_BLOCK_SIZE); + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - /* walk through the blocks in sequence */ - ret = 0; - while (ctx->pos < req->actual_len) { - /* Fetch the appropriate folio from the directory and re-add it - * to the LRU. We have all the pages pinned with an extra ref. 
- */ - folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, - FGP_ACCESSED, 0); - if (IS_ERR(folio)) { - ret = afs_bad(dvnode, afs_file_error_dir_missing_page); - break; - } + return len; +} + +/* + * Iterate through the directory folios. + */ +static int afs_dir_iterate_contents(struct inode *dir, struct dir_context *dir_ctx) +{ + struct afs_dir_iteration_ctx ctx = { .dir_ctx = dir_ctx }; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct iov_iter iter; + unsigned long long i_size = i_size_read(dir); - offset = round_down(ctx->pos, sizeof(*dblock)) - folio_pos(folio); - size = min_t(loff_t, folio_size(folio), - req->actual_len - folio_pos(folio)); + /* Round the file position up to the next entry boundary */ + dir_ctx->pos = round_up(dir_ctx->pos, sizeof(union afs_xdr_dirent)); - do { - dblock = kmap_local_folio(folio, offset); - ret = afs_dir_iterate_block(dvnode, ctx, dblock, - folio_pos(folio) + offset); - kunmap_local(dblock); - if (ret != 1) - goto out; + if (i_size <= 0 || dir_ctx->pos >= i_size) + return 0; - } while (offset += sizeof(*dblock), offset < size); + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iov_iter_advance(&iter, round_down(dir_ctx->pos, AFS_DIR_BLOCK_SIZE)); - ret = 0; - } + iterate_folioq(&iter, iov_iter_count(&iter), dvnode, &ctx, + afs_dir_iterate_step); + + if (ctx.error == -ESTALE) + afs_invalidate_dir(dvnode, afs_dir_invalid_iter_stale); + return ctx.error; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct file *file, afs_dataversion_t *_dir_version) +{ + struct afs_vnode *dvnode = AFS_FS_I(dir); + int retry_limit = 100; + int ret; + + _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, file); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(dir); + + ret = afs_dir_iterate_contents(dir, ctx); + up_read(&dvnode->validate_lock); + } while (ret == -ESTALE); -out: - up_read(&dvnode->validate_lock); - afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -564,8 +557,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) { afs_dataversion_t dir_version; - return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file), - &dir_version); + return afs_dir_iterate(file_inode(file), ctx, file, &dir_version); } /* @@ -606,7 +598,7 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * - just returns the FID the dentry name maps to if found */ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key, + struct afs_fid *fid, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; @@ -620,7 +612,7 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); /* search the directory */ - ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version); + ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; @@ -655,19 +647,10 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); 
BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); - if (cookie->found) { - if (cookie->nr_fids < 50) { - cookie->fids[cookie->nr_fids].vnode = ino; - cookie->fids[cookie->nr_fids].unique = dtype; - cookie->nr_fids++; - } - } else if (cookie->name.len == nlen && - memcmp(cookie->name.name, name, nlen) == 0) { - cookie->fids[1].vnode = ino; - cookie->fids[1].unique = dtype; - cookie->found = 1; - if (cookie->one_only) - return false; + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; } return cookie->nr_fids < 50; @@ -787,8 +770,7 @@ static bool afs_server_supports_ibulk(struct afs_vnode *dvnode) * files in one go and create inodes for them. The inode of the file we were * asked for is returned. */ -static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) { struct afs_lookup_cookie *cookie; struct afs_vnode_param *vp; @@ -796,6 +778,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); + bool supports_ibulk; long ret; int i; @@ -812,19 +795,19 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want * and slot 0 for the directory */ - if (!afs_server_supports_ibulk(dvnode)) - cookie->one_only = true; - - /* search the directory */ - ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); + /* Search the directory for the named entry using the hash table... */ + ret = afs_dir_search(dvnode, &dentry->d_name, &cookie->fids[1], &data_version); if (ret < 0) goto out; - dentry->d_fsdata = (void *)(unsigned long)data_version; + supports_ibulk = afs_server_supports_ibulk(dvnode); + if (supports_ibulk) { + /* ...then scan linearly from that point for entries to lookup-ahead. */ + cookie->ctx.pos = (ret + 1) * AFS_DIR_DIRENT_SIZE; + afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); + } - ret = -ENOENT; - if (!cookie->found) - goto out; + dentry->d_fsdata = (void *)(unsigned long)data_version; /* Check to see if we already have an inode for the primary fid. */ inode = ilookup5(dir->i_sb, cookie->fids[1].vnode, @@ -883,7 +866,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * the whole operation. */ afs_op_set_error(op, -ENOTSUPP); - if (!cookie->one_only) { + if (supports_ibulk) { op->ops = &afs_inline_bulk_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); @@ -925,8 +908,7 @@ out: /* * Look up an entry in a directory with @sys substitution. 
*/ -static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry) { struct afs_sysnames *subs; struct afs_net *net = afs_i2net(dir); @@ -974,7 +956,6 @@ out_s: afs_put_sysnames(subs); kfree(buf); out_p: - key_put(key); return ret; } @@ -988,7 +969,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, struct afs_fid fid = {}; struct inode *inode; struct dentry *d; - struct key *key; int ret; _enter("{%llx:%llu},%p{%pd},", @@ -1006,15 +986,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ESTALE); } - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return ERR_CAST(key); - } - - ret = afs_validate(dvnode, key); + ret = afs_validate(dvnode, NULL); if (ret < 0) { - key_put(key); + afs_dir_unuse_cookie(dvnode, ret); _leave(" = %d [val]", ret); return ERR_PTR(ret); } @@ -1024,11 +998,10 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_name.name[dentry->d_name.len - 3] == 's' && dentry->d_name.name[dentry->d_name.len - 2] == 'y' && dentry->d_name.name[dentry->d_name.len - 1] == 's') - return afs_lookup_atsys(dir, dentry, key); + return afs_lookup_atsys(dir, dentry); afs_stat_v(dvnode, n_lookup); - inode = afs_do_lookup(dir, dentry, key); - key_put(key); + inode = afs_do_lookup(dir, dentry); if (inode == ERR_PTR(-ENOENT)) inode = afs_try_auto_mntpt(dentry, dir); @@ -1154,7 +1127,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1281,6 +1254,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op) */ static void afs_vnode_new_inode(struct afs_operation *op) { + struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *vnode; struct inode *inode; @@ -1300,6 +1274,10 @@ static void afs_vnode_new_inode(struct afs_operation *op) vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + if (S_ISDIR(inode->i_mode)) + afs_mkdir_init_dir(vnode, dvp->vnode); + else if (S_ISLNK(inode->i_mode)) + afs_init_new_symlink(vnode, op); if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); @@ -1316,18 +1294,21 @@ static void afs_create_success(struct afs_operation *op) static void afs_create_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid, op->create.reason); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_create_put(struct afs_operation *op) @@ -1355,6 +1336,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); + int 
ret; _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); @@ -1365,6 +1347,8 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(op); } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1374,7 +1358,9 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_mkdir; op->mtime = current_time(dir); op->ops = &afs_mkdir_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; } /* @@ -1387,8 +1373,8 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_rmdir); + afs_invalidate_dir(vnode, afs_dir_invalid_subdir_removed); } } @@ -1402,18 +1388,21 @@ static void afs_rmdir_success(struct afs_operation *op) static void afs_rmdir_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); afs_dir_remove_subdir(op->dentry); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_rmdir); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_rmdir_put(struct afs_operation *op) @@ -1448,6 +1437,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1471,10 +1462,18 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) op->file[1].vnode = vnode; } - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + + /* Not all systems that can host afs servers have ENOTEMPTY. 
*/ + if (ret == -EEXIST) + ret = -ENOTEMPTY; +out: + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: - return afs_put_operation(op); + ret = afs_put_operation(op); + goto out; } /* @@ -1537,16 +1536,19 @@ static void afs_unlink_success(struct afs_operation *op) static void afs_unlink_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_unlink); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_unlink_put(struct afs_operation *op) @@ -1585,6 +1587,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1631,10 +1635,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_wait_for_operation(op); } - return afs_put_operation(op); - error: - return afs_put_operation(op); + ret = afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; } static const struct afs_operation_ops afs_create_operation = { @@ -1668,6 +1672,8 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1678,7 +1684,9 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_create; op->mtime = current_time(dir); op->ops = &afs_create_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1743,6 +1751,8 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + ret = afs_validate(vnode, op->key); if (ret < 0) goto error_op; @@ -1758,10 +1768,13 @@ static int afs_link(struct dentry *from, struct inode *dir, op->dentry_2 = from; op->ops = &afs_link_operation; op->create.reason = afs_edit_dir_for_link; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error_op: afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); error: d_drop(dentry); _leave(" = %d", ret); @@ -1805,6 +1818,8 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; @@ -1813,7 +1828,9 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, op->create.reason = afs_edit_dir_for_symlink; op->create.symlink = content; op->mtime = current_time(dir); - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1823,6 +1840,8 @@ error: static void afs_rename_success(struct afs_operation *op) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; @@ -1832,10 +1851,28 @@ static void 
afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + + /* If we're moving a subdir between dirs, we need to update + * its DV counter too as the ".." will be altered. + */ + if (S_ISDIR(vnode->netfs.inode.i_mode) && + op->file[0].vnode != op->file[1].vnode) { + u64 new_dv; + + write_seqlock(&vnode->cb_lock); + + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + + write_sequnlock(&vnode->cb_lock); + } } static void afs_rename_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources orig_cres = {}, new_cres = {}; struct afs_vnode_param *orig_dvp = &op->file[0]; struct afs_vnode_param *new_dvp = &op->file[1]; struct afs_vnode *orig_dvnode = orig_dvp->vnode; @@ -1852,6 +1889,10 @@ static void afs_rename_edit_dir(struct afs_operation *op) op->rename.rehash = NULL; } + fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode)); + if (new_dvnode != orig_dvnode) + fscache_begin_write_operation(&new_cres, afs_vnode_cache(new_dvnode)); + down_write(&orig_dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) @@ -1873,6 +1914,12 @@ static void afs_rename_edit_dir(struct afs_operation *op) &vnode->fid, afs_edit_dir_for_rename_2); } + if (S_ISDIR(vnode->netfs.inode.i_mode) && + new_dvnode != orig_dvnode && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_edit_dir_update_dotdot(vnode, new_dvnode, + afs_edit_dir_for_rename_sub); + new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); @@ -1895,6 +1942,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) d_move(old_dentry, new_dentry); up_write(&new_dvnode->validate_lock); + fscache_end_operation(&orig_cres); + if (new_dvnode != orig_dvnode) + fscache_end_operation(&new_cres); } static void afs_rename_put(struct afs_operation *op) @@ -1947,6 +1997,10 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(orig_dvnode), true); + if (new_dvnode != orig_dvnode) + fscache_use_cookie(afs_vnode_cache(new_dvnode), true); + ret = afs_validate(vnode, op->key); afs_op_set_error(op, ret); if (ret < 0) @@ -2014,47 +2068,43 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ d_drop(old_dentry); - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); +out: + afs_dir_unuse_cookie(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_dir_unuse_cookie(new_dvnode, ret); + return ret; error: - return afs_put_operation(op); -} - -/* - * Release a directory folio and clean up its private state if it's not busy - * - return true if the folio can now be released, false if not - */ -static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) -{ - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index); - - folio_detach_private(folio); - - /* The directory will need reloading. */ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_relpg); - return true; + ret = afs_put_operation(op); + goto out; } /* - * Invalidate part or all of a folio. + * Write the file contents to the cache as a single blob. 
*/ -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *dvnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); + int ret = 0; - /* The directory will need reloading. */ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); + /* Need to lock to prevent the folio queue and folios from being thrown + * away. + */ + down_read(&dvnode->validate_lock); + + if (is_dir ? + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) : + atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, + i_size_read(&dvnode->netfs.inode)); + ret = netfs_writeback_single(mapping, wbc, &iter); + } - /* we clean up only if the entire folio is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - folio_detach_private(folio); + up_read(&dvnode->validate_lock); + return ret; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index a71bff10496b..60a549f1d9c5 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/iversion.h> +#include <linux/folio_queue.h> #include "internal.h" #include "xdr_fs.h" @@ -105,32 +106,66 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, } /* - * Get a new directory folio. + * Get a specific block, extending the directory storage to cover it as needed. */ -static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) +static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; + struct folio_queue *fq; + struct afs_vnode *dvnode = iter->dvnode; struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int ret; + + if (dvnode->directory_size < blend) { + size_t cur_size = dvnode->directory_size; + + ret = netfs_alloc_folioq_buffer( + NULL, &dvnode->directory, &cur_size, blend, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + goto fail; + } - folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, - mapping->gfp_mask); - if (IS_ERR(folio)) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - return NULL; + fq = iter->fq; + if (!fq) + fq = dvnode->directory; + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (int s = iter->fq_slot; s < folioq_count(fq); s++) { + size_t fsize = folioq_folio_size(fq, s); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. 
*/ + folio = folioq_folio(fq, s); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = s; + iter->fpos = fpos; + return kmap_local_folio(folio, blpos - fpos); + } + fpos += fsize; + } + iter->fq_slot = 0; } - if (!folio_test_private(folio)) - folio_attach_private(folio, (void *)1); - return folio; + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; } /* * Scan a directory block looking for a dirent of the right name. */ -static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, +static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name, unsigned int blocknum) { - union afs_xdr_dirent *de; + const union afs_xdr_dirent *de; u64 bitmap; int d, len, n; @@ -209,9 +244,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, { union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; - struct folio *folio0, *folio; - unsigned int need_slots, nr_blocks, b; - pgoff_t index; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b, entry; loff_t i_size; int slot; @@ -220,20 +254,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode, i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size); return; } - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + meta = afs_dir_get_block(&iter, 0); + if (!meta) return; - } /* Work out how many slots we're going to need. */ - need_slots = afs_dir_calc_slots(name->len); + iter.nr_slots = afs_dir_calc_slots(name->len); - meta = kmap_local_folio(folio0, 0); if (i_size == 0) goto new_directory; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; @@ -245,22 +276,21 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* If the directory extended into a new folio, then we need to * tack a new folio on the end. */ - index = b / AFS_DIR_BLOCKS_PER_PAGE; if (nr_blocks >= AFS_DIR_MAX_BLOCKS) - goto error; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } + goto error_too_many_blocks; - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); + /* Lower dir blocks have a counter in the header we can check. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR && + meta->meta.alloc_ctrs[b] < iter.nr_slots) + continue; + + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; + goto already_invalidated; _debug("block %u: %2u %3u %u", b, @@ -275,31 +305,23 @@ void afs_edit_dir_add(struct afs_vnode *vnode, afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); } - /* Only lower dir blocks have a counter in the header. */ - if (b >= AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] >= need_slots) { - /* We need to try and find one or more consecutive - * slots to hold the entry. - */ - slot = afs_find_contig_bits(block, need_slots); - if (slot >= 0) { - _debug("slot %u", slot); - goto found_space; - } + /* We need to try and find one or more consecutive slots to + * hold the entry. 
+ */ + slot = afs_find_contig_bits(block, iter.nr_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; } kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } } /* There are no spare slots of sufficient size, yet the operation * succeeded. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots); goto out_unmap; new_directory: @@ -307,8 +329,7 @@ new_directory: i_size = AFS_DIR_BLOCK_SIZE; afs_set_i_size(vnode, i_size); slot = AFS_DIR_RESV_BLOCKS0; - folio = folio0; - block = kmap_local_folio(folio, 0); + block = afs_dir_get_block(&iter, 0); nr_blocks = 1; b = 0; @@ -326,41 +347,39 @@ found_space: de->u.name[name->len] = 0; /* Adjust the bitmap. */ - afs_set_contig_bits(block, slot, need_slots); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + afs_set_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] -= need_slots; + meta->meta.alloc_ctrs[b] -= iter.nr_slots; + + /* Adjust the hash chain. */ + entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot; + iter.bucket = afs_dir_hash_name(name); + de->u.hash_next = meta->meta.hashtable[iter.bucket]; + meta->meta.hashtable[iter.bucket] = htons(entry); + kunmap_local(block); inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; -invalidated: +already_invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; +error_too_many_blocks: + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } @@ -374,13 +393,14 @@ error: void afs_edit_dir_remove(struct afs_vnode *vnode, struct qstr *name, enum afs_edit_dir_reason why) { - union afs_xdr_dir_block *meta, *block; - union afs_xdr_dirent *de; - struct folio *folio0, *folio; - unsigned int need_slots, nr_blocks, b; - pgoff_t index; + union afs_xdr_dir_block *meta, *block, *pblock; + union afs_xdr_dirent *de, *pde; + struct afs_dir_iter iter = { .dvnode = vnode }; + struct afs_fid fid; + unsigned int b, slot, entry; loff_t i_size; - int slot; + __be16 next; + int found; _enter(",,{%d,%s},", name->len, name->name); @@ -388,81 +408,95 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size); return; } - nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + if (!afs_dir_init_iter(&iter, name)) return; - } - - /* Work out how many slots we're going to discard. 
*/ - need_slots = afs_dir_calc_slots(name->len); - meta = kmap_local_folio(folio0, 0); - - /* Find a block that has sufficient slots available. Each folio - * contains two or more directory blocks. - */ - for (b = 0; b < nr_blocks; b++) { - index = b / AFS_DIR_BLOCKS_PER_PAGE; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } - - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); - - /* Abandon the edit if we got a callback break. */ - if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; - - if (b > AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { - slot = afs_dir_scan_block(block, name, b); - if (slot >= 0) - goto found_dirent; - } + meta = afs_dir_find_block(&iter, 0); + if (!meta) + return; - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + /* Find the entry in the blob. */ + found = afs_dir_search_bucket(&iter, name, &fid); + if (found < 0) { + /* Didn't find the dirent to clobber. Re-download. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); + goto out_unmap; } - /* Didn't find the dirent to clobber. Download the directory again. */ - trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, - 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - goto out_unmap; + entry = found; + b = entry / AFS_DIR_SLOTS_PER_BLOCK; + slot = entry % AFS_DIR_SLOTS_PER_BLOCK; -found_dirent: + block = afs_dir_find_block(&iter, b); + if (!block) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + /* Check and clear the entry. */ de = &block->dirents[slot]; + if (de->u.valid != 1) + goto error_unmap; trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, ntohl(de->u.vnode), ntohl(de->u.unique), name->name); - memset(de, 0, sizeof(*de) * need_slots); - /* Adjust the bitmap. */ - afs_clear_contig_bits(block, slot, need_slots); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + afs_clear_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] += need_slots; + meta->meta.alloc_ctrs[b] += iter.nr_slots; + + /* Clear the constituent entries. */ + next = de->u.hash_next; + memset(de, 0, sizeof(*de) * iter.nr_slots); + kunmap_local(block); + + /* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head + * index is previous; otherwise it's slot number of the previous entry. 
+ */ + if (!iter.prev_entry) { + __be16 prev_next = meta->meta.hashtable[iter.bucket]; + + if (unlikely(prev_next != htons(entry))) { + pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + meta->meta.hashtable[iter.bucket] = next; + } else { + unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK; + __be16 prev_next; + + pblock = afs_dir_find_block(&iter, pb); + if (!pblock) + goto error; + pde = &pblock->dirents[ps]; + prev_next = pde->u.hash_next; + if (prev_next != htons(entry)) { + kunmap_local(pblock); + pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + pde->u.hash_next = next; + kunmap_local(pblock); + } + + netfs_single_mark_inode_dirty(&vnode->netfs.inode); inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); @@ -470,25 +504,145 @@ found_dirent: out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; -invalidated: +already_invalidated: + kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; +error_unmap: + kunmap_local(block); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } + +/* + * Edit a subdirectory that has been moved between directories to update the + * ".." entry. + */ +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *block; + union afs_xdr_dirent *de; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b; + loff_t i_size; + int slot; + + _enter(""); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size); + return; + } + + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + slot = afs_dir_scan_block(block, &dotdot_name, b); + if (slot >= 0) + goto found_dirent; + + kunmap_local(block); + } + + /* Didn't find the dirent to clobber. Download the directory again. 
*/ + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, + 0, 0, 0, 0, ".."); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); + goto out; + +found_dirent: + de = &block->dirents[slot]; + de->u.vnode = htonl(new_dvnode->fid.vnode); + de->u.unique = htonl(new_dvnode->fid.unique); + + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + + kunmap_local(block); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + +out: + _leave(""); + return; + +already_invalidated: + kunmap_local(block); + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, + 0, 0, 0, 0, ".."); + goto out; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, + 0, 0, 0, 0, ".."); + goto out; +} + +/* + * Initialise a new directory. We need to fill in the "." and ".." entries. + */ +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode) +{ + union afs_xdr_dir_block *meta; + struct afs_dir_iter iter = { .dvnode = dvnode }; + union afs_xdr_dirent *de; + unsigned int slot = AFS_DIR_RESV_BLOCKS0; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size != AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size); + return; + } + + meta = afs_dir_get_block(&iter, 0); + if (!meta) + return; + + afs_edit_init_block(meta, meta, 0); + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(dvnode->fid.vnode); + de->u.unique = htonl(dvnode->fid.unique); + memcpy(de->u.name, ".", 2); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + dvnode->fid.vnode, dvnode->fid.unique, "."); + slot++; + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(parent_dvnode->fid.vnode); + de->u.unique = htonl(parent_dvnode->fid.unique); + memcpy(de->u.name, "..", 3); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + parent_dvnode->fid.vnode, parent_dvnode->fid.unique, ".."); + + afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2); + meta->meta.alloc_ctrs[0] -= 2; + kunmap_local(meta); + + netfs_single_mark_inode_dirty(&dvnode->netfs.inode); + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); +} diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c new file mode 100644 index 000000000000..b25bd892db4d --- /dev/null +++ b/fs/afs/dir_search.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Search a directory's hash table. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00 + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" + +/* + * Calculate the name hash. + */ +unsigned int afs_dir_hash_name(const struct qstr *name) +{ + const unsigned char *p = name->name; + unsigned int hash = 0, i; + int bucket; + + for (i = 0; i < name->len; i++) + hash = (hash * 173) + p[i]; + bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1); + if (hash > INT_MAX) { + bucket = AFS_DIR_HASHTBL_SIZE - bucket; + bucket &= (AFS_DIR_HASHTBL_SIZE - 1); + } + return bucket; +} + +/* + * Reset a directory iterator. 
+ */ +static bool afs_dir_reset_iter(struct afs_dir_iter *iter) +{ + unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode); + unsigned int nblocks; + + /* Work out the maximum number of steps we can take. */ + nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS); + if (!nblocks) + return false; + iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS); + iter->prev_entry = 0; /* Hash head is previous */ + return true; +} + +/* + * Initialise a directory iterator for looking up a name. + */ +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name) +{ + iter->nr_slots = afs_dir_calc_slots(name->len); + iter->bucket = afs_dir_hash_name(name); + return afs_dir_reset_iter(iter); +} + +/* + * Get a specific block. + */ +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block) +{ + struct folio_queue *fq = iter->fq; + struct afs_vnode *dvnode = iter->dvnode; + struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int slot = iter->fq_slot; + + _enter("%zx,%d", block, slot); + + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + + if (dvnode->directory_size < blend) + goto fail; + + if (!fq || blpos < fpos) { + fq = dvnode->directory; + slot = 0; + fpos = 0; + } + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (; slot < folioq_count(fq); slot++) { + size_t fsize = folioq_folio_size(fq, slot); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, slot); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = slot; + iter->fpos = fpos; + iter->block = kmap_local_folio(folio, blpos - fpos); + return iter->block; + } + fpos += fsize; + } + slot = 0; + } + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; +} + +/* + * Search through a directory bucket. + */ +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid) +{ + const union afs_xdr_dir_block *meta; + unsigned int entry; + int ret = -ESTALE; + + meta = afs_dir_find_block(iter, 0); + if (!meta) + return -ESTALE; + + entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]); + _enter("%x,%x", iter->bucket, entry); + + while (entry) { + const union afs_xdr_dir_block *block; + const union afs_xdr_dirent *dire; + unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK; + unsigned int resv = (blnum == 0 ? 
AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + + _debug("search %x", entry); + + if (slot < resv) { + kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x", + iter->bucket, resv, slot, slot + iter->nr_slots - 1); + goto bad; + } + + block = afs_dir_find_block(iter, blnum); + if (!block) + goto bad; + dire = &block->dirents[slot]; + + if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK && + memcmp(dire->u.name, name->name, name->len) == 0 && + dire->u.name[name->len] == '\0') { + _fid->vnode = ntohl(dire->u.vnode); + _fid->unique = ntohl(dire->u.unique); + ret = entry; + goto found; + } + + iter->prev_entry = entry; + entry = ntohs(dire->u.hash_next); + if (!--iter->loop_check) { + kdebug("dir chain loop h=%x", iter->bucket); + goto bad; + } + } + + ret = -ENOENT; +found: + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + +bad: + if (ret == -ESTALE) + afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale); + _leave(" = %d", ret); + return ret; +} + +/* + * Search the appropriate hash chain in the contents of an AFS directory. + */ +int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version) +{ + struct afs_dir_iter iter = { .dvnode = dvnode, }; + int ret, retry_limit = 3; + + _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + + if (!afs_dir_init_iter(&iter, name)) + return -ENOENT; + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, NULL); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode); + + ret = afs_dir_search_bucket(&iter, name, _fid); + up_read(&dvnode->validate_lock); + if (ret == -ESTALE) + afs_dir_reset_iter(&iter); + } while (ret == -ESTALE); + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index c4d2711e20ad..d8bf52f77d93 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -186,50 +186,6 @@ out: } /* - * Look up @cell in a dynroot directory. This is a substitution for the - * local cell name for the net namespace. - */ -static struct dentry *afs_lookup_atcell(struct dentry *dentry) -{ - struct afs_cell *cell; - struct afs_net *net = afs_d2net(dentry); - struct dentry *ret; - char *name; - int len; - - if (!net->ws_cell) - return ERR_PTR(-ENOENT); - - ret = ERR_PTR(-ENOMEM); - name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL); - if (!name) - goto out_p; - - down_read(&net->cells_lock); - cell = net->ws_cell; - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len + 1); - } - up_read(&net->cells_lock); - - ret = ERR_PTR(-ENOENT); - if (!cell) - goto out_n; - - ret = lookup_one_len(name, dentry->d_parent, len); - - /* We don't want to d_add() the @cell dentry here as we don't want to - * the cached dentry to hide changes to the local cell name. - */ - -out_n: - kfree(name); -out_p: - return ret; -} - -/* * Look up an entry in a dynroot directory. 
*/ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, @@ -247,10 +203,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr return ERR_PTR(-ENAMETOOLONG); } - if (dentry->d_name.len == 5 && - memcmp(dentry->d_name.name, "@cell", 5) == 0) - return afs_lookup_atcell(dentry); - return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); } @@ -271,7 +223,8 @@ const struct dentry_operations afs_dynroot_dentry_operations = { int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) { struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; + struct dentry *root, *subdir, *dsubdir; + char *dotname = cell->name - 1; int ret; if (!sb || atomic_read(&sb->s_active) == 0) @@ -286,14 +239,43 @@ int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) goto unlock; } - /* Note that we're retaining an extra ref on the dentry */ + dsubdir = lookup_one_len(dotname, root, cell->name_len + 1); + if (IS_ERR(dsubdir)) { + ret = PTR_ERR(dsubdir); + dput(subdir); + goto unlock; + } + + /* Note that we're retaining extra refs on the dentries. */ subdir->d_fsdata = (void *)1UL; + dsubdir->d_fsdata = (void *)1UL; ret = 0; unlock: inode_unlock(root->d_inode); return ret; } +static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len) +{ + struct dentry *subdir; + + /* Don't want to trigger a lookup call, which will re-add the cell */ + subdir = try_lookup_one_len(name, root, name_len); + if (IS_ERR_OR_NULL(subdir)) { + _debug("lookup %ld", PTR_ERR(subdir)); + return; + } + + _debug("rmdir %pd %u", subdir, d_count(subdir)); + + if (subdir->d_fsdata) { + _debug("unpin %u", d_count(subdir)); + subdir->d_fsdata = NULL; + dput(subdir); + } + dput(subdir); +} + /* * Remove a manually added cell mount directory. * - The caller must hold net->proc_cells_lock @@ -301,32 +283,141 @@ unlock: void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) { struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; + char *dotname = cell->name - 1; if (!sb || atomic_read(&sb->s_active) == 0) return; - root = sb->s_root; - inode_lock(root->d_inode); + inode_lock(sb->s_root->d_inode); + afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len); + afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1); + inode_unlock(sb->s_root->d_inode); + _leave(""); +} - /* Don't want to trigger a lookup call, which will re-add the cell */ - subdir = try_lookup_one_len(cell->name, root, cell->name_len); - if (IS_ERR_OR_NULL(subdir)) { - _debug("lookup %ld", PTR_ERR(subdir)); - goto no_dentry; +static void afs_atcell_delayed_put_cell(void *arg) +{ + struct afs_cell *cell = arg; + + afs_put_cell(cell, afs_cell_trace_put_atcell); +} + +/* + * Read @cell or .@cell symlinks. 
+ */ +static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell; + struct afs_net *net = afs_i2net(inode); + const char *name; + bool dotted = vnode->fid.vnode == 3; + + if (!net->ws_cell) + return ERR_PTR(-ENOENT); + + down_read(&net->cells_lock); + + cell = net->ws_cell; + if (dotted) + name = cell->name - 1; + else + name = cell->name; + afs_get_cell(cell, afs_cell_trace_get_atcell); + set_delayed_call(done, afs_atcell_delayed_put_cell, cell); + + up_read(&net->cells_lock); + return name; +} + +static const struct inode_operations afs_atcell_inode_operations = { + .get_link = afs_atcell_get_link, +}; + +/* + * Look up @cell or .@cell in a dynroot directory. This is a substitution for + * the local cell name for the net namespace. + */ +static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name) +{ + struct afs_vnode *vnode; + struct afs_fid fid = { .vnode = 2, .unique = 1, }; + struct dentry *dentry; + struct inode *inode; + + if (name[0] == '.') + fid.vnode = 3; + + dentry = d_alloc_name(root, name); + if (!dentry) + return ERR_PTR(-ENOMEM); + + inode = iget5_locked(dentry->d_sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) { + dput(dentry); + return ERR_PTR(-ENOMEM); } - _debug("rmdir %pd %u", subdir, d_count(subdir)); + vnode = AFS_FS_I(inode); - if (subdir->d_fsdata) { - _debug("unpin %u", d_count(subdir)); - subdir->d_fsdata = NULL; - dput(subdir); + /* there shouldn't be an existing inode */ + if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) { + iput(inode); + dput(dentry); + return ERR_PTR(-EIO); } - dput(subdir); -no_dentry: + + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); + d_splice_alias(inode, dentry); + return dentry; +} + +/* + * Create @cell and .@cell symlinks. + */ +static int afs_dynroot_symlink(struct afs_net *net) +{ + struct super_block *sb = net->dynroot_sb; + struct dentry *root, *symlink, *dsymlink; + int ret; + + /* Let the ->lookup op do the creation */ + root = sb->s_root; + inode_lock(root->d_inode); + symlink = afs_dynroot_create_symlink(root, "@cell"); + if (IS_ERR(symlink)) { + ret = PTR_ERR(symlink); + goto unlock; + } + + dsymlink = afs_dynroot_create_symlink(root, ".@cell"); + if (IS_ERR(dsymlink)) { + ret = PTR_ERR(dsymlink); + dput(symlink); + goto unlock; + } + + /* Note that we're retaining extra refs on the dentries. 
*/ + symlink->d_fsdata = (void *)1UL; + dsymlink->d_fsdata = (void *)1UL; + ret = 0; +unlock: inode_unlock(root->d_inode); - _leave(""); + return ret; } /* @@ -341,6 +432,10 @@ int afs_dynroot_populate(struct super_block *sb) mutex_lock(&net->proc_cells_lock); net->dynroot_sb = sb; + ret = afs_dynroot_symlink(net); + if (ret < 0) + goto error; + hlist_for_each_entry(cell, &net->proc_cells, proc_link) { ret = afs_dynroot_mkdir(net, cell); if (ret < 0) diff --git a/fs/afs/file.c b/fs/afs/file.c index 6762eff97517..fc15497608c6 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -20,7 +20,6 @@ #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_symlink_read_folio(struct file *file, struct folio *folio); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -61,13 +60,6 @@ const struct address_space_operations afs_file_aops = { .writepages = afs_writepages, }; -const struct address_space_operations afs_symlink_aops = { - .read_folio = afs_symlink_read_folio, - .release_folio = netfs_release_folio, - .invalidate_folio = netfs_invalidate_folio, - .migrate_folio = filemap_migrate_folio, -}; - static const struct vm_operations_struct afs_vm_ops = { .open = afs_vm_open, .close = afs_vm_close, @@ -208,49 +200,12 @@ int afs_release(struct inode *inode, struct file *file) return ret; } -/* - * Allocate a new read record. - */ -struct afs_read *afs_alloc_read(gfp_t gfp) -{ - struct afs_read *req; - - req = kzalloc(sizeof(struct afs_read), gfp); - if (req) - refcount_set(&req->usage, 1); - - return req; -} - -/* - * Dispose of a ref to a read record. - */ -void afs_put_read(struct afs_read *req) -{ - if (refcount_dec_and_test(&req->usage)) { - if (req->cleanup) - req->cleanup(req); - key_put(req->key); - kfree(req); - } -} - static void afs_fetch_data_notify(struct afs_operation *op) { - struct afs_read *req = op->fetch.req; - struct netfs_io_subrequest *subreq = req->subreq; - int error = afs_op_error(op); - - req->error = error; - if (subreq) { - subreq->rreq->i_size = req->file_size; - if (req->pos + req->actual_len >= req->file_size) - __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - netfs_read_subreq_terminated(subreq, error, false); - req->subreq = NULL; - } else if (req->done) { - req->done(req); - } + struct netfs_io_subrequest *subreq = op->fetch.subreq; + + subreq->error = afs_op_error(op); + netfs_read_subreq_terminated(subreq); } static void afs_fetch_data_success(struct afs_operation *op) @@ -260,7 +215,7 @@ static void afs_fetch_data_success(struct afs_operation *op) _enter("op=%08x", op->debug_id); afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); - atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes); afs_fetch_data_notify(op); } @@ -270,107 +225,188 @@ static void afs_fetch_data_aborted(struct afs_operation *op) afs_fetch_data_notify(op); } -static void afs_fetch_data_put(struct afs_operation *op) -{ - op->fetch.req->error = afs_op_error(op); - afs_put_read(op->fetch.req); -} - -static const struct afs_operation_ops afs_fetch_data_operation = { +const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, .aborted = afs_fetch_data_aborted, .failed = afs_fetch_data_notify, - .put = afs_fetch_data_put, }; +static void 
afs_issue_read_call(struct afs_operation *op) +{ + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags)) + yfs_fs_fetch_data(op); + else + afs_fs_fetch_data(op); +} + +static void afs_end_read(struct afs_operation *op) +{ + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) + afs_fetch_data_success(op); + else if (op->cumul_error.aborted) + afs_fetch_data_aborted(op); + else + afs_fetch_data_notify(op); + + afs_end_vnode_operation(op); + afs_put_operation(op); +} + +/* + * Perform I/O processing on an asynchronous call. The work item carries a ref + * to the call struct that we either need to release or to pass on. + */ +static void afs_read_receive(struct afs_call *call) +{ + struct afs_operation *op = call->op; + enum afs_call_state state; + + _enter(""); + + state = READ_ONCE(call->state); + if (state == AFS_CALL_COMPLETE) + return; + trace_afs_read_recv(op, call); + + while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) { + WRITE_ONCE(call->need_attention, false); + afs_deliver_to_call(call); + state = READ_ONCE(call->state); + } + + if (state < AFS_CALL_COMPLETE) { + netfs_read_subreq_progress(op->fetch.subreq); + if (rxrpc_kernel_check_life(call->net->socket, call->rxcall)) + return; + /* rxrpc terminated the call. */ + afs_set_call_complete(call, call->error, call->abort_code); + } + + op->call_abort_code = call->abort_code; + op->call_error = call->error; + op->call_responded = call->responded; + op->call = NULL; + call->op = NULL; + afs_put_call(call); + + /* If the call failed, then we need to crank the server rotation + * handle and try the next. + */ + if (afs_select_fileserver(op)) { + afs_issue_read_call(op); + return; + } + + afs_end_read(op); +} + +void afs_fetch_data_async_rx(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + afs_read_receive(call); + afs_put_call(call); +} + +void afs_fetch_data_immediate_cancel(struct afs_call *call) +{ + if (call->async) { + afs_get_call(call, afs_call_trace_wake); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_deferred_put_call(call); + flush_work(&call->async_work); + } +} + /* * Fetch file data from the volume. 
*/ -int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct key *key = subreq->rreq->netfs_priv; _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(req->key)); + key_serial(key)); - op = afs_alloc_operation(req->key, vnode->volume); + op = afs_alloc_operation(key, vnode->volume); if (IS_ERR(op)) { - if (req->subreq) - netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); - return PTR_ERR(op); + subreq->error = PTR_ERR(op); + netfs_read_subreq_terminated(subreq); + return; } afs_op_set_vnode(op, 0, vnode); - op->fetch.req = afs_get_read(req); + op->fetch.subreq = subreq; op->ops = &afs_fetch_data_operation; - return afs_do_sync_operation(op); -} - -static void afs_read_worker(struct work_struct *work) -{ - struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); - struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); - struct afs_read *fsreq; - - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return netfs_read_subreq_terminated(subreq, -ENOMEM, false); - - fsreq->subreq = subreq; - fsreq->pos = subreq->start + subreq->transferred; - fsreq->len = subreq->len - subreq->transferred; - fsreq->key = key_get(subreq->rreq->netfs_priv); - fsreq->vnode = vnode; - fsreq->iter = &subreq->io_iter; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - afs_fetch_data(fsreq->vnode, fsreq); - afs_put_read(fsreq); -} - -static void afs_issue_read(struct netfs_io_subrequest *subreq) -{ - INIT_WORK(&subreq->work, afs_read_worker); - queue_work(system_long_wq, &subreq->work); -} -static int afs_symlink_read_folio(struct file *file, struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); - struct afs_read *fsreq; - int ret; + if (subreq->rreq->origin == NETFS_READAHEAD || + subreq->rreq->iocb) { + op->flags |= AFS_OPERATION_ASYNC; - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return -ENOMEM; + if (!afs_begin_vnode_operation(op)) { + subreq->error = afs_put_operation(op); + netfs_read_subreq_terminated(subreq); + return; + } - fsreq->pos = folio_pos(folio); - fsreq->len = folio_size(folio); - fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, - fsreq->pos, fsreq->len); + if (!afs_select_fileserver(op)) { + afs_end_read(op); + return; + } - ret = afs_fetch_data(fsreq->vnode, fsreq); - if (ret == 0) - folio_mark_uptodate(folio); - folio_unlock(folio); - return ret; + afs_issue_read_call(op); + } else { + afs_do_sync_operation(op); + } } static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); + if (file) rreq->netfs_priv = key_get(afs_file_key(file)); rreq->rsize = 256 * 1024; rreq->wsize = 256 * 1024 * 1024; + + switch (rreq->origin) { + case NETFS_READ_SINGLE: + if (!file) { + struct key *key = afs_request_key(vnode->volume->cell); + + if (IS_ERR(key)) + return PTR_ERR(key); + rreq->netfs_priv = key; + } + break; + case NETFS_WRITEBACK: + case NETFS_WRITETHROUGH: + case NETFS_UNBUFFERED_WRITE: + case NETFS_DIO_WRITE: + if (S_ISREG(rreq->inode->i_mode)) + rreq->io_streams[0].avail = true; + break; + case NETFS_WRITEBACK_SINGLE: + default: + break; + } return 0; } diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 
428721bbe4f6..8418813ee043 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -49,6 +49,105 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo return op; } +struct afs_io_locker { + struct list_head link; + struct task_struct *task; + unsigned long have_lock; +}; + +/* + * Unlock the I/O lock on a vnode. + */ +static void afs_unlock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker *locker; + + spin_lock(&vnode->lock); + locker = list_first_entry_or_null(&vnode->io_lock_waiters, + struct afs_io_locker, link); + if (locker) { + list_del(&locker->link); + smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */ + smp_mb__after_atomic(); /* Store have_lock before task state */ + wake_up_process(locker->task); + } else { + clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags); + } + spin_unlock(&vnode->lock); +} + +/* + * Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary + * mutex as lockdep will complain if we unlock it in the wrong thread. + */ +static void afs_lock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/* + * Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex + * as lockdep will complain if we unlock it in the wrong thread. + */ +static int afs_lock_for_io_interruptible(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + int ret = 0; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return 0; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */ + signal_pending(current)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + /* If we got a signal, try to transfer the lock onto the next + * waiter. + */ + if (unlikely(signal_pending(current))) { + spin_lock(&vnode->lock); + if (myself.have_lock) { + spin_unlock(&vnode->lock); + afs_unlock_for_io(vnode); + } else { + list_del(&myself.link); + spin_unlock(&vnode->lock); + } + ret = -ERESTARTSYS; + } + return ret; +} + /* * Lock the vnode(s) being operated upon. 
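The afs_lock_for_io()/afs_unlock_for_io() pair added above replaces the vnode's io_lock mutex with a hand-rolled lock whose ownership is passed directly to the first queued waiter, which is what allows it to be released from a different thread than the one that acquired it (something lockdep forbids for a mutex). A minimal userspace model of the same handoff pattern, with a pthread mutex standing in for vnode->lock and a condition variable standing in for wake_up_process(); all names below are illustrative and not part of the patch:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct waiter {
    struct waiter *next;
    pthread_cond_t cond;
    bool have_lock;
};

struct handoff_lock {
    pthread_mutex_t guard;          /* protects the fields below (plays the role of vnode->lock) */
    bool held;
    struct waiter *head, *tail;     /* FIFO of waiting threads */
};

static void handoff_lock_acquire(struct handoff_lock *l)
{
    struct waiter me = { .next = NULL, .have_lock = false };

    pthread_mutex_lock(&l->guard);
    if (!l->held) {
        l->held = true;             /* uncontended: take the lock directly */
        pthread_mutex_unlock(&l->guard);
        return;
    }

    /* Queue ourselves and sleep until an unlocker hands the lock over. */
    pthread_cond_init(&me.cond, NULL);
    if (l->tail)
        l->tail->next = &me;
    else
        l->head = &me;
    l->tail = &me;

    while (!me.have_lock)
        pthread_cond_wait(&me.cond, &l->guard);
    pthread_mutex_unlock(&l->guard);
    pthread_cond_destroy(&me.cond);
}

static void handoff_lock_release(struct handoff_lock *l)
{
    pthread_mutex_lock(&l->guard);
    struct waiter *w = l->head;
    if (w) {
        /* Transfer ownership to the first waiter without clearing 'held'. */
        l->head = w->next;
        if (!l->head)
            l->tail = NULL;
        w->have_lock = true;
        pthread_cond_signal(&w->cond);
    } else {
        l->held = false;
    }
    pthread_mutex_unlock(&l->guard);
}

The kernel version wakes the waiter with wake_up_process() and publishes have_lock with smp_store_release()/smp_load_acquire() instead of a condition variable, but the ownership-transfer logic is the same.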
*/ @@ -60,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_UNINTR) { - mutex_lock(&vnode->io_lock); + afs_lock_for_io(vnode); op->flags |= AFS_OPERATION_LOCK_0; _leave(" = t [1]"); return true; @@ -72,7 +171,7 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2 > vnode) swap(vnode, vnode2); - if (mutex_lock_interruptible(&vnode->io_lock) < 0) { + if (afs_lock_for_io_interruptible(vnode) < 0) { afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); @@ -81,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op) op->flags |= AFS_OPERATION_LOCK_0; if (vnode2) { - if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { + if (afs_lock_for_io_interruptible(vnode2) < 0) { afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); op->flags &= ~AFS_OPERATION_LOCK_0; _leave(" = f [I 1]"); return false; @@ -104,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_LOCK_1) - mutex_unlock(&vnode2->io_lock); + afs_unlock_for_io(vnode2); if (op->flags & AFS_OPERATION_LOCK_0) - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); } static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, @@ -157,7 +256,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) /* * Tidy up a filesystem cursor and unlock the vnode. */ -static void afs_end_vnode_operation(struct afs_operation *op) +void afs_end_vnode_operation(struct afs_operation *op) { _enter(""); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 098fa034a1cc..1d9ecd5418d8 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -301,19 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op) static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; size_t count_before; int ret; _enter("{%u,%zu,%zu/%llu}", call->unmarshall, call->iov_len, iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -323,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* Extract the returned data length into - * ->actual_len. This may indicate more or less data than was + /* Extract the returned data length into ->remaining. + * This may indicate more or less data than was * requested will be returned. 
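In afs_get_io_locks() earlier in this hunk, the two-vnode case keeps the usual deadlock-avoidance rule: the vnode pointers are compared and swapped so the two I/O locks are always taken in a fixed order, now via afs_lock_for_io_interruptible() instead of the nested mutex lock. The rule in isolation (illustrative sketch, not from the patch):

#include <pthread.h>
#include <stdint.h>

/*
 * Always lock a pair of resources in a stable (address) order so that two
 * threads locking the same pair can never each hold one lock while waiting
 * for the other.
 */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a == b) {
        pthread_mutex_lock(a);
        return;
    }
    if ((uintptr_t)a > (uintptr_t)b) {
        pthread_mutex_t *tmp = a;
        a = b;
        b = tmp;
    }
    pthread_mutex_lock(a);
    pthread_mutex_lock(b);
}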
*/ case 1: @@ -333,42 +333,40 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = umin(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: count_before = call->iov_len; - _debug("extract data %zu/%llu", count_before, req->actual_len); + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); - if (req->subreq) { - req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq, false); - } + subreq->transferred += count_before - call->iov_len; + call->remaining -= count_before - call->iov_len; if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -390,8 +388,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) xdr_decode_AFSCallBack(&bp, call, &vp->scb); xdr_decode_AFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -410,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", .op = afs_FS_FetchData, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", .op = afs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -426,8 +428,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = { */ static void afs_fs_fetch_data64(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; @@ -437,16 +439,19 @@ static void afs_fs_fetch_data64(struct afs_operation *op) if (!call) return afs_op_nomem(op); + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA64); bp[1] = htonl(vp->fid.vid); bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(upper_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->pos)); + bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred)); bp[6] = 0; - bp[7] = 
htonl(lower_32_bits(req->len)); + bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred)); call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); @@ -458,9 +463,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op) */ void afs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_read *req = op->fetch.req; __be32 *bp; if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) @@ -472,16 +477,14 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; - /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); bp[1] = htonl(vp->fid.vid); bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(lower_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->len)); + bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->len + subreq->transferred)); call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); @@ -1733,6 +1736,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, .done = afs_fileserver_probe_result, + .immediate_cancel = afs_fileserver_probe_result, .destructor = afs_fs_get_capabilities_destructor, }; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index a95e77670b49..e9538e91f848 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,8 +25,94 @@ #include "internal.h" #include "afs_fs.h" +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) +{ + size_t size = strlen(op->create.symlink) + 1; + size_t dsize = 0; + char *p; + + if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) + return; + + vnode->directory_size = dsize; + p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + memcpy(p, op->create.symlink, size); + kunmap_local(p); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); +} + +static void afs_put_link(void *arg) +{ + struct folio *folio = virt_to_folio(arg); + + kunmap_local(arg); + folio_put(folio); +} + +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct folio *folio; + char *content; + ssize_t ret; + + if (!dentry) { + /* RCU pathwalk. 
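The afs_put_link()/afs_get_link() pair introduced in this hunk (and afs_readlink() that follows) use the VFS delayed_call mechanism: ->get_link() returns a pointer into a mapped folio and registers a cleanup callback, and the caller runs that callback with do_delayed_call() once it has finished with the string. A stripped-down userspace model of that contract, purely to show the shape of the API (the struct and helpers below are simplified stand-ins, not the kernel definitions):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the kernel's struct delayed_call. */
struct delayed_call {
    void (*fn)(void *arg);
    void *arg;
};

static void set_delayed_call(struct delayed_call *dc, void (*fn)(void *), void *arg)
{
    dc->fn = fn;
    dc->arg = arg;
}

static void do_delayed_call(struct delayed_call *dc)
{
    if (dc->fn)
        dc->fn(dc->arg);
    dc->fn = NULL;
}

static void put_link(void *arg)
{
    free(arg);          /* the kernel code kunmaps and drops a folio reference instead */
}

/* "get_link": hand back the target and tell the caller how to release it. */
static const char *get_link(struct delayed_call *dc)
{
    char *content = strdup("/afs/example.org/some/target");   /* hypothetical target */
    set_delayed_call(dc, put_link, content);
    return content;
}

int main(void)
{
    struct delayed_call done = { NULL, NULL };
    const char *target = get_link(&done);

    printf("link -> %s\n", target);
    do_delayed_call(&done);     /* release, as afs_readlink() does when it is finished */
    return 0;
}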
*/ + if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode)) + return ERR_PTR(-ECHILD); + goto good; + } + + if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto fetch; + + ret = afs_validate(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + + if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto good; + +fetch: + ret = afs_read_single(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + +good: + folio = folioq_folio(vnode->directory, 0); + folio_get(folio); + content = kmap_local_folio(folio, 0); + set_delayed_call(callback, afs_put_link, content); + return content; +} + +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + DEFINE_DELAYED_CALL(done); + const char *content; + int len; + + content = afs_get_link(dentry, d_inode(dentry), &done); + if (IS_ERR(content)) { + do_delayed_call(&done); + return PTR_ERR(content); + } + + len = umin(strlen(content), buflen); + if (copy_to_user(buffer, content, len)) + len = -EFAULT; + do_delayed_call(&done); + return len; +} + static const struct inode_operations afs_symlink_inode_operations = { - .get_link = page_get_link, + .get_link = afs_get_link, + .readlink = afs_readlink, }; static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) @@ -110,7 +196,9 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; - mapping_set_large_folios(inode->i_mapping); + __set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags); + /* Assume locally cached directory data will be valid. */ + __set_bit(AFS_VNODE_DIR_VALID, &vnode->flags); break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. */ @@ -122,13 +210,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } else { inode->i_mode = S_IFLNK | status->mode; inode->i_op = &afs_symlink_inode_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } + inode->i_mapping->a_ops = &afs_dir_aops; inode_nohighmem(inode); + mapping_set_release_always(inode->i_mapping); break; default: dump_vnode(vnode, op->file[0].vnode != vnode ? 
op->file[0].vnode : NULL); @@ -140,15 +228,17 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; + trace_afs_set_dv(vnode, status->data_version); inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ - atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink); } else { vnode->cb_server = op->server; - atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at); + afs_set_cb_promise(vnode, vp->scb.callback.expires_at, + afs_cb_promise_set_new_inode); } write_sequnlock(&vnode->cb_lock); @@ -207,12 +297,17 @@ static void afs_apply_status(struct afs_operation *op, if (vp->update_ctime) inode_set_ctime_to_ts(inode, op->ctime); - if (vnode->status.data_version != status->data_version) + if (vnode->status.data_version != status->data_version) { + trace_afs_set_dv(vnode, status->data_version); data_changed = true; + } vnode->status = *status; if (vp->dv_before + vp->dv_delta != status->data_version) { + trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta, + status->data_version); + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", @@ -223,12 +318,10 @@ static void afs_apply_status(struct afs_operation *op, op->debug_id); vnode->invalid_before = status->data_version; - if (vnode->status.type == AFS_FTYPE_DIR) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_stat_v(vnode, n_inval); - } else { + if (vnode->status.type == AFS_FTYPE_DIR) + afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch); + else set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } change_size = true; data_changed = true; unexpected_jump = true; @@ -258,6 +351,8 @@ static void afs_apply_status(struct afs_operation *op, inode_set_ctime_to_ts(inode, t); inode_set_atime_to_ts(inode, t); } + if (op->ops == &afs_fetch_data_operation) + op->fetch.subreq->rreq->i_size = status->size; } } @@ -273,7 +368,7 @@ static void afs_apply_callback(struct afs_operation *op, if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { if (op->volume->type == AFSVL_RWVOL) vnode->cb_server = op->server; - atomic64_set(&vnode->cb_expires_at, cb->expires_at); + afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb); } } @@ -435,7 +530,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type != AFS_FTYPE_FILE) { + if (vnode->status.type != AFS_FTYPE_FILE && + vnode->status.type != AFS_FTYPE_DIR && + vnode->status.type != AFS_FTYPE_SYMLINK) { vnode->netfs.cache = NULL; return; } @@ -637,6 +734,7 @@ int afs_drop_inode(struct inode *inode) void afs_evict_inode(struct inode *inode) { struct afs_vnode_cache_aux aux; + struct afs_super_info *sbi = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", @@ -648,8 +746,22 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + if ((S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + (inode->i_state & I_DIRTY) && + !sbi->dyn_root) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .for_sync = true, + .range_end = LLONG_MAX, + }; + + afs_single_writepages(inode->i_mapping, &wbc); + } + 
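A directory or symlink whose single-buffer content is still dirty is now flushed before the inode is torn down. For reference, the writeback_control used in the eviction path above requests a full synchronous, data-integrity flush; the values are the same as in the hunk, with explanatory comments added here:

struct writeback_control wbc = {
    .sync_mode = WB_SYNC_ALL,   /* wait for each write to complete, not just start it */
    .for_sync  = true,          /* treat this as a data-integrity sync pass */
    .range_end = LLONG_MAX,     /* cover the whole file (range_start defaults to 0) */
};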
netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); + netfs_free_folioq_buffer(vnode->directory); afs_set_cache_aux(vnode, &aux); netfs_clear_inode_writeback(inode, &aux); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6e1d3c4daf72..90f407774a9a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -130,6 +130,7 @@ struct afs_call { wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ + struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ @@ -162,6 +163,7 @@ struct afs_call { spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ + unsigned long long remaining; /* How much is left to receive */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ @@ -200,11 +202,17 @@ struct afs_call_type { /* clean up a call */ void (*destructor)(struct afs_call *call); + /* Async receive processing function */ + void (*async_rx)(struct work_struct *work); + /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); + + /* Handle a call being immediately cancelled. */ + void (*immediate_cancel)(struct afs_call *call); }; /* @@ -232,28 +240,6 @@ static inline struct key *afs_file_key(struct file *file) } /* - * Record of an outstanding read operation on a vnode. - */ -struct afs_read { - loff_t pos; /* Where to start reading */ - loff_t len; /* How much we're asking for */ - loff_t actual_len; /* How much we're actually getting */ - loff_t file_size; /* File size returned by server */ - struct key *key; /* The key to use to reissue the read */ - struct afs_vnode *vnode; /* The file being read into. 
*/ - struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ - afs_dataversion_t data_version; /* Version number returned by server */ - refcount_t usage; - unsigned int call_debug_id; - unsigned int nr_pages; - int error; - void (*done)(struct afs_read *); - void (*cleanup)(struct afs_read *); - struct iov_iter *iter; /* Iterator representing the buffer */ - struct iov_iter def_iter; /* Default iterator */ -}; - -/* * AFS superblock private data * - there's one superblock per volume */ @@ -701,13 +687,14 @@ struct afs_vnode { struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ - struct mutex io_lock; /* Lock for serialising I/O on this mutex */ + struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; +#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ @@ -718,7 +705,9 @@ struct afs_vnode { #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ +#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ + struct folio_queue *directory; /* Directory contents */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ @@ -727,6 +716,7 @@ struct afs_vnode { ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; + unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ @@ -906,7 +896,7 @@ struct afs_operation { bool new_negative; } rename; struct { - struct afs_read *req; + struct netfs_io_subrequest *subreq; } fetch; struct { afs_lock_type_t type; @@ -958,6 +948,7 @@ struct afs_operation { #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ +#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ }; /* @@ -982,6 +973,21 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl i_size_read(&vnode->netfs.inode), flags); } +/* + * Directory iteration management. 
+ */ +struct afs_dir_iter { + struct afs_vnode *dvnode; + union afs_xdr_dir_block *block; + struct folio_queue *fq; + unsigned int fpos; + int fq_slot; + unsigned int loop_check; + u8 nr_slots; + u8 bucket; + unsigned int prev_entry; +}; + #include <trace/events/afs.h> /*****************************************************************************/ @@ -1063,8 +1069,13 @@ extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc); /* * dir_edit.c @@ -1072,6 +1083,20 @@ extern void afs_check_for_remote_deletion(struct afs_operation *); extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why); +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); + +/* + * dir_search.c + */ +unsigned int afs_dir_hash_name(const struct qstr *name); +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid); +int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* * dir_silly.c @@ -1096,24 +1121,17 @@ extern void afs_dynroot_depopulate(struct super_block *); * file.c */ extern const struct address_space_operations afs_file_aops; -extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct afs_operation_ops afs_fetch_data_operation; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); -extern struct afs_read *afs_alloc_read(gfp_t); -extern void afs_put_read(struct afs_read *); - -static inline struct afs_read *afs_get_read(struct afs_read *req) -{ - refcount_inc(&req->usage); - return req; -} +void afs_fetch_data_async_rx(struct work_struct *work); +void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c @@ -1165,6 +1183,7 @@ extern void afs_fs_store_acl(struct afs_operation *); extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); +extern void afs_end_vnode_operation(struct afs_operation *op); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); @@ -1202,6 +1221,10 @@ extern void 
afs_fs_probe_cleanup(struct afs_net *); */ extern const struct afs_operation_ops afs_fetch_status_operation; +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback); +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); @@ -1331,7 +1354,9 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); +void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_deliver_to_call(struct afs_call *call); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, @@ -1342,6 +1367,28 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); +static inline struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int r; + + __refcount_inc(&call->ref, &r); + + trace_afs_call(call->debug_id, why, r + 1, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + +static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) +{ + int r = refcount_read(&call->ref); + + trace_afs_call(call->debug_id, why, r, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); +} + static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { @@ -1708,6 +1755,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) return -EIO; } +/* + * Set the callback promise on a vnode. + */ +static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, + enum afs_cb_promise_trace trace) +{ + atomic64_set(&vnode->cb_expires_at, expires_at); + trace_afs_cb_promise(vnode, trace); +} + +/* + * Clear the callback promise on a vnode, returning true if it was promised. + */ +static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, + enum afs_cb_promise_trace trace) +{ + trace_afs_cb_promise(vnode, trace); + return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; +} + +/* + * Mark a directory as being invalid. 
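afs_clear_cb_promise() above uses an atomic exchange so that clearing the callback promise and learning whether one was actually outstanding happen in a single step; callers such as the rotation code only bump cb_break when a real promise was revoked. The same idiom in portable C11 atomics (illustrative values and names, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

#define NO_PROMISE (-1LL)       /* stand-in for AFS_NO_CB_PROMISE */

static _Atomic long long cb_expires_at = NO_PROMISE;

/* Returns true only if a promise was actually outstanding when we cleared it. */
static bool clear_promise(void)
{
    return atomic_exchange(&cb_expires_at, NO_PROMISE) != NO_PROMISE;
}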
+ */ +static inline void afs_invalidate_dir(struct afs_vnode *dvnode, + enum afs_dir_invalid_trace trace) +{ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + trace_afs_dir_invalid(dvnode, trace); + afs_stat_v(dvnode, n_inval); + } +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/afs/main.c b/fs/afs/main.c index a14f6013e316..1ae0067f772d 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -177,7 +177,7 @@ static int __init afs_init(void) afs_wq = alloc_workqueue("afs", 0, 0); if (!afs_wq) goto error_afs_wq; - afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 297487ee8323..507c25a5b2cb 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .readlink = page_readlink, + .readlink = afs_readlink, .getattr = afs_getattr, }; @@ -118,9 +118,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ - struct page *page; + DEFINE_DELAYED_CALL(cleanup); + const char *content; loff_t size = i_size_read(d_inode(mntpt)); - char *buf; if (src_as->cell) ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); @@ -128,16 +128,16 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; - page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); + content = afs_get_link(mntpt, d_inode(mntpt), &cleanup); + if (IS_ERR(content)) { + do_delayed_call(&cleanup); + return PTR_ERR(content); + } - buf = kmap(page); ret = -EINVAL; - if (buf[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", buf, size - 1); - kunmap(page); - put_page(page); + if (content[size - 1] == '.') + ret = vfs_parse_fs_string(fc, "source", content, size - 1); + do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 15eab053af6d..e7614f4f30c2 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -240,7 +240,13 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) /* determine command to perform */ _debug("rootcell=%s", buf); - ret = afs_cell_init(net, buf); + ret = -EEXIST; + inode_lock(file_inode(file)); + if (!net->ws_cell) + ret = afs_cell_init(net, buf); + else + printk("busy\n"); + inode_unlock(file_inode(file)); out: _leave(" = %d", ret); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index d612983d6f38..a1c24f589d9e 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -99,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, write_seqlock(&vnode->cb_lock); ASSERTCMP(cb_server, ==, vnode->cb_server); vnode->cb_server = NULL; - if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server)) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); } @@ -583,7 +583,7 @@ selected_server: if (vnode->cb_server != server) { vnode->cb_server = server; vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); - 
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change); } retry_server: diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index c453428f3c8b..886416ea1d96 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,6 +18,7 @@ struct workqueue_struct *afs_async_calls; +static void afs_deferred_free_worker(struct work_struct *work); static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); @@ -148,7 +149,9 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->net = net; call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); - INIT_WORK(&call->async_work, afs_process_async_call); + INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call); + INIT_WORK(&call->work, call->type->work); + INIT_WORK(&call->free_work, afs_deferred_free_worker); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); call->iter = &call->def_iter; @@ -159,6 +162,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, return call; } +static void afs_free_call(struct afs_call *call) +{ + struct afs_net *net = call->net; + int o; + + ASSERT(!work_pending(&call->async_work)); + + rxrpc_kernel_put_peer(call->peer); + + if (call->rxcall) { + rxrpc_kernel_shutdown_call(net->socket, call->rxcall); + rxrpc_kernel_put_call(net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); + + afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); + kfree(call->request); + + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, + __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); + if (o == 0) + wake_up_var(&net->nr_outstanding_calls); +} + /* * Dispose of a reference on a call. */ @@ -173,45 +206,34 @@ void afs_put_call(struct afs_call *call) o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); + if (zero) + afs_free_call(call); +} - if (zero) { - ASSERT(!work_pending(&call->async_work)); - ASSERT(call->type->name != NULL); - - rxrpc_kernel_put_peer(call->peer); - - if (call->rxcall) { - rxrpc_kernel_shutdown_call(net->socket, call->rxcall); - rxrpc_kernel_put_call(net->socket, call->rxcall); - call->rxcall = NULL; - } - if (call->type->destructor) - call->type->destructor(call); - - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - kfree(call->request); - - trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, - __builtin_return_address(0)); - kfree(call); +static void afs_deferred_free_worker(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, free_work); - o = atomic_dec_return(&net->nr_outstanding_calls); - if (o == 0) - wake_up_var(&net->nr_outstanding_calls); - } + afs_free_call(call); } -static struct afs_call *afs_get_call(struct afs_call *call, - enum afs_call_trace why) +/* + * Dispose of a reference on a call, deferring the cleanup to a workqueue + * to avoid lock recursion. 
+ */ +void afs_deferred_put_call(struct afs_call *call) { - int r; - - __refcount_inc(&call->ref, &r); + struct afs_net *net = call->net; + unsigned int debug_id = call->debug_id; + bool zero; + int r, o; - trace_afs_call(call->debug_id, why, r + 1, - atomic_read(&call->net->nr_outstanding_calls), + zero = __refcount_dec_and_test(&call->ref, &r); + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); - return call; + if (zero) + schedule_work(&call->free_work); } /* @@ -220,8 +242,6 @@ static struct afs_call *afs_get_call(struct afs_call *call, static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - INIT_WORK(&call->work, call->type->work); - afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); @@ -396,11 +416,16 @@ void afs_make_call(struct afs_call *call, gfp_t gfp) return; error_do_abort: - if (ret != -ECONNABORTED) { + if (ret != -ECONNABORTED) rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, afs_abort_send_data_error); - } else { + if (call->async) { + afs_see_call(call, afs_call_trace_async_abort); + return; + } + + if (ret == -ECONNABORTED) { len = 0; iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, @@ -411,8 +436,10 @@ error_do_abort: call->error = ret; trace_afs_call_done(call); error_kill_call: - if (call->type->done) - call->type->done(call); + if (call->async) + afs_see_call(call, afs_call_trace_async_kill); + if (call->type->immediate_cancel) + call->type->immediate_cancel(call); /* We need to dispose of the extra ref we grabbed for an async call. * The call, however, might be queued on afs_async_calls and we need to @@ -467,7 +494,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) /* * deliver messages to a call */ -static void afs_deliver_to_call(struct afs_call *call) +void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; size_t len; @@ -568,7 +595,6 @@ local_abort: abort_code = 0; call_complete: afs_set_call_complete(call, ret, remote_abort); - state = AFS_CALL_COMPLETE; goto done; } @@ -640,7 +666,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, } /* - * wake up an asynchronous call + * Wake up an asynchronous call. The caller is holding the call notify + * spinlock around this, so we can't call afs_put_call(). 
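afs_deferred_put_call() exists because the rxrpc notification callback runs with the call-notify spinlock held, so dropping the last reference there cannot run the sleeping destructor directly; the final free is punted to a workqueue instead. A compact userspace model of "drop a reference now, defer the actual free to a worker" (all names illustrative; the patch schedules a per-call work item rather than batching on a list, but the point is the same):

#include <stdatomic.h>
#include <stdlib.h>

struct object {
    atomic_int ref;
    struct object *free_next;   /* link on the deferred-free list */
};

static struct object *_Atomic deferred_free_list;

/*
 * Safe to call from a context that must not free immediately (for example
 * while holding a lock the destructor would also take): the last reference
 * just queues the object for later destruction.
 */
static void deferred_put(struct object *obj)
{
    if (atomic_fetch_sub(&obj->ref, 1) == 1) {
        struct object *head = atomic_load(&deferred_free_list);

        do {
            obj->free_next = head;
        } while (!atomic_compare_exchange_weak(&deferred_free_list, &head, obj));
    }
}

/* Run later from a worker thread, outside the awkward context. */
static void run_deferred_frees(void)
{
    struct object *obj = atomic_exchange(&deferred_free_list, (struct object *)NULL);

    while (obj) {
        struct object *next = obj->free_next;
        free(obj);
        obj = next;
    }
}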
*/ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) @@ -657,7 +684,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) - afs_put_call(call); + afs_deferred_put_call(call); } } @@ -768,6 +795,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return -ENOTSUPP; trace_afs_cb_call(call); + call->work.func = call->type->work; /* pass responsibility for the remainer of this message off to the * cache manager op */ diff --git a/fs/afs/super.c b/fs/afs/super.c index f3ba1c3e72f5..a9bee610674e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -663,7 +663,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->netfs.inode); - mutex_init(&vnode->io_lock); + INIT_LIST_HEAD(&vnode->io_lock_waiters); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); @@ -696,6 +696,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; + vnode->directory = NULL; + vnode->directory_size = 0; vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; diff --git a/fs/afs/validation.c b/fs/afs/validation.c index bef8af12ebe2..0ba8336c9025 100644 --- a/fs/afs/validation.c +++ b/fs/afs/validation.c @@ -120,22 +120,31 @@ bool afs_check_validity(const struct afs_vnode *vnode) { const struct afs_volume *volume = vnode->volume; + enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace; + time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at); time64_t deadline = ktime_get_real_seconds() + 10; if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) return true; - if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || - atomic64_read(&vnode->cb_expires_at) <= deadline || - volume->cb_expires_at <= deadline || - vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || - vnode->cb_scrub != atomic_read(&volume->cb_scrub) || - test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - _debug("inval"); - return false; - } - - return true; + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) + trace = afs_vnode_invalid_trace_cb_v_break; + else if (cb_expires_at == AFS_NO_CB_PROMISE) + trace = afs_vnode_invalid_trace_no_cb_promise; + else if (cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_expired; + else if (volume->cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_vol_expired; + else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot)) + trace = afs_vnode_invalid_trace_cb_ro_snapshot; + else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub)) + trace = afs_vnode_invalid_trace_cb_scrub; + else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + trace = afs_vnode_invalid_trace_zap_data; + else + return true; + trace_afs_vnode_invalid(vnode, trace); + return false; } /* diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 9f36e14f1c2d..f9e76b604f31 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -253,6 +253,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) { struct afs_cell *master; + size_t name_len; char *cell_name; cell_name = afs_vl_get_cell_name(cell, key); @@ -264,8 +265,11 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key 
*key) return 0; } - master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name), - NULL, false); + name_len = strlen(cell_name); + if (!name_len || name_len > AFS_MAXCELLNAME) + master = ERR_PTR(-EOPNOTSUPP); + else + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index cac75f89b64a..3a23c0b08eb6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -370,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { .name = "VL.GetCapabilities", .op = afs_VL_GetCapabilities, .deliver = afs_deliver_vl_get_capabilities, + .immediate_cancel = afs_vlserver_probe_result, .done = afs_vlserver_probe_result, .destructor = afs_destroy_vl_get_capabilities, }; @@ -697,7 +698,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return ret; namesz = ntohl(call->tmp); - if (namesz > AFS_MAXCELLNAME) + if (namesz > YFS_VL_MAXCELLNAME) return afs_protocol_error(call, afs_eproto_cellname_len); paddedsz = (namesz + 3) & ~3; call->count = namesz; diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834..18b0a9f1615e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: @@ -179,8 +182,8 @@ void afs_issue_write(struct netfs_io_subrequest *subreq) */ void afs_begin_writeback(struct netfs_io_request *wreq) { - afs_get_writeback_key(wreq); - wreq->io_streams[0].avail = true; + if (S_ISREG(wreq->inode->i_mode)) + afs_get_writeback_key(wreq); } /* @@ -193,6 +196,18 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); + switch (wreq->origin) { + case NETFS_READAHEAD: + case NETFS_READPAGE: + case NETFS_READ_GAPS: + case NETFS_READ_SINGLE: + case NETFS_READ_FOR_WRITE: + case NETFS_DIO_READ: + return; + default: + break; + } + switch (subreq->error) { case -EACCES: case -EPERM: diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h index 8ca868164507..cc5f143d21a3 100644 --- a/fs/afs/xdr_fs.h +++ b/fs/afs/xdr_fs.h @@ -88,7 +88,7 @@ union afs_xdr_dir_block { struct { struct afs_xdr_dir_hdr hdr; - u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR]; __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; } meta; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 024227aba4cd..257af259c04a 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -352,19 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) static int yfs_deliver_fs_fetch_data64(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; size_t count_before; int ret; _enter("{%u,%zu, %zu/%llu}", call->unmarshall, call->iov_len, 
iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; @@ -379,42 +379,39 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = min(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: count_before = call->iov_len; - _debug("extract data %zu/%llu", count_before, req->actual_len); + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); - if (req->subreq) { - req->subreq->transferred += count_before - call->iov_len; - netfs_read_subreq_progress(req->subreq, false); - } + subreq->transferred += count_before - call->iov_len; if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -439,8 +436,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) xdr_decode_YFSCallBack(&bp, call, &vp->scb); xdr_decode_YFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -459,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) static const struct afs_call_type yfs_RXYFSFetchData64 = { .name = "YFS.FetchData64", .op = yfs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = yfs_deliver_fs_fetch_data64, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -468,14 +467,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = { */ void yfs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; - _enter(",%x,{%llx:%llu},%llx,%llx", + _enter(",%x,{%llx:%llu},%llx,%zx", key_serial(op->key), vp->fid.vid, vp->fid.vnode, - req->pos, req->len); + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, sizeof(__be32) * 2 + @@ -487,15 +487,16 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); - bp = xdr_encode_u64(bp, 
req->pos); - bp = xdr_encode_u64(bp, req->len); + bp = xdr_encode_u64(bp, subreq->start + subreq->transferred); + bp = xdr_encode_u64(bp, subreq->len - subreq->transferred); yfs_check_req(call, bp); call->fid = vp->fid; @@ -666,8 +667,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call) static void yfs_done_fs_remove_file2(struct afs_call *call) { if (call->error == -ECONNABORTED && - call->abort_code == RX_INVALID_OPERATION) { - set_bit(AFS_SERVER_FL_NO_RM2, &call->server->flags); + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RM2, &call->op->server->flags); call->op->flags |= AFS_OPERATION_DOWNGRADE; } } @@ -1335,7 +1335,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, if (until == 0 || ret < 0 || ret >= min_nr) return ret; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (until != KTIME_MAX) { hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); @@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); diff --git a/fs/attr.c b/fs/attr.c index c04d19b58f12..9caf63d20d03 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -272,6 +272,47 @@ out_big: EXPORT_SYMBOL(inode_newsize_ok); /** + * setattr_copy_mgtime - update timestamps for mgtime inodes + * @inode: inode timestamps to be updated + * @attr: attrs for the update + * + * With multigrain timestamps, take more care to prevent races when + * updating the ctime. Always update the ctime to the very latest using + * the standard mechanism, and use that to populate the atime and mtime + * appropriately (unless those are being set to specific values). + */ +static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct timespec64 now; + + if (ia_valid & ATTR_CTIME) { + /* + * In the case of an update for a write delegation, we must respect + * the value in ia_ctime and not use the current time. + */ + if (ia_valid & ATTR_DELEG) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else + now = inode_set_ctime_current(inode); + } else { + /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. 
*/ + WARN_ON_ONCE(ia_valid & ATTR_MTIME); + now = current_time(inode); + } + + if (ia_valid & ATTR_ATIME_SET) + inode_set_atime_to_ts(inode, attr->ia_atime); + else if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, now); + + if (ia_valid & ATTR_MTIME_SET) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + else if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, now); +} + +/** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated @@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(inode, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, mode &= ~S_ISGID; inode->i_mode = mode; } + + if (is_mgtime(inode)) + return setattr_copy_mgtime(inode, attr); + + if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) { + if (ia_valid & ATTR_DELEG) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else + inode_set_ctime_to_ts(inode, attr->ia_ctime); + } } EXPORT_SYMBOL(setattr_copy); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index f011e026358e..6d57efbb8110 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { + unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); @@ -133,7 +134,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); - if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 
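setattr_copy_mgtime(), completed just above, orders the timestamp updates so that the ctime is stamped first and any atime/mtime update that is not an explicit *_SET request reuses that same stamp. A small userspace model of that decision table, ignoring the write-delegation (ATTR_DELEG) case; the flag values and names here are invented for the sketch:

#include <time.h>

#define ATTR_ATIME      0x01
#define ATTR_MTIME      0x02
#define ATTR_CTIME      0x04
#define ATTR_ATIME_SET  0x08
#define ATTR_MTIME_SET  0x10

struct times { struct timespec atime, mtime, ctime; };

static void mgtime_update(struct times *t, unsigned int ia_valid,
                          const struct times *attr)
{
    struct timespec now;

    clock_gettime(CLOCK_REALTIME, &now);
    if (ia_valid & ATTR_CTIME)
        t->ctime = now;             /* the kernel takes a fresh stamp via inode_set_ctime_current() */

    if (ia_valid & ATTR_ATIME_SET)
        t->atime = attr->atime;     /* explicit value from the caller */
    else if (ia_valid & ATTR_ATIME)
        t->atime = now;             /* reuse the same stamp as the ctime */

    if (ia_valid & ATTR_MTIME_SET)
        t->mtime = attr->mtime;
    else if (ia_valid & ATTR_MTIME)
        t->mtime = now;
}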
0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", @@ -141,8 +142,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } } else { - unsigned int inr = _IOC_NR(cmd); - if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { diff --git a/fs/backing-file.c b/fs/backing-file.c index 8860dac58c37..763fbe9b72b2 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -80,7 +80,7 @@ struct backing_aio { refcount_t ref; struct kiocb *orig_iocb; /* used for aio completion */ - void (*end_write)(struct file *); + void (*end_write)(struct kiocb *iocb, ssize_t); struct work_struct work; long res; }; @@ -108,10 +108,10 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res) struct kiocb *iocb = &aio->iocb; struct kiocb *orig_iocb = aio->orig_iocb; + orig_iocb->ki_pos = iocb->ki_pos; if (aio->end_write) - aio->end_write(orig_iocb->ki_filp); + aio->end_write(orig_iocb, res); - orig_iocb->ki_pos = iocb->ki_pos; backing_aio_put(aio); } @@ -200,7 +200,7 @@ out: revert_creds(old_cred); if (ctx->accessed) - ctx->accessed(ctx->user_file); + ctx->accessed(iocb->ki_filp); return ret; } @@ -219,7 +219,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, if (!iov_iter_count(iter)) return 0; - ret = file_remove_privs(ctx->user_file); + ret = file_remove_privs(iocb->ki_filp); if (ret) return ret; @@ -239,7 +239,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); if (ctx->end_write) - ctx->end_write(ctx->user_file); + ctx->end_write(iocb, ret); } else { struct backing_aio *aio; @@ -270,7 +270,7 @@ out: } EXPORT_SYMBOL_GPL(backing_file_write_iter); -ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, +ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx) @@ -282,19 +282,19 @@ ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, return -EIO; old_cred = override_creds(ctx->cred); - ret = vfs_splice_read(in, ppos, pipe, len, flags); + ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); revert_creds(old_cred); if (ctx->accessed) - ctx->accessed(ctx->user_file); + ctx->accessed(iocb->ki_filp); return ret; } EXPORT_SYMBOL_GPL(backing_file_splice_read); ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags, + struct file *out, struct kiocb *iocb, + size_t len, unsigned int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; @@ -306,18 +306,18 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (!out->f_op->splice_write) return -EINVAL; - ret = file_remove_privs(ctx->user_file); + ret = file_remove_privs(iocb->ki_filp); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = out->f_op->splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); file_end_write(out); revert_creds(old_cred); if (ctx->end_write) - ctx->end_write(ctx->user_file); + ctx->end_write(iocb, ret); return ret; } @@ -327,10 +327,10 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx) { const struct cred *old_cred; + struct file *user_file = vma->vm_file; int ret; - if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || - 
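
The autofs change above works because an ioctl command word packs direction, size, type and number fields, so comparing the full word against the bare AUTOFS_DEV_IOCTL_TIMEOUT_CMD number can never match; _IOC_NR() extracts just the number before the comparison. A small illustrative fragment, assuming the usual _IOWR() definition of the autofs device ioctls:

/* The full command word as user space passes it: */
unsigned int cmd = _IOWR(AUTOFS_IOCTL, AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
			 struct autofs_dev_ioctl);

/* Never true: the word also carries dir/size/type bits. */
bool broken = (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD);

/* True: _IOC_NR() strips those bits back off. */
bool fixed  = (_IOC_NR(cmd) == AUTOFS_DEV_IOCTL_TIMEOUT_CMD);
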
WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!file->f_op->mmap) @@ -343,7 +343,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, revert_creds(old_cred); if (ctx->accessed) - ctx->accessed(ctx->user_file); + ctx->accessed(user_file); return ret; } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367..ab6c95b895b3 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,6 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 4e4a448f6931..c84a91572a1d 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c) continue; } + if (k.k->p.offset < ca->mi.first_bucket) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + continue; + } + + if (k.k->p.offset >= ca->mi.nbuckets) { + bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + struct bch_alloc_v4 a; *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 0; @@ -1967,7 +1977,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) ca->mi.bucket_size, GFP_KERNEL); - int ret = bch2_trans_do(c, NULL, NULL, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_WATERMARK_btree| BCH_TRANS_COMMIT_no_enospc, bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); @@ -2127,14 +2137,15 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped); ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; if (ret) - break; + goto restart_err; if (!k.k) break; ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); +restart_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; if (ret) break; @@ -2340,24 +2351,19 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) /* Bucket IO clocks: */ -int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, - size_t bucket_nr, int rw) +static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_alloc_v4 *a; - u64 now; - int ret = 0; - - if (bch2_trans_relock(trans)) - bch2_trans_begin(trans); - a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); - ret = PTR_ERR_OR_ZERO(a); + struct btree_iter iter; + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr)); + int ret = PTR_ERR_OR_ZERO(a); if (ret) return ret; - now = bch2_current_io_time(c, rw); + u64 now = bch2_current_io_time(c, rw); if (a->v.io_time[rw] == now) goto out; @@ -2370,6 +2376,15 @@ out: return ret; } +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + if (bch2_trans_relock(trans)) + bch2_trans_begin(trans); + + return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw)); +} + /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f8e87c6721b1..163a67b97a40 100644 --- 
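
With the fs/backing-file.c changes above, the context hooks receive the user-facing kiocb (and, for end_write, the result) instead of a bare struct file, and backing_aio_cleanup() copies ki_pos back before invoking end_write, so the hook sees the final position. A minimal sketch of a hook adapted to the new signature; examplefs_end_write() and examplefs_file_modified() are hypothetical names:

static void examplefs_end_write(struct kiocb *iocb, ssize_t ret)
{
	/* iocb->ki_pos has already been updated by the backing-file code */
	if (ret > 0)
		examplefs_file_modified(iocb->ki_filp);	/* hypothetical helper */
}

static const struct backing_file_ctx ctx = {
	/* .cred, .accessed, ... as before */
	.end_write = examplefs_end_write,
};
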
a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type) static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { + if (a.data_type >= BCH_DATA_NR) + return 0; + if (!data_type_movable(a.data_type) || !bch2_bucket_sectors_fragmented(ca, a)) return 0; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index d0e0b56892e3..372178c8d416 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; + rcu_read_unlock(); + ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = ob - c->open_buckets; @@ -684,7 +688,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, struct bch_dev_usage usage; struct open_bucket *ob; - bch2_trans_do(c, NULL, NULL, 0, + bch2_trans_do(c, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, false, &usage))); return ob; @@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark); + avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, ob); @@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, --c->open_buckets_partial_nr; swap(c->open_buckets_partial[i], c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); spin_lock(&c->freelist_lock); @@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - (!ca || ob->dev == ca->dev_idx)) + if (ob->valid && (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 47455a85c909..654a58132a4d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -52,6 +52,12 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + int ret = 0; + + bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, + c, backpointer_level_bad, + "backpointer level bad: %u >= %u", + bp.v->level, BTREE_MAX_DEPTH); rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); @@ -64,7 +70,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); rcu_read_unlock(); - int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || 
!bpos_eq(bp.k->p, bp_pos), @@ -947,9 +952,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, - struct bkey_s_c_backpointer bp, + struct bkey_s_c bp_k, struct bkey_buf *last_flushed) { + if (bp_k.k->type != KEY_TYPE_backpointer) + return 0; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bch_fs *c = trans->c; struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); @@ -1004,9 +1013,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed); + check_one_backpointer(trans, start, end, k, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f4151ee51b03..e94a83b8113e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,6 +555,7 @@ struct bch_dev { u64 alloc_cursor[3]; unsigned nr_open_buckets; + unsigned nr_partial_buckets; unsigned nr_btree_reserve; size_t inc_gen_needs_gc; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 84832c2d4df9..5004f6ba997c 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -678,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ - x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 587d7318a2e8..995ba32e9b6e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -643,7 +643,7 @@ int bch2_bkey_format_invalid(struct bch_fs *c, enum bch_validate_flags flags, struct printbuf *err) { - unsigned i, bits = KEY_PACKED_BITS_START; + unsigned bits = KEY_PACKED_BITS_START; if (f->nr_fields != BKEY_NR_FIELDS) { prt_printf(err, "incorrect number of fields: got %u, should be %u", @@ -655,9 +655,8 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * Verify that the packed format can't represent fields larger than the * unpacked format: */ - for (i = 0; i < f->nr_fields; i++) { - if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && - bch2_bkey_format_field_overflows(f, i)) { + for (unsigned i = 0; i < f->nr_fields; i++) { + if (bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); unsigned packed_bits = min(64, f->bits_per_field[i]); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6e4afb2b5441..7123019ab3bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -59,16 +59,38 @@ static inline size_t btree_cache_can_free(struct btree_cache_list *list) static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); + if (b->c.lock.readers) - list_move(&b->list, &bc->freed_pcpu); + list_add(&b->list, &bc->freed_pcpu); else - list_move(&b->list, &bc->freed_nonpcpu); + list_add(&b->list, &bc->freed_nonpcpu); +} + +static void __bch2_btree_node_to_freelist(struct btree_cache 
*bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(!b->data); + + bc->nr_freeable++; + list_add(&b->list, &bc->freeable); } -static void btree_node_data_free(struct bch_fs *c, struct btree *b) +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + mutex_lock(&bc->lock); + __bch2_btree_node_to_freelist(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); /* @@ -94,11 +116,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->nr_freeable--; - btree_node_to_freedlist(bc, b); } +static void btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); + --bc->nr_freeable; + __btree_node_data_free(bc, b); +} + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -174,21 +202,10 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) bch2_btree_lock_init(&b->c, 0); - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); return b; } -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) { struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); @@ -236,11 +253,11 @@ void bch2_btree_cache_unpin(struct bch_fs *c) /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { lockdep_assert_held(&bc->lock); - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); /* Cause future lookups for this node to fail: */ @@ -248,17 +265,22 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) if (b->c.btree_id < BTREE_ID_NR) --bc->nr_by_btree[b->c.btree_id]; + --bc->live[btree_node_pinned(b)].nr; + list_del_init(&b->list); +} - bc->live[btree_node_pinned(b)].nr--; - bc->nr_freeable++; - list_move(&b->list, &bc->freeable); +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + __bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_to_freelist(bc, b); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); BUG_ON(b->hash_val); - b->hash_val = btree_ptr_hash_val(&b->key); + b->hash_val = btree_ptr_hash_val(&b->key); int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); if (ret) @@ -270,10 +292,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) bool p = __btree_node_pinned(bc, b); mod_bit(BTREE_NODE_pinned, &b->flags, p); - list_move_tail(&b->list, &bc->live[p].list); + list_add_tail(&b->list, &bc->live[p].list); bc->live[p].nr++; - - bc->nr_freeable--; return 0; } @@ -485,7 +505,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, goto out; if (!btree_node_reclaim(c, b, true)) { - btree_node_data_free(c, b); + 
btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; @@ -501,10 +521,10 @@ restart: bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; } else if (!btree_node_reclaim(c, b, true)) { - bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_hash_remove(bc, b); + __btree_node_data_free(bc, b); freed++; - btree_node_data_free(c, b); bc->nr_freed++; six_unlock_write(&b->c.lock); @@ -587,7 +607,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); } BUG_ON(!bch2_journal_error(&c->journal) && @@ -786,8 +806,8 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); -got_node: +got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: @@ -796,7 +816,12 @@ got_node: if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); + + list_del_init(&b2->list); + --bc->nr_freeable; btree_node_to_freedlist(bc, b2); + mutex_unlock(&bc->lock); + six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -810,11 +835,8 @@ got_node: goto err; } - mutex_lock(&bc->lock); - bc->nr_freeable++; got_mem: - mutex_unlock(&bc->lock); - + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -845,7 +867,7 @@ err: if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); clear_btree_node_just_written(b2); - bch2_btree_node_hash_remove(bc, b2); + __bch2_btree_node_hash_remove(bc, b2); if (b) { swap(b->data, b2->data); @@ -855,9 +877,9 @@ err: six_unlock_intent(&b2->c.lock); } else { b = b2; - list_del_init(&b->list); } + BUG_ON(!list_empty(&b->list)); mutex_unlock(&bc->lock); trace_and_count(c, btree_cache_cannibalize, trans); @@ -936,7 +958,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b->hash_val = 0; mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); mutex_unlock(&bc->lock); six_unlock_write(&b->c.lock); @@ -1312,9 +1334,12 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - if (!IS_ERR_OR_NULL(b)) + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + if (b) six_unlock_read(&b->c.lock); - return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); + return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1353,7 +1378,7 @@ wait_on_io: mutex_lock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 367acd217c6a..66e86d1a178d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -14,7 +14,9 @@ void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); +void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); + int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, 
unsigned, enum btree_id); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 771154e3a291..81dcf9e512c0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -182,7 +182,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) bch2_btree_node_drop_keys_outside_node(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new->k_i); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); @@ -820,12 +820,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * fix that here: */ alloc_data_type_set(&gc, gc.data_type); - if (gc.data_type != old_gc.data_type || gc.dirty_sectors != old_gc.dirty_sectors) { ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc); if (ret) return ret; + + /* + * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not + * safe w.r.t. transaction restarts, so fixup the gc_bucket so + * we don't run it twice: + */ + percpu_down_read(&c->mark_lock); + struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); + gc_m->data_type = gc.data_type; + gc_m->dirty_sectors = gc.dirty_sectors; + percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -1224,17 +1234,20 @@ int bch2_gc_gens(struct bch_fs *c) u64 b, start_time = local_clock(); int ret; - /* - * Ideally we would be using state_lock and not gc_gens_lock here, but that - * introduces a deadlock in the RO path - we currently take the state - * lock at the start of going RO, thus the gc thread may get stuck: - */ if (!mutex_trylock(&c->gc_gens_lock)) return 0; trace_and_count(c, gc_gens_start, c); - down_read(&c->state_lock); + /* + * We have to use trylock here. Otherwise, we would + * introduce a deadlock in the RO path - we take the + * state lock at the start of going RO. + */ + if (!down_read_trylock(&c->state_lock)) { + mutex_unlock(&c->gc_gens_lock); + return 0; + } for_each_member_device(c, ca) { struct bucket_gens *gens = bucket_gens(ca); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 1c1448b52207..839d68802e42 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -733,11 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) { + offset, sectors, ptr_written ?: btree_sectors(c))) i->u64s = 0; - ret = 0; - goto out; - } btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, @@ -829,7 +826,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } -out: fsck_err: printbuf_exit(&buf2); printbuf_exit(&buf1); @@ -1838,10 +1834,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) struct btree_trans *trans = bch2_trans_get(c); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - __btree_node_write_done(c, b); - six_unlock_read(&b->c.lock); + /* we don't need transaction context anymore after we got the lock. 
*/ bch2_trans_put(trans); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); } static void btree_node_write_work(struct work_struct *work) @@ -1870,7 +1867,7 @@ static void btree_node_write_work(struct work_struct *work) } } else { - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index bfe9f0c1e1be..eef9b89c561d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); + if (!k.k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " at btree "); + bch2_btree_pos_to_text(&buf, c, l->b); + + ret = bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } bch2_bkey_buf_reassemble(out, c, k); @@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); +err: bch2_btree_and_journal_iter_exit(&jiter); return ret; } @@ -2381,9 +2394,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e else iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) : + iter->flags & BTREE_ITER_is_extents ? 
bkey_ge(iter_pos, end) : + bkey_gt(iter_pos, end))) goto end; break; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 78e63ad7d380..0bda054f80d7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(&(_iter))) + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) @@ -904,6 +912,8 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); _ret; \ }) +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) + struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 1e694fedc5da..30131c3bdd97 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) return; + if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) + return; + rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), @@ -183,7 +186,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, .ptrs[0].offset = offset, .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), }; rcu_read_unlock(); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 514df618548e..5d809e8bd170 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -668,7 +668,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, struct disk_reservation *disk_res, int flags, enum btree_iter_update_trigger_flags iter_flags) { - return bch2_trans_do(c, disk_res, NULL, flags, + return bch2_trans_commit_do(c, disk_res, NULL, flags, bch2_btree_insert_trans(trans, id, k, iter_flags)); } @@ -865,7 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_do(c, NULL, NULL, + ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|commit_flags, __bch2_trans_log_msg(trans, &buf, u64s)); } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 6a454f2fa005..70b3c989fac2 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -192,7 +192,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_flags))) -#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) #define trans_for_each_update(_trans, _i) \ diff 
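
After the rename above, bch2_trans_commit_do() keeps the old five-argument behaviour (run the body, then bch2_trans_commit() with the given reservation, journal sequence and flags), while the new two-argument bch2_trans_do() is just bch2_trans_run() plus lockrestart_do() around the body, leaving any commit to the body itself. Both usages, lifted from the hunks above (disk_res, flags, id, k, iter_flags and a are assumed to be in scope):

/* Commit variant: body, then bch2_trans_commit(trans, disk_res, journal_seq, flags) */
ret = bch2_trans_commit_do(c, disk_res, NULL, flags,
			   bch2_btree_insert_trans(trans, id, k, iter_flags));

/* Plain variant: transaction plus lockrestart handling only */
ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
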
--git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 190bc1e81756..d596ef93239f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -237,10 +237,6 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) BUG_ON(b->will_make_reachable); clear_btree_node_noevict(b); - - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); } static void bch2_btree_node_free_inmem(struct btree_trans *trans, @@ -252,12 +248,12 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_lock_write_nofail(trans, path, &b->c); + __btree_node_free(trans, b); + mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -289,7 +285,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); BUG_ON(p->nr >= ARRAY_SIZE(p->b)); @@ -521,8 +517,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + bch2_btree_node_to_freelist(c, b); } } } @@ -1434,6 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } } +static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) +{ + if (insert_keys) + for_each_keylist_key(insert_keys, k) + if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) + return true; + return false; +} + /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) @@ -1441,7 +1445,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, static void __btree_split_node(struct btree_update *as, struct btree_trans *trans, struct btree *b, - struct btree *n[2]) + struct btree *n[2], + struct keylist *insert_keys) { struct bkey_packed *k; struct bpos n1_pos = POS_MIN; @@ -1476,7 +1481,8 @@ static void __btree_split_node(struct btree_update *as, if (b->c.level && u64s < n1_u64s && u64s + k->u64s >= n1_u64s && - bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || + key_deleted_in_insert(insert_keys, uk.p))) n1_u64s += k->u64s; i = u64s >= n1_u64s; @@ -1603,7 +1609,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - __btree_split_node(as, trans, b, n); + __btree_split_node(as, trans, b, n, keys); if (keys) { btree_split_insert_keys(as, trans, path, n1, keys); @@ -2239,10 +2245,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - async_btree_node_rewrite_trans(trans, a)); + int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); bch_err_fn_ratelimited(c, ret); bch2_write_ref_put(c, 
BCH_WRITE_REF_node_rewrite); kfree(a); @@ -2394,7 +2398,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (new_hash) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - bch2_btree_node_hash_remove(&c->btree_cache, b); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 3f56b584f8ec..1639c60dffa0 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -277,6 +277,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; + bch2_trans_unlock(trans); bch2_trans_begin(trans); @@ -491,7 +495,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, + bool *did_work) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; @@ -502,6 +507,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; + /* * On memory allocation failure, bch2_btree_write_buffer_flush_locked() * is not guaranteed to empty wb->inc: @@ -521,17 +528,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool did_work = false; - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; + bool did_work = false; trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); +} + +/* + * The write buffer requires flushing when going RO: keys in the journal for the + * write buffer don't have a journal pin yet + */ +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) +{ + if (bch2_journal_error(&c->journal)) + return false; + + bool did_work = false; + bch2_trans_run(c, btree_write_buffer_flush_seq(trans, + journal_cur_seq(&c->journal), &did_work)); + return did_work; } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 725e79654216..d535cea28bde 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -21,6 +21,7 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 546cd01a72e3..ec7d9a59bea9 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1160,11 
+1160,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) #define SECTORS_CACHE 1024 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { struct bch_fs_pcpu *pcpu; u64 old, get; - s64 sectors_available; + u64 sectors_available; int ret; percpu_down_read(&c->mark_lock); @@ -1202,6 +1202,9 @@ recalculate: percpu_u64_set(&c->pcpu->sectors_available, 0); sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) + sectors = min(sectors, sectors_available); + if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e2cb7b24b220..ccc78bfe2fd4 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -103,12 +103,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b) +static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) +{ + u8 *gen = bucket_gen(ca, b); + return gen ? *gen : -1; +} + +static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { rcu_read_lock(); - u8 gen = *bucket_gen(ca, b); + int ret = bucket_gen_get_rcu(ca, b); rcu_read_unlock(); - return gen; + return ret; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -169,10 +175,8 @@ static inline int gen_after(u8 a, u8 b) static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); - if (!gen) - return -1; - return gen_after(*gen, ptr->gen); + int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); + return gen < 0 ? 
gen : gen_after(gen, ptr->gen); } /** @@ -184,7 +188,6 @@ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr rcu_read_lock(); int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); - return ret; } @@ -344,14 +347,16 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c, } } -#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) +enum bch_reservation_flags { + BCH_DISK_RESERVATION_NOFAIL = 1 << 0, + BCH_DISK_RESERVATION_PARTIAL = 1 << 1, +}; -int __bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - u64, int); +int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *, + u64, enum bch_reservation_flags); static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - u64 sectors, int flags) + u64 sectors, enum bch_reservation_flags flags) { #ifdef __KERNEL__ u64 old, new; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index cbfd88f98472..2182b555c112 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -225,6 +225,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 0); /* We need request_key() to be called before we punt to kthread: */ opt_set(thr->opts, nostart, true); diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1d6b691e8da6..1f8e035d7119 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -14,21 +14,13 @@ static inline bool io_timer_cmp(const void *l, const void *r, void __always_unus return (*_l)->expire < (*_r)->expire; } -static inline void io_timer_swp(void *l, void *r, void __always_unused *args) -{ - struct io_timer **_l = (struct io_timer **)l; - struct io_timer **_r = (struct io_timer **)r; - - swap(*_l, *_r); -} +static const struct min_heap_callbacks callbacks = { + .less = io_timer_cmp, + .swp = NULL, +}; void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { @@ -48,11 +40,6 @@ out: void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; - spin_lock(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) @@ -142,10 +129,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = io_timer_cmp, - .swp = io_timer_swp, - }; if (clock->timers.nr && time_after_eq64(now, clock->timers.data[0]->expire)) { diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index 4f06cd8bbbe1..e86d36d23e9e 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -2,6 +2,7 @@ #include <linux/log2.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include "darray.h" int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) @@ -9,7 +10,19 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); - void *data = kvmalloc_array_noprof(new_size, element_size, gfp); + /* + * This is a workaround: kvmalloc() doesn't support > INT_MAX + * 
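
The new BCH_DISK_RESERVATION_PARTIAL flag above lets __bch2_disk_reservation_add() clamp the request to whatever space is actually available instead of failing outright. A rough sketch of a caller that can make progress with a short reservation (nr_replicas and sectors assumed to be in scope):

struct disk_reservation res = bch2_disk_reservation_init(c, nr_replicas);

int ret = bch2_disk_reservation_add(c, &res, sectors,
				    BCH_DISK_RESERVATION_PARTIAL);
if (ret)
	return ret;		/* nothing could be reserved at all */

/* res.sectors now holds the (possibly clamped) reservation; write at most
 * that much, then release it: */
bch2_disk_reservation_put(c, &res);
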
allocations, but vmalloc() does. + * The limit needs to be lifted from kvmalloc, and when it does + * we'll go back to just using that. + */ + size_t bytes; + if (unlikely(check_mul_overflow(new_size, element_size, &bytes))) + return -ENOMEM; + + void *data = likely(bytes < INT_MAX) + ? kvmalloc_noprof(bytes, gfp) + : vmalloc_noprof(bytes); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 462b1a2fe1ad..8e75a852b358 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc if (ptr2 == ptr) break; + ca = bch2_dev_have_ref(c, ptr2->dev); bucket = PTR_BUCKET_POS(ca, ptr2); bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); } @@ -235,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), ptr); rewrites_found |= 1U << i; } i++; @@ -283,7 +285,8 @@ restart_drop_extra_replicas: durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); goto restart_drop_extra_replicas; } } @@ -294,7 +297,7 @@ restart_drop_extra_replicas: bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(trans, &iter, insert, &should_check_enospc, @@ -557,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct data_update_opts data_opts) + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -568,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, if (ret) return ret; - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + while (data_opts->kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts->kill_ptrs); bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; + data_opts->kill_ptrs ^= 1U << drop; } /* @@ -580,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(n)); + bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); /* * Since we're not inserting through an extent iterator @@ -719,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); goto out; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 8d36365bdea8..e4b50723428e 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *, int bch2_extent_drop_ptrs(struct 
btree_trans *, struct btree_iter *, struct bkey_s_c, - struct data_update_opts); + struct bch_io_opts *, + struct data_update_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 84dd4a879d98..faffc98d5605 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -250,13 +250,6 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, return ret; } -static void dirent_copy_target(struct bkey_i_dirent *dst, - struct bkey_s_c_dirent src) -{ - dst->v.d_inum = src.v->d_inum; - dst->v.d_type = src.v->d_type; -} - int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, struct bkey_s_c_dirent d, subvol_inum *target) { diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 8945145865c5..53ad99666022 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -34,6 +34,13 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); +static inline void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 9f3133e3e7e5..07eb8fa1b026 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k) *p = swab64(*p); } +static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, + struct disk_accounting_pos acc) +{ + unsafe_memcpy(r, &acc.replicas, + replicas_entry_bytes(&acc.replicas), + "variable length struct"); +} + static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; @@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - unsafe_memcpy(r, &acc_k.replicas, - replicas_entry_bytes(&acc_k.replicas), - "variable length struct"); + __accounting_to_replicas(r, acc_k); return true; default: return false; @@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return ret; } +static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + struct disk_accounting_pos acc, + u64 *v, unsigned nr) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0, invalid_dev = -1; + + switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r.e.devs[i])) { + invalid_dev = r.e.devs[i]; + goto invalid_device; + } + + /* + * All replicas entry checks except for invalid device are done + * in bch2_accounting_validate + */ + BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); + + if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n %s", + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping + * and retaking lock is ok: + */ + 
percpu_up_write(&c->mark_lock); + ret = bch2_mark_replicas(c, &r.e); + if (ret) + goto fsck_err; + percpu_down_write(&c->mark_lock); + } + break; + } + + case BCH_DISK_ACCOUNTING_dev_data_type: + if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { + invalid_dev = acc.dev_data_type.dev; + goto invalid_device; + } + break; + } + +fsck_err: + printbuf_exit(&buf); + return ret; +invalid_device: + if (fsck_err(trans, accounting_to_invalid_device, + "accounting entry points to invalid device %i\n %s", + invalid_dev, + (printbuf_reset(&buf), + bch2_accounting_key_to_text(&buf, &acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { + ret = -BCH_ERR_remove_disk_accounting_entry; + } + goto fsck_err; +} + /* * At startup time, initialize the in memory accounting from the btree (and * journal) @@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c) } keys->gap = keys->nr = dst - keys->data; - percpu_down_read(&c->mark_lock); - for (unsigned i = 0; i < acc->k.nr; i++) { - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + percpu_down_write(&c->mark_lock); + unsigned i = 0; + while (i < acc->k.nr) { + unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); - if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters)) - continue; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); - struct bch_replicas_padded r; - if (!accounting_to_replicas(&r.e, acc->k.data[i].pos)) - continue; + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); /* - * If the replicas entry is invalid it'll get cleaned up by - * check_allocations: + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: */ - if (bch2_replicas_entry_validate(&r.e, c, &buf)) + ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, acc_k, + v, acc->k.data[idx].nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(acc->k.data[idx].v[0]); + free_percpu(acc->k.data[idx].v[1]); + darray_remove_item(&acc->k, &acc->k.data[idx]); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + ret = 0; continue; - - struct disk_accounting_pos k; - bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); - - if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), - trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", - (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &k), - buf.buf))) { - /* - * We're not RW yet and still single threaded, dropping - * and retaking lock is ok: - */ - percpu_up_read(&c->mark_lock); - ret = bch2_mark_replicas(c, &r.e); - if (ret) - goto fsck_err; - percpu_down_read(&c->mark_lock); } + + if (ret) + goto fsck_err; + i++; } preempt_disable(); @@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c) } preempt_enable(); fsck_err: - percpu_up_read(&c->mark_lock); + percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); @@ -777,8 +856,10 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc) }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; - int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc)); + int ret = bch2_trans_do(c, ({ + bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0); + })); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 1587c6e1866a..6094afb0c6be 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); + bkey_fsck_err_on(s->csum_granularity_bits >= 64, + c, stripe_csum_granularity_bad, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; @@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, nr_data, s.nr_redundant); bch2_prt_csum_type(out, s.csum_type); - prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + prt_str(out, " gran "); + if (s.csum_granularity_bits < 64) + prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits); + else + prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits); if (s.disk_label) { prt_str(out, " label"); @@ -257,12 +266,12 @@ static int __mark_stripe_bucket(struct btree_trans *trans, if (!deleting) { a->stripe = s.k->p.offset; a->stripe_redundancy = s.v->nr_redundant; + alloc_data_type_set(a, data_type); } else { a->stripe = 0; a->stripe_redundancy = 0; + alloc_data_type_set(a, BCH_DATA_user); } - - alloc_data_type_set(a, data_type); err: printbuf_exit(&buf); return ret; @@ -1048,6 +1057,11 @@ static inline void ec_stripes_heap_swap(void *l, void *r, void *h) ec_stripes_heap_set_backpointer(_h, j); } +static const struct min_heap_callbacks callbacks = { + .less = ec_stripes_heap_cmp, + .swp = ec_stripes_heap_swap, +}; + static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; @@ -1060,11 +1074,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t 
idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); @@ -1075,11 +1084,6 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; - mutex_lock(&c->ec_stripes_heap_lock); BUG_ON(min_heap_full(&c->ec_stripes_heap)); @@ -1098,10 +1102,6 @@ void bch2_stripes_heap_insert(struct bch_fs *c, void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { - const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, - }; ec_stripes_heap *h = &c->ec_stripes_heap; bool do_deletes; size_t i; @@ -1177,7 +1177,7 @@ static void ec_stripe_delete_work(struct work_struct *work) if (!idx) break; - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ec_stripe_delete(trans, idx)); bch_err_fn(c, ret); if (ret) @@ -1197,47 +1197,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c) /* stripe creation: */ static int ec_stripe_key_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - bool create) + struct bkey_i_stripe *old, + struct bkey_i_stripe *new) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; + bool create = !old; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_intent); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { - bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", - create ? "creating" : "updating", - bch2_bkey_types[k.k->type]); + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? 
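
Here, as in clock.c above, the per-function min_heap_callbacks tables are hoisted into a single static definition; clock.c can also drop its swap helper because the generic min_heap code accepts a NULL .swp as "use the default element swap", while the stripes heap keeps its custom swap since it must maintain heap backpointers. A tiny sketch of the NULL-swp pattern with the generic helpers (int_heap, int_less and int_heap_cb are made-up names):

DEFINE_MIN_HEAP(int, int_heap);

static bool int_less(const void *l, const void *r, void *args)
{
	return *(const int *)l < *(const int *)r;
}

static const struct min_heap_callbacks int_heap_cb = {
	.less	= int_less,
	.swp	= NULL,		/* fall back to the built-in swap */
};
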
"creating" : "updating", + bch2_bkey_types[k.k->type])) { ret = -EINVAL; goto err; } if (k.k->type == KEY_TYPE_stripe) { - const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; - unsigned i; + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; - if (old->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + BUG_ON(old->v.nr_blocks != new->v.nr_blocks); + BUG_ON(old->v.nr_blocks != v->nr_blocks); + + for (unsigned i = 0; i < new->v.nr_blocks; i++) { + unsigned sectors = stripe_blockcount_get(v, i); + + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } - for (i = 0; i < new->v.nr_blocks; i++) { - unsigned v = stripe_blockcount_get(old, i); + /* + * If the stripe ptr changed underneath us, it must have + * been dev_remove_stripes() -> * invalidate_stripe_to_dev() + */ + if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) { + BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID); - BUG_ON(v && - (old->ptrs[i].dev != new->v.ptrs[i].dev || - old->ptrs[i].gen != new->v.ptrs[i].gen || - old->ptrs[i].offset != new->v.ptrs[i].offset)); + if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i])) + new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID; + } - stripe_blockcount_set(&new->v, i, v); + stripe_blockcount_set(&new->v, i, sectors); } } @@ -1495,12 +1510,14 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } - ret = bch2_trans_do(c, &s->res, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_key_update(trans, - bkey_i_to_stripe(&s->new_stripe.key), - !s->have_existing_stripe)); + ret = bch2_trans_commit_do(c, &s->res, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, + ec_stripe_key_update(trans, + s->have_existing_stripe + ? 
bkey_i_to_stripe(&s->existing_stripe.key) + : NULL, + bkey_i_to_stripe(&s->new_stripe.key))); bch_err_msg(c, ret, "creating stripe key"); if (ret) { goto err; @@ -1844,6 +1861,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, } h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); + if (!h) { + h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + goto err; + } found: if (h->rw_devs_change_count != c->rw_devs_change_count) ec_stripe_head_devs_update(c, h); @@ -1876,7 +1897,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { - __clear_bit(v->ptrs[i].dev, devs.d); + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to + * walk backpointers and update all extents that point to that + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) nr_have_data++; else diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 60b7875adada..9c4fe5cdbfb7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -83,6 +83,8 @@ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOMEM, ENOMEM_disk_accounting) \ + x(ENOMEM, ENOMEM_stripe_head_alloc) \ + x(ENOMEM, ENOMEM_journal_read_bucket) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -222,6 +224,7 @@ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ @@ -268,7 +271,8 @@ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ - x(0, option_needs_open_fs) + x(0, option_needs_open_fs) \ + x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 7a79f695ba2e..b679def8fb98 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -251,7 +251,10 @@ int __bch2_fsck_err(struct bch_fs *c, * delete the key) * - and we don't need to warn if we're not prompting */ - WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c)); + WARN_ON((flags & FSCK_CAN_FIX) && + !(flags & FSCK_AUTOFIX) && + !trans && + bch2_current_has_btree_trans(c)); if ((flags & FSCK_CAN_FIX) && test_bit(err, c->sb.errors_silent)) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index cc0d22085aef..37e3d69bec06 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, + struct bch_extent_ptr *ptr) +{ + if (!opts->promote_target || + !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + return false; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + + return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); +} + +void 
bch2_extent_ptr_set_cached(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - union bch_extent_entry *ec = NULL; + struct extent_ptr_decoded p; - bkey_extent_entry_for_each(ptrs, entry) { + rcu_read_lock(); + if (!want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr_noerror(k, ptr); + goto out; + } + + /* + * Stripes can't contain cached data, for - reasons. + * + * Possibly something we can fix in the future? + */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (&entry->ptr == ptr) { - ptr->cached = true; - if (ec) - extent_entry_drop(k, ec); - return; + if (p.has_ec) + bch2_bkey_drop_ptr_noerror(k, ptr); + else + ptr->cached = true; + goto out; } - if (extent_entry_is_stripe_ptr(entry)) - ec = entry; - else if (extent_entry_is_ptr(entry)) - ec = NULL; - } - BUG(); +out: + rcu_read_unlock(); } /* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * @@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr) > 0); + (!(ca = bch2_dev_rcu(c, ptr->dev)) || + dev_ptr_stale_rcu(ca, ptr) > 0)); + rcu_read_unlock(); + + return bkey_deleted(k.k); +} + +/* + * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. + * + * Like bch2_extent_normalize(), but also only keeps a single cached pointer on + * the promote target. + */ +bool bch2_extent_normalize_by_opts(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k) +{ + struct bkey_ptrs ptrs; + bool have_cached_ptr; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr(k, ptr); + goto restart_drop_ptrs; + } + have_cached_ptr = true; + } rcu_read_unlock(); return bkey_deleted(k.k); @@ -1310,7 +1364,7 @@ void bch2_ptr_swab(struct bkey_s k) for (entry = ptrs.start; entry < ptrs.end; entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { + switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32: @@ -1330,6 +1384,9 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_rebalance: break; + default: + /* Bad entry type: will be caught by validate() */ + return; } } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ed5001dd662e..bcffcf60aaaf 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -686,15 +686,28 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); +void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, + struct bkey_s, struct bch_extent_ptr *); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct 
bch_fs *, struct bkey_s_c, enum bch_validate_flags); +static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +{ + return (ptr1.cached == ptr2.cached && + ptr1.unwritten == ptr2.unwritten && + ptr1.offset == ptr2.offset && + ptr1.dev == ptr2.dev && + ptr1.dev == ptr2.dev); +} + void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 48a1ab9a649b..95972809e76d 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -856,6 +856,12 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, folios_trunc(&fs, fi); end = min(end, folio_end_pos(darray_last(fs))); } else { + if (!folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + folios_trunc(&fs, fi + 1); end = f_pos + f_reserved; } diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index ee1c0325f313..6d3a05ae5da8 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) static __always_inline long bch2_dio_write_done(struct dio_write *dio) { + struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; bool sync = dio->sync; @@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write); + bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index af3a24546aa3..1d4910ea0f1d 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -399,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c, bch2_quota_reservation_put(c, inode, &res->quota); } -int bch2_folio_reservation_get(struct bch_fs *c, +static int __bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, - size_t offset, size_t len) + size_t offset, size_t len, + bool partial) { struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; + struct disk_reservation disk_res = {}; + size_t reserved = len; int ret; if (!s) @@ -422,48 +425,65 @@ int bch2_folio_reservation_get(struct bch_fs *c, } if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors, + partial ? 
BCH_DISK_RESERVATION_PARTIAL : 0); if (unlikely(ret)) return ret; + + if (unlikely(disk_res.sectors != disk_sectors)) { + disk_sectors = quota_sectors = 0; + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); + if (disk_sectors > disk_res.sectors) { + /* + * Make sure to get a reservation that's + * aligned to the filesystem blocksize: + */ + unsigned reserved_offset = round_down(i << 9, block_bytes(c)); + reserved = clamp(reserved_offset, offset, offset + len) - offset; + + if (!reserved) { + bch2_disk_reservation_put(c, &disk_res); + return -BCH_ERR_ENOSPC_disk_reservation; + } + break; + } + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + } } if (quota_sectors) { ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true); if (unlikely(ret)) { - struct disk_reservation tmp = { .sectors = disk_sectors }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; + bch2_disk_reservation_put(c, &disk_res); return ret; } } - return 0; + res->disk.sectors += disk_res.sectors; + return partial ? reserved : 0; } -ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, +int bch2_folio_reservation_get(struct bch_fs *c, struct bch_inode_info *inode, struct folio *folio, struct bch2_folio_reservation *res, size_t offset, size_t len) { - size_t l, reserved = 0; - int ret; - - while ((l = len - reserved)) { - while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) { - if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c)) - return reserved ?: ret; - - len = reserved + l; - l /= 2; - } - - offset += l; - reserved += l; - } + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false); +} - return reserved; +ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + size_t offset, size_t len) +{ + return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true); } static void bch2_clear_folio_bits(struct folio *folio) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 71d0fa387509..2456c41b215e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -182,7 +182,7 @@ static int bch2_flush_inode(struct bch_fs *c, struct bch_inode_unpacked u; int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; @@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, POS(inode->v.i_ino, start_sector), BTREE_ITER_slots|BTREE_ITER_intent); - while (!ret && bkey_lt(iter.pos, end_pos)) { + while (!ret) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; @@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); + if (bkey_ge(iter.pos, end_pos)) + break; + ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) @@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, - opts.data_replicas, true)) + opts.data_replicas, true)) { ret = drop_locks_do(trans, 
(bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, false), 0)); + if (ret) + goto bkey_err; + } bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) @@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) - drop_locks_do(trans, + iter.pos.offset, true)) { + ret = drop_locks_do(trans, bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, false)); + if (ret) + goto bkey_err; + } bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5bfc26d58270..a41d0d8a2f7b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) return a.subvol == b.subvol && a.inum == b.inum; } +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + + return jhash(&inum->inum, sizeof(inum->inum), seed); +} + +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +{ + const struct bch_inode_info *inode = data; + + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); +} + static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, ei_inum), .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) +{ + struct bch_fs *c = trans->c; + struct rhashtable *ht = &c->vfs_inodes_table; + subvol_inum inum = (subvol_inum) { .inum = p.offset }; + DARRAY(u32) subvols; + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return false; + + darray_init(&subvols); +restart_from_top: + + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. + * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum.inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. 
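The open-coded lookup in bch2_inode_or_descendents_is_open() above only works because the table's hash function covers the inode number alone: every cached inode with the same inum, in any subvolume, deliberately collides into the same bucket, so a single bucket walk finds them all. A minimal, self-contained sketch of that bucket-sharing idea follows; the toy_inode table and helpers are invented for illustration and are not the kernel's rhashtable API.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the VFS inode hash: entries are keyed on (subvol, inum),
 * but the hash function deliberately covers only the inode number, so every
 * open copy of the same inode - whatever the subvolume - lands in one bucket. */
struct toy_inode {
	uint32_t subvol;
	uint64_t inum;
	struct toy_inode *next;		/* bucket chain */
};

#define NBUCKETS 64
static struct toy_inode *table[NBUCKETS];

static unsigned hash_inum_only(uint64_t inum)
{
	return (unsigned)((inum * 0x9e3779b97f4a7c15ull) >> 58) % NBUCKETS;
}

static void toy_insert(struct toy_inode *ino)
{
	unsigned b = hash_inum_only(ino->inum);
	ino->next = table[b];
	table[b] = ino;
}

/* Walk the single bucket and report every subvolume that has @inum open;
 * this mirrors the "collect subvols, then check snapshot ancestry" step. */
static void subvols_with_inum_open(uint64_t inum)
{
	for (struct toy_inode *i = table[hash_inum_only(inum)]; i; i = i->next)
		if (i->inum == inum)
			printf("inum %llu open in subvol %u\n",
			       (unsigned long long)inum, i->subvol);
}

int main(void)
{
	struct toy_inode a = { .subvol = 1, .inum = 4096 };
	struct toy_inode b = { .subvol = 7, .inum = 4096 };

	toy_insert(&a);
	toy_insert(&b);
	subvols_with_inum_open(4096);	/* finds both entries in one bucket */
	return 0;
}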
+ */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +err: + darray_exit(&subvols); + return ret; +} + +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } @@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c, subvol_inum inum) { wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wait; + wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->v.i_lock); @@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); @@ -300,10 +396,10 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) BUG(); } -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) { struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, - bch2_inode_cache, GFP_NOFS); + bch2_inode_cache, gfp); if (!inode) return NULL; @@ -315,7 +411,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { kmem_cache_free(bch2_inode_cache, inode); return NULL; } @@ -328,12 +424,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { - struct bch_inode_info *inode = - memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, - __bch2_new_inode(trans->c)); + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 
0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); @@ -407,7 +501,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = __bch2_new_inode(c); + inode = __bch2_new_inode(c, GFP_NOFS); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; @@ -562,7 +656,7 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct bch_inode_info *inode; - bch2_trans_do(c, NULL, NULL, 0, + bch2_trans_do(c, PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), &hash, &dentry->d_name))); if (IS_ERR(inode)) @@ -775,7 +869,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) - goto err; + goto err_tx_restart; if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, src_inode, @@ -1172,7 +1266,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(ei->v.i_ino, start), 0); - while (true) { + while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1180,14 +1274,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); if (ret) - goto err; + continue; bch2_btree_iter_set_snapshot(&iter, snapshot); k = bch2_btree_iter_peek_upto(&iter, end); ret = bkey_err(k); if (ret) - goto err; + continue; if (!k.k) break; @@ -1207,7 +1301,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &cur); if (ret) - break; + continue; k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); @@ -1235,10 +1329,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); -err: - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; } bch2_trans_iter_exit(trans, &iter); @@ -1946,7 +2036,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); printbuf_nul_terminate(&buf); - seq_puts(seq, buf.buf); + seq_printf(seq, ",%s", buf.buf); int ret = buf.allocation_failure ? 
-ENOMEM : 0; printbuf_exit(&buf); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index da74ecc236e7..59f9f7ae728d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) return inode->ei_inum; } -struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); - /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -148,6 +146,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -198,10 +198,7 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) -static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) -{ - return NULL; -} +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8a6ceb0cc7a..75c8a97a6954 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -326,17 +326,54 @@ err: return ret; } +static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) +{ + if (inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) + return false; + + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); +} + +static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, + SPOS(d_pos.inode, d_pos.offset, snapshot), + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bpos_eq(k.k->p, d_pos)) { + /* + * delet_at() doesn't work because the update path doesn't + * internally use BTREE_ITER_with_updates yet + */ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct bch_fs *c = trans->c; - struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; - struct qstr name; - u64 dir_offset = 0; - u32 dirent_snapshot = inode->bi_snapshot; int ret; + u32 dirent_snapshot = inode->bi_snapshot; if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; @@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; - dir_hash = bch2_hash_info_init(c, &lostfound); + struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound); + struct qstr name = (struct qstr) QSTR(name_buf); - name = (struct qstr) QSTR(name_buf); + inode->bi_dir = lostfound.bi_inum; ret = bch2_dirent_create_snapshot(trans, inode->bi_parent_subvol, lostfound.bi_inum, @@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * inode_d_type(inode), &name, inode->bi_subvol ?: 
inode->bi_inum, - &dir_offset, + &inode->bi_dir_offset, STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; + ret = __bch2_fsck_write_inode(trans, inode); + if (ret) + return ret; + + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit + * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { + snapshot_id_list whiteouts_done; + struct btree_iter iter; + struct bkey_s_c k; + + darray_init(&whiteouts_done); + + for_each_btree_key_reverse_norestart(trans, iter, + BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1), + BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bkey_is_inode(k.k) || + !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) || + snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot)) + continue; + + struct bch_inode_unpacked child_inode; + bch2_inode_unpack(k, &child_inode); - return __bch2_fsck_write_inode(trans, inode); + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, + SPOS(lostfound.bi_inum, inode->bi_dir_offset, + dirent_snapshot), + k.k->p.snapshot); + if (ret) + break; + + ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) + break; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; + child_inode.bi_dir_offset = inode->bi_dir_offset; + + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + break; + } + } + darray_exit(&whiteouts_done); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; } static int remove_backpointer(struct btree_trans *trans, @@ -838,35 +929,138 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int hash_redo_key(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c k) +static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) { - struct bkey_i *delete; - struct bkey_i *tmp; + if (d.v->d_type == DT_SUBVOL) { + u32 snap; + u64 inum; + int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); - if (IS_ERR(delete)) - return PTR_ERR(delete); + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} - tmp = bch2_bkey_make_mut_noupdate(trans, k); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); +/* + * Prefer to delete the first one, since that will be the one at the wrong + * offset: + * return value: 0 -> delete k1, 1 -> delete k2 + */ +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; - bkey_init(&delete->k); - delete->k.p = k_iter->pos; - return bch2_btree_iter_traverse(k_iter) 
?: - bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_in_snapshot(trans, desc, hash_info, - (subvol_inum) { 0, k.k->p.inode }, - k.k->p.snapshot, tmp, - STR_HASH_must_create| - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +static int fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) +{ + if (new->k.type != KEY_TYPE_dirent) + return 0; + + struct bkey_i_dirent *d = bkey_i_to_dirent(new); + struct inode_walker target = inode_walker_init(); + int ret = 0; + + if (d->v.d_type == DT_SUBVOL) { + BUG(); + } else { + ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); + if (ret) + goto err; + + darray_for_each(target.inodes, i) { + i->inode.bi_dir_offset = d->k.p.offset; + ret = __bch2_fsck_write_inode(trans, &i->inode); + if (ret) + goto err; + } + } +err: + inode_walker_exit(&target); + return ret; +} + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } static int hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c hash_k) @@ -895,16 +1089,9 @@ static int hash_check_key(struct btree_trans *trans, if (bkey_eq(k.k->p, hash_k.k->p)) break; - if (fsck_err_on(k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k), - trans, hash_table_key_duplicate, - "duplicate hash table keys:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - buf.buf))) { - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; - break; - } + if (k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k)) + goto duplicate_entries; if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); @@ -917,18 +1104,66 @@ out: return ret; bad_hash: if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, 
hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - bch_err_fn(c, ret); + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); if (ret) - return ret; - ret = -BCH_ERR_transaction_restart_nested; + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } fsck_err: goto out; +duplicate_entries: + ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; } static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, @@ -994,7 +1229,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans, */ inode->bi_dir = 0; inode->bi_dir_offset = 0; - inode->bi_flags &= ~BCH_INODE_backptr_untrusted; *write_inode = true; } @@ -1006,28 +1240,37 @@ fsck_err: return ret; } -static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) +static int get_snapshot_root_inode(struct btree_trans *trans, + struct bch_inode_unpacked *root, + u64 inum) { - subvol_inum inum = { - .subvol = snapshot_t(c, p.snapshot)->subvol, - .inum = p.offset, - }; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; - /* snapshot tree corruption, can't safely delete */ - if (!inum.subvol) { - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); - return true; + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found_root; } - - return __bch2_inode_hash_find(c, inum) != NULL; + if (ret) + goto err; + BUG(); +found_root: + BUG_ON(bch2_inode_unpack(k, root)); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct bch_inode_unpacked *prev, - struct snapshots_seen *s, - bool full) + struct bch_inode_unpacked *snapshot_root, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -1050,22 +1293,19 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); - if (!full && - 
!(u.bi_flags & (BCH_INODE_i_size_dirty| - BCH_INODE_i_sectors_dirty| - BCH_INODE_unlinked))) - return 0; - - if (prev->bi_inum != u.bi_inum) - *prev = u; + if (snapshot_root->bi_inum != u.bi_inum) { + ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + if (ret) + goto err; + } - if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || - inode_d_type(prev) != inode_d_type(&u), + if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { - bch_err(c, "repair not implemented yet"); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err_noprint; + u.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); + do_update = true; } if (u.bi_dir || u.bi_dir_offset) { @@ -1101,28 +1341,27 @@ static int check_inode(struct btree_trans *trans, ret = 0; } - if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && - bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); - if (ret) - goto err; - - u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, &u); + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; - bch_err_msg(c, ret, "in fsck updating inode"); + if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot), + trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be %u)\n%s", + ret, + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { if (ret) - goto err_noprint; - - if (!bpos_eq(new_min_pos, POS_MIN)) - bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); - goto err_noprint; + u.bi_flags |= BCH_INODE_has_child_snapshot; + else + u.bi_flags &= ~BCH_INODE_has_child_snapshot; + do_update = true; } + ret = 0; - if (u.bi_flags & BCH_INODE_unlinked) { + if ((u.bi_flags & BCH_INODE_unlinked) && + !(u.bi_flags & BCH_INODE_has_child_snapshot)) { if (!test_bit(BCH_FS_started, &c->flags)) { /* * If we're not in online fsck, don't delete unlinked @@ -1147,7 +1386,11 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; } else { - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, "inode %llu%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { @@ -1155,69 +1398,10 @@ static int check_inode(struct btree_trans *trans, bch_err_msg(c, ret, "in fsck deleting inode"); goto err_noprint; } + ret = 0; } } - /* i_size_dirty is vestigal, since we now have logged ops for truncate * */ - if (u.bi_flags & BCH_INODE_i_size_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_size_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_size dirty", - u.bi_inum))) { - bch_verbose(c, "truncating inode %llu", u.bi_inum); - - /* - * XXX: need to truncate partial blocks too here - or ideally - * just switch units to bytes and that issue goes away - */ - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, - iter->pos.snapshot), - POS(u.bi_inum, U64_MAX), - 0, NULL); - bch_err_msg(c, ret, "in fsck truncating inode"); - if (ret) - return ret; - - /* - * We truncated without our 
normal sector accounting hook, just - * make sure we recalculate it: - */ - u.bi_flags |= BCH_INODE_i_sectors_dirty; - - u.bi_flags &= ~BCH_INODE_i_size_dirty; - do_update = true; - } - - /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */ - if (u.bi_flags & BCH_INODE_i_sectors_dirty && - (!test_bit(BCH_FS_clean_recovery, &c->flags) || - fsck_err(trans, inode_i_sectors_dirty_but_clean, - "filesystem marked clean, but inode %llu has i_sectors dirty", - u.bi_inum))) { - s64 sectors; - - bch_verbose(c, "recounting sectors for inode %llu", - u.bi_inum); - - sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); - if (sectors < 0) { - bch_err_msg(c, sectors, "in fsck recounting inode sectors"); - return sectors; - } - - u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_i_sectors_dirty; - do_update = true; - } - - if (u.bi_flags & BCH_INODE_backptr_untrusted) { - u.bi_dir = 0; - u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_backptr_untrusted; - do_update = true; - } - if (fsck_err_on(u.bi_parent_subvol && (u.bi_subvol == 0 || u.bi_subvol == BCACHEFS_ROOT_SUBVOL), @@ -1274,8 +1458,7 @@ err_noprint: int bch2_check_inodes(struct bch_fs *c) { - bool full = c->opts.fsck; - struct bch_inode_unpacked prev = { 0 }; + struct bch_inode_unpacked snapshot_root = {}; struct snapshots_seen s; snapshots_seen_init(&s); @@ -1285,13 +1468,104 @@ int bch2_check_inodes(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_inode(trans, &iter, k, &prev, &s, full))); + check_inode(trans, &iter, k, &snapshot_root, &s))); snapshots_seen_exit(&s); bch_err_fn(c, ret); return ret; } +static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + /* + * We look for inodes to reattach in natural key order, leaves first, + * but we should do the reattach at the oldest version that needs to be + * reattached: + */ + for_each_btree_key_norestart(trans, iter, + BTREE_ID_inodes, + SPOS(0, inode->bi_inum, inode->bi_snapshot + 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode->bi_inum) + break; + + if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot)) + continue; + + if (!bkey_is_inode(k.k)) + break; + + struct bch_inode_unpacked parent_inode; + bch2_inode_unpack(k, &parent_inode); + + if (!inode_should_reattach(&parent_inode)) + break; + + *inode = parent_inode; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!bkey_is_inode(k.k)) + return 0; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + if (!inode_should_reattach(&inode)) + return 0; + + ret = find_oldest_inode_needs_reattach(trans, &inode); + if (ret) + return ret; + + if (fsck_err(trans, inode_unreachable, + "unreachable inode:\n%s", + (bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) + ret = reattach_inode(trans, &inode); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * Reattach unreachable (but not unlinked) inodes + * + * Run after check_inodes() and check_dirents(), so we node that inode + * backpointer fields point to valid dirents, and every inode that has a dirent + * that points to it has its backpointer field set - so we're just looking 
for + * non-unlinked inodes without backpointers: + * + * XXX: this is racy w.r.t. hardlink removal in online fsck + */ +int bch2_check_unreachable_inodes(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_unreachable_inode(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) { switch (btree) { @@ -1694,8 +1968,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", @@ -2207,7 +2480,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); + ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2321,7 +2594,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); + ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -2409,7 +2682,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2450,22 +2723,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (ret) break; - /* - * We've checked that inode backpointers point to valid dirents; - * here, it's sufficient to check that the subvolume root has a - * dirent: - */ - if (fsck_err_on(!subvol_root.bi_dir, - trans, subvol_unreachable, - "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, &subvol_root), - buf.buf))) { - ret = reattach_subvol(trans, s); - break; - } - u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { @@ -2526,12 +2783,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -/* - * Check that a given inode is reachable from its subvolume root - we already - * verified subvolume connectivity: - * - * XXX: we should also be verifying that inodes are in the right subvolumes - */ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; @@ -2545,6 +2796,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino BUG_ON(bch2_inode_unpack(inode_k, &inode)); + if (!S_ISDIR(inode.bi_mode)) + return 0; + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2559,21 +2813,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct 
bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); if (bch2_err_matches(ret, ENOENT)) { - ret = 0; - if (fsck_err(trans, inode_unreachable, - "unreachable inode\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, inode_k), - buf.buf))) - ret = reattach_inode(trans, &inode); + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, inode_k); + bch_err(c, "unreachable inode in check_directory_structure: %s\n%s", + bch2_err_str(ret), buf.buf); goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode.bi_mode)) - break; - ret = darray_push(p, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, @@ -2626,9 +2874,8 @@ fsck_err: } /* - * Check for unreachable inodes, as well as loops in the directory structure: - * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's - * unreachable: + * Check for loops in the directory structure: all other connectivity issues + * have been fixed by prior passes */ int bch2_check_directory_structure(struct bch_fs *c) { @@ -2756,6 +3003,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, if (S_ISDIR(u.bi_mode)) continue; + /* + * Previous passes ensured that bi_nlink is nonzero if + * it had multiple hardlinks: + */ if (!u.bi_nlink) continue; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index a4ef94271784..1cca31011530 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); int bch2_check_subvolume_structure(struct bch_fs *); +int bch2_check_unreachable_inodes(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3e5bc01961b8..039cb7a22244 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "extent_update.h" +#include "fs.h" #include "inode.h" #include "str_hash.h" #include "snapshot.h" @@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = { }; #undef x +static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -160,8 +163,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, unsigned fieldnr = 0, field_bits; int ret; -#define x(_name, _bits) \ - if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ +#define x(_name, _bits) \ + if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ memset((void *) unpacked + offset, 0, \ sizeof(*unpacked) - offset); \ @@ -280,6 +283,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); + unpacked->bi_snapshot = k.k->p.snapshot; + switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -290,10 +295,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - if (INODE_NEW_VARINT(inode.v)) { + if (INODEv1_NEW_VARINT(inode.v)) { return bch2_inode_unpack_v2(unpacked, inode.v->fields, bkey_val_end(inode), - INODE_NR_FIELDS(inode.v)); + INODEv1_NR_FIELDS(inode.v)); } else { return bch2_inode_unpack_v1(inode, unpacked); } @@ -468,10 
+473,10 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", - INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); ret = __bch2_inode_validate(c, k, flags); fsck_err: @@ -530,6 +535,10 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "(%x)\n", inode->bi_flags); prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq); + prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed); + prt_printf(out, "hash_type="); + bch2_prt_str_hash_type(out, INODE_STR_HASH(inode)); + prt_newline(out); prt_printf(out, "bi_size=%llu\n", inode->bi_size); prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors); prt_printf(out, "bi_version=%llu\n", inode->bi_version); @@ -575,9 +584,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) } } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f); + return; + case KEY_TYPE_inode_v2: + bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f); + return; + case KEY_TYPE_inode_v3: + bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f); + return; + default: + BUG(); + } +} + +static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) { - return bkey_inode_flags(k) & BCH_INODE_unlinked; + unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + + return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); +} + +static struct bkey_s_c +bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + + bch2_trans_iter_exit(trans, iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +static struct bkey_s_c +bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos, unsigned flags) +{ + struct bkey_s_c k; +again: + k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags); + if (!k.k || + bkey_err(k) || + bkey_is_inode(k.k)) + return k; + + bch2_trans_iter_exit(trans, iter); + pos = k.k->p; + goto again; +} + +int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_upto_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_with_updates, k, ret) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) && + bkey_is_inode(k.k)) { + ret = 1; + break; + } + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int update_inode_has_children(struct btree_trans *trans, + struct bkey_s k, + bool have_child) +{ + if (!have_child) { + int ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) + return ret < 0 ? 
ret : 0; + } + + u64 f = bkey_inode_flags(k.s_c); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) + bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot); + + return 0; +} + +static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos, + bool have_child) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans, + &iter, pos, BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + if (!have_child) { + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret) { + ret = ret < 0 ? ret : 0; + goto err; + } + } + + u64 f = bkey_inode_flags(k); + if (have_child != !!(f & BCH_INODE_has_child_snapshot)) { + struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } int bch2_trigger_inode(struct btree_trans *trans, @@ -586,6 +723,8 @@ int bch2_trigger_inode(struct btree_trans *trans, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { BUG_ON(!trans->journal_res.seq); bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); @@ -599,13 +738,41 @@ int bch2_trigger_inode(struct btree_trans *trans, return ret; } - int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) - - (int) bkey_is_deleted_inode(old); - if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) { - int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, - new.k->p, deleted_delta > 0); - if (ret) - return ret; + if (flags & BTREE_TRIGGER_transactional) { + int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) - + (int) bkey_is_unlinked_inode(old); + if (unlinked_delta) { + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, unlinked_delta > 0); + if (ret) + return ret; + } + + /* + * If we're creating or deleting an inode at this snapshot ID, + * and there might be an inode in a parent snapshot ID, we might + * need to set or clear the has_child_snapshot flag on the + * parent. 
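update_inode_has_children() and update_parent_inode_has_children() above only touch the key when the stored flag disagrees with the state that was just computed, flipping the single bit with an XOR. A small self-contained sketch of that toggle-on-mismatch pattern, assuming an illustrative flag constant rather than the real BCH_INODE_has_child_snapshot encoding:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_CHILD_SNAPSHOT (1u << 0)	/* illustrative flag bit */

/* Bring a cached flag in line with reality: XOR flips the bit only when the
 * desired state and the stored state disagree, so an already-correct field
 * is left untouched (and no update needs to be emitted for it). */
static uint32_t sync_child_flag(uint32_t flags, bool have_child)
{
	if (have_child != !!(flags & HAS_CHILD_SNAPSHOT))
		flags ^= HAS_CHILD_SNAPSHOT;
	return flags;
}

int main(void)
{
	printf("%x\n", (unsigned)sync_child_flag(0, true));			/* sets bit   */
	printf("%x\n", (unsigned)sync_child_flag(HAS_CHILD_SNAPSHOT, true));	/* unchanged  */
	printf("%x\n", (unsigned)sync_child_flag(HAS_CHILD_SNAPSHOT, false));	/* clears bit */
	return 0;
}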
+ */ + int deleted_delta = (int) bkey_is_inode(new.k) - + (int) bkey_is_inode(old.k); + if (deleted_delta && + bch2_snapshot_parent(c, new.k->p.snapshot)) { + int ret = update_parent_inode_has_children(trans, new.k->p, + deleted_delta > 0); + if (ret) + return ret; + } + + /* + * When an inode is first updated in a new snapshot, we may need + * to clear has_child_snapshot + */ + if (deleted_delta > 0) { + int ret = update_inode_has_children(trans, new, false); + if (ret) + return ret; + } } return 0; @@ -639,10 +806,8 @@ void bch2_inode_init_early(struct bch_fs *c, memset(inode_u, 0, sizeof(*inode_u)); - /* ick */ - inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, - sizeof(inode_u->bi_hash_seed)); + SET_INODE_STR_HASH(inode_u, str_hash); + get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, @@ -888,6 +1053,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (ret) + goto err2; + + ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +err2: bch2_trans_put(trans); return ret; } @@ -921,8 +1091,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans, int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(trans, inum, inode)); + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); } int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) @@ -992,7 +1161,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } -int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; @@ -1055,6 +1224,45 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } +/* + * After deleting an inode, there may be versions in older snapshots that should + * also be deleted - if they're not referenced by sibling snapshots and not open + * in other subvolumes: + */ +static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; +next_parent: + ret = lockrestart_do(trans, + bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0))); + if (ret || !k.k) + return ret; + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; + bch2_trans_iter_exit(trans, &iter); + + if (!unlinked) + return 0; + + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? 
ret : 0; + + ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); + if (ret) + return ret; + goto next_parent; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); +} + static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos, @@ -1064,6 +1272,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; int ret; k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); @@ -1099,6 +1308,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans, pos.offset, pos.snapshot)) goto delete; + if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto out; + + if (ret) { + if (fsck_err(trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &inode), + buf.buf))) { + inode.bi_flags |= BCH_INODE_has_child_snapshot; + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto out; + } + goto delete; + + } + if (test_bit(BCH_FS_clean_recovery, &c->flags) && !fsck_err(trans, deleted_inode_but_clean, "filesystem marked as clean but have deleted inode %llu:%u", @@ -1107,33 +1341,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { - struct bpos new_min_pos; - - ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); - if (ret) - goto out; - - inode.bi_flags &= ~BCH_INODE_unlinked; - - ret = bch2_inode_write_flags(trans, &inode_iter, &inode, - BTREE_UPDATE_internal_snapshot_node); - bch_err_msg(c, ret, "clearing inode unlinked flag"); - if (ret) - goto out; - - /* - * We'll need another write buffer flush to pick up the new - * unlinked inodes in the snapshot leaves: - */ - *need_another_pass = true; - goto out; - } - ret = 1; out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); return ret; delete: ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 9c1f67705684..eab82b5eb897 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -5,6 +5,7 @@ #include "bkey.h" #include "bkey_methods.h" #include "opts.h" +#include "snapshot.h" enum bch_validate_flags; extern const char * const bch2_inode_opts[]; @@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); + +static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) +{ + return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0 + ? 
__bch2_inode_has_child_snapshots(trans, pos) + : 0; +} + int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); @@ -82,6 +92,7 @@ struct bch_inode_unpacked { BCH_INODE_FIELDS_v3() #undef x }; +BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24); struct bkey_inode_buf { struct bkey_i_inode_v3 inode; diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 83d107331edf..7928d0c6954f 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -133,7 +133,8 @@ enum inode_opt_id { x(i_size_dirty, 5) \ x(i_sectors_dirty, 6) \ x(unlinked, 7) \ - x(backptr_untrusted, 8) + x(backptr_untrusted, 8) \ + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ @@ -149,9 +150,9 @@ enum __bch_inode_flags { #undef x }; -LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32); LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 307ed0a45184..f283051758d6 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -377,7 +377,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, * check for missing subvolume before fpunch, as in resume we don't want * it to be a fatal error */ - ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors); + ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors)); if (ret) return ret; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index e4fc17c548fd..b3b934a87c6d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -262,7 +262,8 @@ err: bio_free_pages(&(*rbio)->bio); kfree(*rbio); *rbio = NULL; - kfree(op); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -409,8 +410,8 @@ retry: bch2_trans_begin(trans); rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) goto err; bch2_bkey_buf_reassemble(&sk, c, k); @@ -557,8 +558,8 @@ out: static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); + bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio)); } /* Inner part that may run in process context */ @@ -802,16 +803,15 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - u8 *gen = bucket_gen(ca, iter.pos.offset); - if (gen) { - + int gen = bucket_gen_get(ca, iter.pos.offset); + if (gen >= 0) { prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - prt_printf(&buf, "memory gen: %u", *gen); + prt_printf(&buf, "memory gen: %u", gen); ret = 
lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret) { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index b5fe9e0dc155..96720adcfee0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1300,11 +1300,8 @@ retry: bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); - rcu_read_lock(); - u8 *gen = bucket_gen(ca, i->b.offset); - stale = !gen ? -1 : gen_after(*gen, i->gen); - rcu_read_unlock(); - + int gen = bucket_gen_get(ca, i->b.offset); + stale = gen < 0 ? gen : gen_after(gen, i->gen); if (unlikely(stale)) { stale_at = i; goto err_bucket_stale; @@ -1437,7 +1434,7 @@ again: * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), @@ -1447,7 +1444,7 @@ again: op->nr_replicas_required, op->watermark, op->flags, - &op->cl, &wp)); + &op->cl, &wp))); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index f5f7db50ca31..2dc0d60c1745 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, { int ret; + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), + HZ * 10)) + return ret; + + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", + buf.buf); + printbuf_exit(&buf); + closure_wait_event(&j->async_wait, (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || (flags & JOURNAL_RES_GET_NONBLOCK)); @@ -745,7 +758,7 @@ out: return ret; } -int bch2_journal_flush_seq(struct journal *j, u64 seq) +int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state) { u64 start_time = local_clock(); int ret, ret2; @@ -756,7 +769,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) if (seq <= j->flushed_seq_ondisk) return 0; - ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + ret = wait_event_state(j->wait, + (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)), + task_state); if (!ret) bch2_time_stats_update(j->flush_seq_time, start_time); @@ -775,7 +790,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent) int bch2_journal_flush(struct journal *j) { - return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); + return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE); } /* @@ -838,7 +853,7 @@ int bch2_journal_meta(struct journal *j) bch2_journal_res_put(j, &res); - return bch2_journal_flush_seq(j, res.seq); + return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } /* block/unlock the journal: */ diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 377a3750406e..2762be6f9814 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -401,7 +401,7 @@ void bch2_journal_entry_res_resize(struct journal *, int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -int bch2_journal_flush_seq(struct journal *, u64); +int 
bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 954f6a96e0f4..fb35dd336331 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -708,6 +708,9 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + if (vstruct_bytes(entry) < sizeof(*u)) + return; + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); printbuf_indent_add(out, 2); @@ -1012,6 +1015,8 @@ reread: nr_bvecs = buf_pages(buf->data, sectors_read << 9); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_journal_read_bucket; bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8c456d8b8b99..0ef4a86850bb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas) { if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 232be8a44051..0e2ee262fbd4 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -63,7 +63,7 @@ const char * const bch2_compression_opts[] = { NULL }; -const char * const bch2_str_hash_types[] = { +const char * const __bch2_str_hash_types[] = { BCH_STR_HASH_TYPES() NULL }; @@ -115,6 +115,7 @@ PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); +PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, struct printbuf *err) @@ -225,7 +226,7 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices), \ + .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ .choices = _choices #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ @@ -427,7 +428,9 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (flags & OPT_SHOW_FULL_LIST) + if (v < opt->min || v >= opt->max) + prt_printf(out, "(invalid option %lli)", v); + else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else prt_str(out, opt->choices[v]); @@ -594,6 +597,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { + if (!*opt) + continue; + name = strsep(&opt, "="); val = opt; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index cb2e244a2429..23dda014e331 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; -extern const 
char * const bch2_str_hash_types[]; +extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; @@ -29,6 +29,7 @@ void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); +void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); static inline const char *bch2_d_type_str(unsigned d_type) { diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index c32a05e252e2..74f45a8162ad 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -869,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_set_quota_trans(trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2d299a37cf07..cd6647374353 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -70,7 +70,9 @@ err: int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + int ret = bch2_trans_commit_do(c, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_lazy_rw, __bch2_set_rebalance_needs_scan(trans, inum)); rebalance_wakeup(c); return ret; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 6db72d3bad7d..3c7f941dde39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); @@ -287,7 +286,8 @@ int bch2_journal_replay(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| - BCH_TRANS_COMMIT_no_journal_res, + BCH_TRANS_COMMIT_no_journal_res| + BCH_WATERMARK_reclaim, bch2_journal_replay_accounting_key(trans, k)); if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) goto err; @@ -862,6 +862,13 @@ use_clean: if (ret) goto err; + /* + * Normally set by the appropriate recovery pass: when cleared, this + * indicates we're in early recovery and btree updates should be done by + * being applied to the journal replay keys. _Must_ be cleared before + * multithreaded use: + */ + set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); /* in case we don't run journal replay, i.e. 
norecovery mode */ @@ -1001,6 +1008,7 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); + struct bch_member *m; int ret; bch_notice(c, "initializing new filesystem"); @@ -1017,6 +1025,14 @@ int bch2_fs_initialize(struct bch_fs *c) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } + + for_each_member_device(c, ca) { + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); + ca->mi = bch2_mi_to_cpu(m); + } + + bch2_write_super(c); mutex_unlock(&c->sb_lock); c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; @@ -1090,7 +1106,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init_early(c, &lostfound_inode); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_create_trans(trans, BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 735b8adc8f9d..dff589ddc984 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -27,6 +27,12 @@ const char * const bch2_recovery_passes[] = { NULL }; +/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ +static int bch2_recovery_pass_empty(struct bch_fs *c) +{ + return 0; +} + static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -221,6 +227,12 @@ int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { if (c->opts.recovery_pass_last && c->curr_recovery_pass > c->opts.recovery_pass_last) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 50406ce0e4ef..94dc20ca2065 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ @@ -46,6 +47,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index bcb3276747e0..477ef0997949 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } -static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r, - struct bch_sb *sb, - struct printbuf *err) +static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r, + struct bch_sb *sb, + struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); @@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_fs *c, struct printbuf *err) { - mutex_lock(&c->sb_lock); - int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err); - 
mutex_unlock(&c->sb_lock); - return ret; + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (r->devs[i] != BCH_SB_MEMBER_INVALID && + !bch2_dev_exists(c, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); - int ret = bch2_replicas_entry_validate_locked(e, sb, err); + int ret = bch2_replicas_entry_sb_validate(e, sb, err); if (ret) return ret; @@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, rcu_read_lock(); for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } + nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 5102059a0f1d..8767c33c2b51 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -78,7 +78,10 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -140,6 +143,9 @@ UPGRADE_TABLE() static int have_stripes(struct bch_fs *c) { + if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) + return 0; + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4135b1ea2fec..9feb6739f77a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -136,7 +136,9 @@ enum bch_fsck_flags { x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ x(need_discard_freespace_key_bad, 124, 0) \ + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ x(backpointer_to_missing_ptr, 128, 0) \ @@ -177,9 +179,12 @@ enum bch_fsck_flags { x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_pos_bad, 292, 0) \ x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ + x(stripe_csum_granularity_bad, 290, 0) \ x(stripe_sector_count_wrong, 169, 0) \ x(snapshot_tree_pos_bad, 170, 0) \ x(snapshot_tree_to_missing_snapshot, 171, 0) \ @@ -225,11 +230,13 @@ enum bch_fsck_flags { x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ 
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ x(key_in_missing_inode, 216, 0) \ x(key_in_wrong_inode_type, 217, 0) \ @@ -264,8 +271,8 @@ enum bch_fsck_flags { x(journal_entry_dup_same_device, 246, 0) \ x(inode_bi_subvol_missing, 247, 0) \ x(inode_bi_subvol_wrong, 248, 0) \ - x(inode_points_to_missing_dirent, 249, 0) \ - x(inode_points_to_wrong_dirent, 250, 0) \ + x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ + x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ x(inode_bi_parent_nonzero, 251, 0) \ x(dirent_to_missing_parent_subvol, 252, 0) \ x(dirent_not_visible_in_parent_subvol, 253, 0) \ @@ -289,6 +296,7 @@ enum bch_fsck_flags { x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ @@ -298,7 +306,7 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 287, 0) + x(MAX, 295, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 02bcde3c1b02..116131f95815 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { + prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out, prt_newline(out); prt_printf(out, "Btree allocated bitmap blocksize:\t"); - prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + if (m.btree_bitmap_shift < 64) + prt_units_u64(out, 1ULL << m.btree_bitmap_shift); + else + prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); prt_newline(out); prt_printf(out, "Btree allocated bitmap:\t"); @@ -442,7 +450,7 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } - BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); BUG_ON(end > 64ULL << m->btree_bitmap_shift); for (unsigned bit = start >> m->btree_bitmap_shift; diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index d727d2dfda08..2adf1221a440 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -66,6 +66,12 @@ struct bch_member { }; /* + * btree_allocated_bitmap can represent sector addresses of a u64: it itself has + * 64 elements, so 64 - ilog2(64) + */ +#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 + +/* * This limit comes from the bucket_gens array - it's a single allocation, and * kernel allocation are limited to INT_MAX */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 1809442b00ee..ae57638506c3 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) if (bch2_snapshot_equiv(c, id)) return 0; - /* 0 is an invalid tree ID */ + /* Do we need to 
reconstruct the snapshot_tree entry as well? */ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; u32 tree_id = 0; - int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + if (ret) return ret; + if (!tree_id) { + ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + } + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); ret = PTR_ERR_OR_ZERO(snapshot); if (ret) @@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) snapshot->v.tree = cpu_to_le32(tree_id); snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: @@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - - return s->children[1] ?: s->children[0]; -} - -static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -{ - u32 child; - - while ((child = bch2_snapshot_smallest_child(c, id))) - id = child; - return id; -} - -static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c interior_k, - u32 leaf_id, struct bpos *new_min_pos) -{ - struct btree_iter iter; - struct bpos pos = interior_k.k->p; - struct bkey_s_c k; - struct bkey_i *new; - int ret; - - pos.snapshot = leaf_id; - - bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - - /* key already overwritten in this snapshot? 
*/ - if (k.k->p.snapshot != interior_k.k->p.snapshot) - goto out; - - if (bpos_eq(*new_min_pos, POS_MIN)) { - *new_min_pos = k.k->p; - new_min_pos->snapshot = leaf_id; - } - - new = bch2_bkey_make_mut_noupdate(trans, interior_k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - new->k.p.snapshot = leaf_id; - ret = bch2_trans_update(trans, &iter, new, 0); -out: - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, - enum btree_id btree, - struct bkey_s_c k, - struct bpos *new_min_pos) -{ - struct bch_fs *c = trans->c; - struct bkey_buf sk; - u32 restart_count = trans->restart_count; - int ret = 0; - - bch2_bkey_buf_init(&sk); - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - *new_min_pos = POS_MIN; - - for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); - id < k.k->p.snapshot; - id++) { - if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || - !bch2_snapshot_is_leaf(c, id)) - continue; -again: - ret = btree_trans_too_many_iters(trans) ?: - bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: - bch2_trans_commit(trans, NULL, NULL, 0); - if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bch2_trans_begin(trans); - goto again; - } - - if (ret) - break; - } - - bch2_bkey_buf_exit(&sk, c); - - return ret ?: trans_was_restarted(trans, restart_count); -} - static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index eb5ef64221d6..29c94716293e 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } -int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, - struct bkey_s_c, struct bpos *); - int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 215eed4cce6d..ec2b1feea520 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -46,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { /* XXX ick */ struct bch_hash_info info = { - .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & - ~(~0U << INODE_STR_HASH_BITS), + .type = INODE_STR_HASH(bi), .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -253,19 +252,20 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_in_snapshot(struct btree_trans *trans, +struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { - struct btree_iter iter, slot = { NULL }; + struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), @@ -280,7 +280,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(&slot, &iter); + bch2_trans_copy_iter(&slot, iter); if (k.k->type 
!= KEY_TYPE_hash_whiteout) goto not_found; @@ -290,29 +290,50 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, ret = -BCH_ERR_ENOSPC_str_hash_create; out: bch2_trans_iter_exit(trans, &slot); - bch2_trans_iter_exit(trans, &iter); - - return ret; + bch2_trans_iter_exit(trans, iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; found: found = true; not_found: - - if (!found && (flags & STR_HASH_must_replace)) { + if (found && (flags & STR_HASH_must_create)) { + bch2_trans_iter_exit(trans, &slot); + return k; + } else if (!found && (flags & STR_HASH_must_replace)) { ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; - } else if (found && (flags & STR_HASH_must_create)) { - ret = -BCH_ERR_EEXIST_str_hash_set; } else { if (!found && slot.path) - swap(iter, slot); + swap(*iter, slot); - insert->k.p = iter.pos; - ret = bch2_trans_update(trans, &iter, insert, flags); + insert->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, insert, flags); } goto out; } static __always_inline +int bch2_hash_set_in_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + enum btree_iter_update_trigger_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum, + snapshot, insert, flags); + int ret = bkey_err(k); + if (ret) + return ret; + if (k.k) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_EEXIST_str_hash_set; + } + + return 0; +} + +static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, @@ -363,8 +384,11 @@ int bch2_hash_delete(struct btree_trans *trans, struct btree_iter iter; struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_intent); - int ret = bkey_err(k) ?: - bch2_hash_delete_at(trans, desc, info, &iter, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 91d8187ee168..80e5efaff524 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -319,8 +319,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) { - return bch2_trans_do(c, NULL, NULL, 0, - bch2_subvol_is_ro_trans(trans, subvol)); + return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); } int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, @@ -676,8 +675,8 @@ err: /* set bi_subvol on root inode */ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ce7410d72089..7c71594f6a8b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return -BCH_ERR_invalid_sb_layout_nr_superblocks; } + if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { + prt_printf(out, "Invalid superblock layout: max_size_bits too high"); + return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; + } + max_sectors = 1 << 
layout->sb_max_size_bits; prev_offset = le64_to_cpu(layout->sb_offset[0]); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 873e4be7e1dc..a6ed9a0bf1c7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); +static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); @@ -271,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes++; if (bch2_btree_interior_updates_flush(c) || + bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { @@ -620,9 +622,7 @@ void __bch2_fs_stop(struct bch_fs *c) up_write(&c->state_lock); for_each_member_device(c, ca) - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); @@ -1187,9 +1187,7 @@ static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); - if (ca->kobj.state_in_sysfs && - ca->disk_sb.bdev) - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); @@ -1226,10 +1224,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); - if (ca->kobj.state_in_sysfs) { - sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); - sysfs_remove_link(&ca->kobj, "block"); - } + bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); @@ -1251,6 +1246,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref) complete(&ca->io_ref_completion); } +static void bch2_dev_unlink(struct bch_dev *ca) +{ + struct kobject *b; + + /* + * This is racy w.r.t. the underlying block device being hot-removed, + * which removes it from sysfs. 
+ * + * It'd be lovely if we had a way to handle this race, but the sysfs + * code doesn't appear to provide a good method and block/holder.c is + * susceptible as well: + */ + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev && + (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { + sysfs_remove_link(b, "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } +} + static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; @@ -1958,7 +1973,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - ret = bch2_trans_do(ca->fs, NULL, NULL, 0, + ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index b2f209743afe..fb5c1543e52f 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -450,7 +450,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, k.k_i.k.p.snapshot = snapid; k.k_i.k.size = len; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, BTREE_UPDATE_internal_snapshot_node)); bch_err_fn(c, ret); @@ -510,7 +510,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) if (ret) return ret; - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_commit_do(c, NULL, NULL, 0, bch2_snapshot_node_create(trans, U32_MAX, snapids, snapid_subvols, @@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, unsigned i; u64 time; + if (nr == 0 || nr_threads == 0) { + pr_err("nr of iterations or threads is not allowed to be 0"); + return -EINVAL; + } + atomic_set(&j.ready, nr_threads); init_waitqueue_head(&j.ready_wait); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 56c8d3fe55a4..952aca400faf 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -330,7 +330,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_trans_do(c, NULL, NULL, 0, + int ret = bch2_trans_do(c, bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); if (ret < 0 && bch2_err_matches(ret, ENOENT)) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index f92f108840f5..8f430ff8e445 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -11,12 +11,13 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/nls.h> #include <linux/buffer_head.h> #include <linux/vfs.h> -#include <linux/parser.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/cred.h> @@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static void befs_put_super(struct super_block *); -static int befs_remount(struct super_block *, int *, char *); static int befs_statfs(struct dentry *, struct kstatfs *); static int befs_show_options(struct seq_file *, struct dentry *); -static int parse_options(char *, struct befs_mount_options *); static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct 
dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_get_parent(struct dentry *child); +static void befs_free_fc(struct fs_context *fc); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ .free_inode = befs_free_inode, /* deallocate an inode */ .put_super = befs_put_super, /* uninit super */ .statfs = befs_statfs, /* statfs */ - .remount_fs = befs_remount, .show_options = befs_show_options, }; @@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child) } enum { - Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, + Opt_uid, Opt_gid, Opt_charset, Opt_debug, }; -static const match_table_t befs_tokens = { - {Opt_uid, "uid=%d"}, - {Opt_gid, "gid=%d"}, - {Opt_charset, "iocharset=%s"}, - {Opt_debug, "debug"}, - {Opt_err, NULL} +static const struct fs_parameter_spec befs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_string ("iocharset", Opt_charset), + fsparam_flag ("debug", Opt_debug), + {} }; static int -parse_options(char *options, struct befs_mount_options *opts) +befs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - kuid_t uid; - kgid_t gid; - - /* Initialize options */ - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->use_uid = 0; - opts->use_gid = 0; - opts->iocharset = NULL; - opts->debug = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, befs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - uid = INVALID_UID; - if (option >= 0) - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - pr_err("Invalid uid %d, " - "using default\n", option); - break; - } - opts->uid = uid; - opts->use_uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - gid = INVALID_GID; - if (option >= 0) - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - pr_err("Invalid gid %d, " - "using default\n", option); - break; - } - opts->gid = gid; - opts->use_gid = 1; - break; - case Opt_charset: - kfree(opts->iocharset); - opts->iocharset = match_strdup(&args[0]); - if (!opts->iocharset) { - pr_err("allocation failure for " - "iocharset string\n"); - return 0; - } - break; - case Opt_debug: - opts->debug = 1; - break; - default: - pr_err("Unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct befs_mount_options *opts = fc->fs_private; + int token; + struct fs_parse_result result; + + /* befs ignores all options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; + + token = fs_parse(fc, befs_param_spec, param, &result); + if (token < 0) + return token; + + switch (token) { + case Opt_uid: + opts->uid = result.uid; + opts->use_uid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = param->string; + param->string = NULL; + break; + case Opt_debug: + opts->debug = 1; + break; + default: + return -EINVAL; } - return 1; + return 0; } static int befs_show_options(struct seq_file *m, struct dentry *root) @@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +/* + * Copy the parsed options into the sbi mount_options member + */ +static void 
+befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts) +{ + sbi->mount_opts.uid = opts->uid; + sbi->mount_opts.gid = opts->gid; + sbi->mount_opts.use_uid = opts->use_uid; + sbi->mount_opts.use_gid = opts->use_gid; + sbi->mount_opts.debug = opts->debug; + sbi->mount_opts.iocharset = opts->iocharset; + opts->iocharset = NULL; +} + /* Allocate private field of the superblock, fill it. * * Finish filling the public superblock fields @@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb) * Load a set of NLS translations if needed. */ static int -befs_fill_super(struct super_block *sb, void *data, int silent) +befs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh; struct befs_sb_info *befs_sb; @@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) const unsigned long sb_block = 0; const off_t x86_sb_off = 512; int blocksize; + struct befs_mount_options *parsed_opts = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) @@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_sb = BEFS_SB(sb); - if (!parse_options((char *) data, &befs_sb->mount_opts)) { - if (!silent) - befs_error(sb, "cannot parse mount options"); - goto unacquire_priv_sbp; - } + befs_set_options(befs_sb, parsed_opts); befs_debug(sb, "---> %s", __func__); @@ -934,10 +907,10 @@ unacquire_none: } static int -befs_remount(struct super_block *sb, int *flags, char *data) +befs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - if (!(*flags & SB_RDONLY)) + sync_filesystem(fc->root->d_sb); + if (!(fc->sb_flags & SB_RDONLY)) return -EINVAL; return 0; } @@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry * -befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) +static int befs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, befs_fill_super); +} + +static const struct fs_context_operations befs_context_ops = { + .parse_param = befs_parse_param, + .get_tree = befs_get_tree, + .reconfigure = befs_reconfigure, + .free = befs_free_fc, +}; + +static int befs_init_fs_context(struct fs_context *fc) +{ + struct befs_mount_options *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + /* Initialize options */ + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; + + fc->fs_private = opts; + fc->ops = &befs_context_ops; + + return 0; +} + +static void befs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); + struct befs_mount_options *opts = fc->fs_private; + + kfree(opts->iocharset); + kfree(fc->fs_private); } static struct file_system_type befs_fs_type = { .owner = THIS_MODULE, .name = "befs", - .mount = befs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = befs_init_fs_context, + .parameters = befs_param_spec, }; MODULE_ALIAS_FS("befs"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 06dc4a57ba78..106f0e8af177 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -258,6 +258,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); +#endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if 
(k_platform) { NEW_AUX_ENT(AT_PLATFORM, @@ -1251,6 +1257,7 @@ out_free_interp: } reloc_func_desc = interp_load_addr; + allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); @@ -1347,6 +1354,7 @@ out_free_dentry: kfree(interp_elf_ex); kfree(interp_elf_phdata); out_free_file: + allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 4fe5bb9f1b1f..f1a7c4875c4a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -394,6 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) goto error; } + allow_write_access(interpreter); fput(interpreter); interpreter = NULL; } @@ -465,8 +466,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) retval = 0; error: - if (interpreter) + if (interpreter) { + allow_write_access(interpreter); fput(interpreter); + } kfree(interpreter_name); kfree(exec_params.phdrs); kfree(exec_params.loadmap); @@ -624,6 +627,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); +#endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index df6a229b5e62..5a7ebd160724 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -247,10 +247,13 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto ret; - if (fmt->flags & MISC_FMT_OPEN_FILE) + if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = file_clone_open(fmt->interp_file); - else + if (!IS_ERR(interp_file)) + deny_write_access(interp_file); + } else { interp_file = open_exec(fmt->interpreter); + } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) goto ret; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 4fb925e8c981..fa8515598341 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -78,6 +78,32 @@ config BTRFS_ASSERT If unsure, say N. +config BTRFS_EXPERIMENTAL + bool "Btrfs experimental features" + depends on BTRFS_FS + default n + help + Enable experimental features. These features may not be stable enough + for end users. This is meant for btrfs developers or users who wish + to test the functionality and report problems. + + Current list: + + - extent map shrinker - performance problems with too frequent shrinks + + - send stream protocol v3 - fs-verity support + + - checksum offload mode - sysfs knob to affect when checksums are + calculated (at IO time, or in a thread) + + - raid-stripe-tree - additional mapping of extents to devices to + support RAID1* profiles on zoned devices, + RAID56 not yet supported + + - extent tree v2 - complex rework of extent tracking + + If unsure, say N. 
+ config BTRFS_FS_REF_VERIFY bool "Btrfs with the ref verify tool compiled in" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 87617f2968bc..2d5f0482678b 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ - tests/free-space-tree-tests.o tests/extent-map-tests.o + tests/free-space-tree-tests.o tests/extent-map-tests.o \ + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 361a866c1995..a4c51600a408 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -18,7 +18,7 @@ enum { }; #define NO_THRESHOLD (-1) -#define DFT_THRESHOLD (32) +#define DEFAULT_THRESHOLD (32) struct btrfs_workqueue { struct workqueue_struct *normal_wq; @@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, ret->limit_active = limit_active; if (thresh == 0) - thresh = DFT_THRESHOLD; + thresh = DEFAULT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ - if (thresh < DFT_THRESHOLD) { + if (thresh < DEFAULT_THRESHOLD) { ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f8e1d5b2c512..3d3923cfc357 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1, return 0; } +static int prelim_ref_rb_add_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct prelim_ref *ref_new = + rb_entry(new, struct prelim_ref, rbnode); + const struct prelim_ref *ref_exist = + rb_entry(exist, struct prelim_ref, rbnode); + + /* + * prelim_ref_compare() expects the first parameter as the existing one, + * different from the rb_find_add_cached() order. + */ + return prelim_ref_compare(ref_exist, ref_new); +} + static void update_share_count(struct share_check *sc, int oldcount, int newcount, const struct prelim_ref *newref) { @@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, struct share_check *sc) { struct rb_root_cached *root; - struct rb_node **p; - struct rb_node *parent = NULL; - struct prelim_ref *ref; - int result; - bool leftmost = true; + struct rb_node *exist; root = &preftree->root; - p = &root->rb_root.rb_node; + exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp); + if (exist) { + struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode); + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; - while (*p) { - parent = *p; - ref = rb_entry(parent, struct prelim_ref, rbnode); - result = prelim_ref_compare(ref, newref); - if (result < 0) { - p = &(*p)->rb_left; - } else if (result > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - /* Identical refs, merge them and free @newref */ - struct extent_inode_elem *eie = ref->inode_list; - - while (eie && eie->next) - eie = eie->next; + while (eie && eie->next) + eie = eie->next; - if (!eie) - ref->inode_list = newref->inode_list; - else - eie->next = newref->inode_list; - trace_btrfs_prelim_ref_merge(fs_info, ref, newref, - preftree->count); - /* - * A delayed ref can have newref->count < 0. 
- * The ref->count is updated to follow any - * BTRFS_[ADD|DROP]_DELAYED_REF actions. - */ - update_share_count(sc, ref->count, - ref->count + newref->count, newref); - ref->count += newref->count; - free_pref(newref); - return; - } + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count, newref); + ref->count += newref->count; + free_pref(newref); + return; } update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); - rb_link_node(&newref->rbnode, parent, p); - rb_insert_color_cached(&newref->rbnode, root, leftmost); } /* @@ -1442,7 +1441,8 @@ again: */ delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); + head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, + ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -3021,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); INIT_LIST_HEAD(&cache->pending_edge); INIT_LIST_HEAD(&cache->useless_node); cache->fs_info = fs_info; @@ -3131,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { - struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; if (!node) return; - BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]); - upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - - /* - * Add the node to leaf node list if no other child block - * cached. 
- */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } } btrfs_backref_drop_node(cache, node); @@ -3165,33 +3150,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) { struct btrfs_backref_node *node; - int i; - - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct btrfs_backref_node, list); - btrfs_backref_cleanup_node(cache, node); - } - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct btrfs_backref_node, lower); + while ((node = rb_entry_safe(rb_first(&cache->rb_root), + struct btrfs_backref_node, rb_node))) btrfs_backref_cleanup_node(cache, node); - } - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - while (!list_empty(&cache->pending[i])) { - node = list_first_entry(&cache->pending[i], - struct btrfs_backref_node, - list); - btrfs_backref_cleanup_node(cache, node); - } - } ASSERT(list_empty(&cache->pending_edge)); ASSERT(list_empty(&cache->useless_node)); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); ASSERT(!cache->nr_nodes); ASSERT(!cache->nr_edges); } @@ -3315,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, root = btrfs_get_fs_root(fs_info, ref_key->offset, false); if (IS_ERR(root)) return PTR_ERR(root); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - cur->cowonly = 1; + + /* We shouldn't be using backref cache for non-shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + return -EUCLEAN; + } if (btrfs_root_level(&root->root_item) == cur->level) { /* Tree root */ @@ -3402,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, goto out; } upper->owner = btrfs_header_owner(eb); - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - upper->cowonly = 1; + + /* We shouldn't be using backref cache for non shareable roots. */ + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + btrfs_backref_free_node(cache, upper); + ret = -EUCLEAN; + goto out; + } /* * If we know the block isn't shared we can avoid @@ -3594,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - /* Insert this node to cache if it's not COW-only */ - if (!start->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, - &start->rb_node); - if (rb_node) - btrfs_backref_panic(cache->fs_info, start->bytenr, - -EEXIST); - list_add_tail(&start->lower, &cache->leaves); - } + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); /* * Use breadth first search to iterate all related edges. @@ -3641,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, * parents have already been linked. 
*/ if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - list_add_tail(&edge->list[UPPER], &upper->lower); continue; } @@ -3656,23 +3621,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - /* Sanity check, COW-only node has non-COW-only parent */ - if (start->cowonly != upper->cowonly) { - ASSERT(0); + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (unlikely(rb_node)) { + btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; } - /* Only cache non-COW-only (subvolume trees) tree blocks */ - if (!upper->cowonly) { - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) { - btrfs_backref_panic(cache->fs_info, - upper->bytenr, -EEXIST); - return -EUCLEAN; - } - } - list_add_tail(&edge->list[UPPER], &upper->lower); /* diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e8c22cccb5c1..74e614031274 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -318,6 +318,12 @@ struct btrfs_backref_node { u64 bytenr; }; /* Use rb_simple_node for search/insert */ + /* + * This is a sanity check, whenever we COW a block we will update + * new_bytenr with it's current location, and we will check this in + * various places to validate that the cache makes sense, it shouldn't + * be used for anything else. + */ u64 new_bytenr; /* Objectid of tree block owner, can be not uptodate */ u64 owner; @@ -335,10 +341,6 @@ struct btrfs_backref_node { struct extent_buffer *eb; /* Level of the tree block */ unsigned int level:8; - /* Is the block in a non-shareable tree */ - unsigned int cowonly:1; - /* 1 if no child node is in the cache */ - unsigned int lowest:1; /* Is the extent buffer locked */ unsigned int locked:1; /* Has the block been processed */ @@ -391,12 +393,6 @@ struct btrfs_backref_cache { * level blocks may not reflect the new location */ struct list_head pending[BTRFS_MAX_LEVEL]; - /* List of backref nodes with no child node */ - struct list_head leaves; - /* List of blocks that have been COWed in current transaction */ - struct list_head changed; - /* List of detached backref node. */ - struct list_head detached; u64 last_trans; diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fec5c6cde0a7..bc2555c44a12 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* @@ -80,6 +81,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, &btrfs_clone_bioset); + if (IS_ERR(bio)) + return ERR_CAST(bio); + bbio = btrfs_bio(bio); btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; @@ -113,41 +117,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. 
- */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -366,7 +358,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_bio_end_io(bbio, bbio->bio.bi_status); } @@ -409,7 +401,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; - if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + if (bio_is_zone_append(bio) && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; btrfs_bio_end_io(bbio, bbio->bio.bi_status); @@ -423,7 +415,7 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); - } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + } else if (bio_is_zone_append(bio)) { stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } @@ -461,6 +453,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); + /* + * Track reads if tracking is enabled; ignore I/O operations before the + * filesystem is fully initialized. + */ + if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info) + percpu_counter_add(&dev->fs_info->stats_read_blocks, + bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits); + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -598,7 +598,7 @@ static bool should_async_write(struct btrfs_bio *bbio) { bool auto_csum_mode = true; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); @@ -660,8 +660,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) map_length = min(map_length, bbio->fs_info->max_zone_append_size); sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, &nr_segs, map_length); - if (sector_offset) - return sector_offset << SECTOR_SHIFT; + if (sector_offset) { + /* + * bio_split_rw_at() could split at a size smaller than our + * sectorsize and thus cause unaligned I/Os. 
Fix that by + * always rounding down to the nearest boundary. + */ + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + } return map_length; } @@ -689,7 +695,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); - goto fail; + btrfs_bio_counter_dec(fs_info); + goto end_bbio; } map_length = min(map_length, length); @@ -697,7 +704,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) map_length = btrfs_append_map_length(bbio, map_length); if (map_length < length) { - bbio = btrfs_split_bio(fs_info, bbio, map_length); + struct btrfs_bio *split; + + split = btrfs_split_bio(fs_info, bbio, map_length); + if (IS_ERR(split)) { + ret = errno_to_blk_status(PTR_ERR(split)); + btrfs_bio_counter_dec(fs_info); + goto end_bbio; + } + bbio = split; bio = &bbio->bio; } @@ -718,8 +733,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } - if (is_data_bbio(bbio) && bioc && - btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + if (is_data_bbio(bbio) && bioc && bioc->use_rst) { /* * No locking for the list update, as we only add to * the list in the I/O submission path, and list @@ -771,6 +785,7 @@ fail: btrfs_bio_end_io(remaining, ret); } +end_bbio: btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e48612340745..e2fe16074ad6 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -79,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. 
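The fs/btrfs/bio.c and bio.h hunks above add a btrfs_bio::status field so that, when a bio is split, each completion records only the first error (a cmpxchg away from BLK_STS_OK) and whichever completion drops the last pending_ios reference publishes it, replacing the removed btrfs_bbio_propagate_error() helper. Below is a minimal userspace model of that pattern using C11 atomics; demo_bio, demo_end_io and the STS_* values are invented for illustration and merely mirror the kernel's cmpxchg()/atomic_dec_and_test() usage, with the corner case where the final caller itself failed simplified away.

#include <stdatomic.h>
#include <stdio.h>

#define STS_OK		0
#define STS_IOERR	10

/* Hypothetical stand-in for the original (pre-split) btrfs_bio. */
struct demo_bio {
	atomic_int pending_ios;		/* outstanding split parts */
	atomic_int status;		/* first error seen, or STS_OK */
	int final_status;		/* what the end_io handler would observe */
};

static void demo_end_io(struct demo_bio *b, int status)
{
	int expected = STS_OK;

	if (status != STS_OK)
		/* Keep only the first error; later errors lose the race. */
		atomic_compare_exchange_strong(&b->status, &expected, status);

	if (atomic_fetch_sub(&b->pending_ios, 1) == 1)
		/* Last completion: publish whatever error (if any) was recorded. */
		b->final_status = atomic_load(&b->status);
}

int main(void)
{
	struct demo_bio b = {
		.pending_ios = 3,
		.status = STS_OK,
		.final_status = -1,
	};

	demo_end_io(&b, STS_OK);	/* first split part succeeds */
	demo_end_io(&b, STS_IOERR);	/* second part fails: error is recorded */
	demo_end_io(&b, STS_OK);	/* last part succeeds, publishes the error */

	printf("final status: %d\n", b.final_status);	/* prints 10 */
	return 0;
}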
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 7980b2e33a92..c0a8f7d92acc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) } } +static int btrfs_bg_start_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_block_group *new_bg = + rb_entry(new, struct btrfs_block_group, cache_node); + const struct btrfs_block_group *exist_bg = + rb_entry(exist, struct btrfs_block_group, cache_node); + + if (new_bg->start < exist_bg->start) + return -1; + if (new_bg->start > exist_bg->start) + return 1; + return 0; +} + /* * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group *cache; - bool leftmost = true; + struct rb_node *exist; + int ret = 0; ASSERT(block_group->length != 0); write_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_root.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group, cache_node); - if (block_group->start < cache->start) { - p = &(*p)->rb_left; - } else if (block_group->start > cache->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color_cached(&block_group->cache_node, - &info->block_group_cache_tree, leftmost); + exist = rb_find_add_cached(&block_group->cache_node, + &info->block_group_cache_tree, btrfs_bg_start_cmp); + if (exist) + ret = -EEXIST; write_unlock(&info->block_group_cache_lock); - return 0; + return ret; } /* @@ -1223,7 +1221,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->space_info->total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); - btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, -block_group->zone_unusable); block_group->space_info->disk_total -= block_group->length * factor; @@ -1396,8 +1394,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes to readonly */ sinfo->bytes_readonly += cache->zone_unusable; - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - -cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable); cache->zone_unusable = 0; } cache->ro++; @@ -1645,8 +1642,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - btrfs_space_info_update_bytes_pinned(fs_info, space_info, - -block_group->pinned); + btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; block_group->pinned = 0; @@ -2672,7 +2668,6 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -2797,7 +2792,7 @@ next: * uncompressed data size, because the compression is only done * when writeback triggered and we 
don't know how much space we * are actually going to need, so we reserve the uncompressed - * size because the data may be uncompressible in the worst case. + * size because the data may be incompressible in the worst case. */ if (ret == 0) { bool used; @@ -3060,8 +3055,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) (cache->alloc_offset - cache->used - cache->pinned - cache->reserved) + (cache->length - cache->zone_capacity); - btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, - cache->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); sinfo->bytes_readonly -= cache->zone_unusable; } num_bytes = cache->length - cache->reserved - @@ -3123,7 +3117,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); /* @@ -3699,7 +3692,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); space_info->bytes_used -= num_bytes; space_info->disk_used -= num_bytes * factor; if (READ_ONCE(space_info->periodic_reclaim)) @@ -3781,8 +3774,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved += num_bytes; trace_btrfs_space_reservation(cache->fs_info, "space_info", space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(cache->fs_info, - space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; @@ -3819,6 +3811,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a07b9594dc70..3f3608299c0b 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, - space_info, - num_bytes); + btrfs_space_info_free_bytes_may_use(space_info, num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, num_bytes); block_rsv->reserved = block_rsv->size; } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); + btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; btrfs_try_granting_tickets(fs_info, sinfo); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e152fde888fc..b2fa33911c28 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -526,7 +526,7 @@ bool 
btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict); + bool nowait); void btrfs_del_delalloc_inode(struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); @@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size); +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 90aef2627ca2..0c4d486c3048 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - folio = __filemap_get_folio(mapping, pg_index, 0, 0); + folio = filemap_get_folio(mapping, pg_index); if (!IS_ERR(folio)) { u64 folio_sz = folio_size(folio); u64 offset = offset_in_folio(folio, cur); @@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. 
*/ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, folio, cur, - add_size); + btrfs_folio_set_lock(fs_info, folio, cur, add_size); folio_put(folio); cur += add_size; } @@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(unsigned int level) +static struct list_head *alloc_heuristic_ws(void) { struct heuristic_ws *ws; @@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { static struct list_head *alloc_workspace(int type, unsigned int level) { switch (type) { - case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); - case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(); case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); default: /* @@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); + const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping workspace = get_workspace(type, level); ret = compression_compress_pages(type, workspace, mapping, start, folios, out_folios, total_in, total_out); + /* The total read-in bytes should be no larger than the input. */ + ASSERT(*total_in <= orig_len); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b6563b6a333e..954034086d0d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); +struct list_head *lzo_alloc_workspace(void); void lzo_free_workspace(struct list_head *ws); int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0cc919d15b14..92071ca0655f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); - -static const struct btrfs_csums { - u16 size; - const char name[10]; - const char driver[12]; -} btrfs_csums[] = { - [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, - [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, - [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", - .driver = "blake2b-256" }, -}; - /* * The leaf data grows from end-to-front in the node. this returns the address * of the start of the last item, which is the stop of the leaf data stack. @@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } -/* This exists for btrfs-progs usages. 
*/ -u16 btrfs_csum_type_size(u16 type) -{ - return btrfs_csums[type].size; -} - -int btrfs_super_csum_size(const struct btrfs_super_block *s) -{ - u16 t = btrfs_super_csum_type(s); - /* - * csum type is validated at mount time - */ - return btrfs_csum_type_size(t); -} - -const char *btrfs_super_csum_name(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].name; -} - -/* - * Return driver name if defined, otherwise the name that's also a valid driver - * name - */ -const char *btrfs_super_csum_driver(u16 csum_type) -{ - /* csum type is validated at mount time */ - return btrfs_csums[csum_type].driver[0] ? - btrfs_csums[csum_type].driver : - btrfs_csums[csum_type].name; -} - -size_t __attribute_const__ btrfs_get_num_csums(void) -{ - return ARRAY_SIZE(btrfs_csums); -} - struct btrfs_path *btrfs_alloc_path(void) { might_sleep(); @@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p) } /* - * We want the transaction abort to print stack trace only for errors where the - * cause could be a bug, eg. due to ENOSPC, and not for common errors that are - * caused by external factors. - */ -bool __cold abort_should_print_stack(int error) -{ - switch (error) { - case -EIO: - case -EROFS: - case -ENOMEM: - return false; - } - return true; -} - -/* * safely gets a reference on the root node of a tree. A lock * is not taken, so a concurrent writer may put a different node * at the root of the tree. See btrfs_lock_root_node for the @@ -654,6 +587,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, goto error_unlock_cow; } } + + trace_btrfs_cow_block(root, buf, cow); if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); @@ -710,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; - int ret; if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { btrfs_abort_transaction(trans, -EUCLEAN); @@ -751,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. 
*/ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, - cow_ret, search_start, 0, nest); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); } ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); @@ -1508,26 +1438,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level, */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, + struct extent_buffer **eb_ret, int slot, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; - u64 gen; - struct extent_buffer *tmp; - int ret; + struct extent_buffer *tmp = NULL; + int ret = 0; int parent_level; - bool unlock_up; + int err; + bool read_tmp = false; + bool tmp_locked = false; + bool path_released = false; - unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); - gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); check.has_first_key = true; check.level = parent_level - 1; - check.transid = gen; + check.transid = btrfs_node_ptr_generation(*eb_ret, slot); check.owner_root = btrfs_root_id(root); /* @@ -1540,79 +1470,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple * parents (shared tree blocks). */ - if (btrfs_verify_level_key(tmp, - parent_level - 1, &check.first_key, gen)) { - free_extent_buffer(tmp); - return -EUCLEAN; + if (btrfs_verify_level_key(tmp, &check)) { + ret = -EUCLEAN; + goto out; } *eb_ret = tmp; - return 0; + tmp = NULL; + ret = 0; + goto out; } if (p->nowait) { - free_extent_buffer(tmp); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) - btrfs_unlock_up_safe(p, level + 1); - - /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, &check); - if (ret) { - free_extent_buffer(tmp); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return ret; + ret = -EAGAIN; + path_released = true; } - if (unlock_up) - ret = -EAGAIN; + /* Now we're allowed to do a blocking uptodate check. 
*/ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; + } + if (ret == 0) { + ASSERT(!tmp_locked); + *eb_ret = tmp; + tmp = NULL; + } goto out; } else if (p->nowait) { - return -EAGAIN; + ret = -EAGAIN; + goto out; } - if (unlock_up) { - btrfs_unlock_up_safe(p, level + 1); + if (!p->skip_locking) { + btrfs_unlock_up_safe(p, parent_level + 1); ret = -EAGAIN; - } else { - ret = 0; } if (p->reada != READA_NONE) - reada_for_search(fs_info, p, level, slot, key->objectid); + reada_for_search(fs_info, p, parent_level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, &check); + tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + tmp = NULL; + goto out; + } + read_tmp = true; + + if (!p->skip_locking) { + ASSERT(ret == -EAGAIN); + tmp_locked = true; + btrfs_tree_read_lock(tmp); btrfs_release_path(p); - return PTR_ERR(tmp); + path_released = true; + } + + /* Now we're allowed to do a blocking uptodate check. */ + err = btrfs_read_extent_buffer(tmp, &check); + if (err) { + ret = err; + goto out; } + /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now * and give up so that our caller doesn't loop forever * on our EAGAINs. */ - if (!extent_buffer_uptodate(tmp)) + if (!extent_buffer_uptodate(tmp)) { ret = -EIO; + goto out; + } -out: if (ret == 0) { + ASSERT(!tmp_locked); *eb_ret = tmp; - } else { - free_extent_buffer(tmp); - btrfs_release_path(p); + tmp = NULL; } +out: + if (tmp) { + if (tmp_locked) + btrfs_tree_read_unlock(tmp); + if (read_tmp && ret && ret != -EAGAIN) + free_extent_buffer_stale(tmp); + else + free_extent_buffer(tmp); + } + if (ret && !path_released) + btrfs_release_path(p); return ret; } @@ -2010,7 +1976,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info; struct extent_buffer *b; int slot; int ret; @@ -2023,6 +1989,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + if (!root) + return -EINVAL; + + fs_info = root->fs_info; might_sleep(); lowest_level = p->lowest_level; @@ -2197,8 +2167,8 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2324,8 +2294,8 @@ again: goto done; } - err = read_block_for_search(root, p, &b, level, slot, key); - if (err == -EAGAIN) + err = read_block_for_search(root, p, &b, slot, key); + if (err == -EAGAIN && !p->nowait) goto again; if (err) { ret = err; @@ -2334,7 +2304,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3863,6 +3833,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_RAID_STRIPE_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); if (btrfs_leaf_free_space(leaf) >= ins_len) @@ -4930,8 +4901,7 @@ again: } next = c; - ret = read_block_for_search(root, path, &next, level, - slot, &key); + ret = 
read_block_for_search(root, path, &next, slot, &key); if (ret == -EAGAIN && !path->nowait) goto again; @@ -4974,8 +4944,7 @@ again: if (!level) break; - ret = read_block_for_search(root, path, &next, level, - 0, &key); + ret = read_block_for_search(root, path, &next, 0, &key); if (ret == -EAGAIN && !path->nowait) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 317a3712270f..1096a80a64e7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -7,7 +7,6 @@ #define BTRFS_CTREE_H #include "linux/cleanup.h" -#include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> @@ -371,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi } /* + * Return the generation this root started with. + * + * Every normal root that is created with root->root_key.offset set to it's + * originating generation. If it is a snapshot it is the generation when the + * snapshot was created. + * + * However for TREE_RELOC roots root_key.offset is the objectid of the owning + * tree root. Thankfully we copy the root item of the owning tree root, which + * has it's last_snapshot set to what we would have root_key.offset set to, so + * return that if this is a TREE_RELOC root. + */ +static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) +{ + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) + return btrfs_root_last_snapshot(&root->root_item); + return root->root_key.offset; +} + +/* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ @@ -487,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sectorsize_bits) - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_constraint(mapping, ~__GFP_FS); -} - -void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); - -/* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); @@ -737,23 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } -u16 btrfs_csum_type_size(u16 type); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - -/* - * We use page status Private2 to indicate there is an ordered extent with - * unfinished IO. - * - * Rename the Private2 accessors to Ordered, to improve readability. 
- */ -#define PageOrdered(page) PagePrivate2(page) -#define SetPageOrdered(page) SetPagePrivate2(page) -#define ClearPageOrdered(page) ClearPagePrivate2(page) -#define folio_test_ordered(folio) folio_test_private_2(folio) -#define folio_set_ordered(folio) folio_set_private_2(folio) -#define folio_clear_ordered(folio) folio_clear_private_2(folio) - #endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b95ef44c326b..968dae953948 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. + * This is because even if we have adjacent extents that are contiguous + * and compatible (same type and flags), we still want to defrag them + * so that we use less metadata (extent items in the extent tree and + * file extent items in the inode's subvolume tree). */ - if (em && (em->flags & EXTENT_FLAG_MERGED) && - newer_than && em->generation >= newer_than) { + if (em && (em->flags & EXTENT_FLAG_MERGED)) { free_extent_map(em); em = NULL; } diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7aa8a395d838..88e900e5a43d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 508bdbae29a0..0b4933c6a889 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -366,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } +static int btrfs_delayed_item_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_delayed_item *new_item = + rb_entry(new, struct btrfs_delayed_item, rb_node); + const struct btrfs_delayed_item *exist_item = + rb_entry(exist, struct btrfs_delayed_item, rb_node); + + if (new_item->index < exist_item->index) + return -1; + if (new_item->index > exist_item->index) + return 1; + return 0; +} + static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins) { - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; struct rb_root_cached *root; - struct btrfs_delayed_item *item; - bool leftmost = true; + struct rb_node *exist; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; else root = &delayed_node->del_root; - p = &root->rb_root.rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - if (item->index < ins->index) { - p = &(*p)->rb_right; - leftmost = false; - } else if (item->index > ins->index) { - p = &(*p)->rb_left; - } else { - return -EEXIST; - } - } - - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp); + if (exist) + return -EEXIST; if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && ins->index >= delayed_node->index_cnt) @@ -1038,7 +1033,6 @@ 
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1561,8 +1555,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node, +static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, u64 index) { struct btrfs_delayed_item *item; @@ -1620,7 +1613,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); + ret = btrfs_delete_delayed_insertion_item(node, index); if (!ret) goto end; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7cfefdfe54ea..f4d9feac0d0e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -64,9 +64,9 @@ struct btrfs_delayed_node { struct mutex mutex; struct btrfs_inode_item inode_item; refcount_t refs; + int count; u64 index_cnt; unsigned long flags; - int count; /* * The size of the next batch of dir index items to insert (if this * node is from a directory inode). Protected by @mutex. diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ad9ef8312e41..98c5b61dabe8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -9,6 +9,7 @@ #include "messages.h" #include "ctree.h" #include "delayed-ref.h" +#include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" @@ -92,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) u64 num_bytes; u64 reserved_bytes; + if (btrfs_is_testing(fs_info)) + return; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); @@ -253,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); if (to_free > 0) - btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, @@ -264,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2) +static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; @@ -278,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -static int comp_refs(struct btrfs_delayed_ref_node *ref1, - struct btrfs_delayed_ref_node *ref2, +static int comp_refs(const struct btrfs_delayed_ref_node *ref1, + const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; @@ -298,7 +302,7 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) - return -1; + return 1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } @@ -313,142 +317,57 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return 0; } -/* insert a new ref to head ref 
rbtree */ -static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, - struct rb_node *node) +static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) { - struct rb_node **p = &root->rb_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_head *entry; - struct btrfs_delayed_ref_head *ins; - u64 bytenr; - bool leftmost = true; - - ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->bytenr; - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, - href_node); - - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + const struct btrfs_delayed_ref_node *new_node = + rb_entry(new, struct btrfs_delayed_ref_node, ref_node); + const struct btrfs_delayed_ref_node *exist_node = + rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); - return NULL; + return comp_refs(new_node, exist_node, true); } static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { - struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - bool leftmost = true; - - while (*p) { - int comp; - - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - ref_node); - comp = comp_refs(ins, entry, true); - if (comp < 0) { - p = &(*p)->rb_left; - } else if (comp > 0) { - p = &(*p)->rb_right; - leftmost = false; - } else { - return entry; - } - } + struct rb_node *exist; - rb_link_node(node, parent_node, p); - rb_insert_color_cached(node, root, leftmost); + exist = rb_find_add_cached(node, root, cmp_refs_node); + if (exist) + return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return NULL; } static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; + unsigned long from = 0; - n = rb_first_cached(&dr->href_root); - if (!n) - return NULL; - - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - return entry; -} + lockdep_assert_held(&dr->lock); -/* - * Find a head entry based on bytenr. This returns the delayed ref head if it - * was able to find one, or NULL if nothing was in that spot. If return_bigger - * is given, the next bigger entry is returned if no exact match is found. 
- */ -static struct btrfs_delayed_ref_head *find_ref_head( - struct btrfs_delayed_ref_root *dr, u64 bytenr, - bool return_bigger) -{ - struct rb_root *root = &dr->href_root.rb_root; - struct rb_node *n; - struct btrfs_delayed_ref_head *entry; - - n = root->rb_node; - entry = NULL; - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return entry; - } - if (entry && return_bigger) { - if (bytenr > entry->bytenr) { - n = rb_next(&entry->href_node); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_delayed_ref_head, - href_node); - } - return entry; - } - return NULL; + return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) +static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) - return 0; + return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_NODE(&head->href_node)) { + if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); - return -EAGAIN; + return false; } btrfs_put_delayed_ref_head(head); - return 0; + return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, @@ -462,7 +381,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); - atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } @@ -558,33 +476,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; + unsigned long start_index; + unsigned long found_index; + bool found_head = false; + bool locked; - lockdep_assert_held(&delayed_refs->lock); + spin_lock(&delayed_refs->lock); again: - head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, - true); - if (!head && delayed_refs->run_delayed_start != 0) { - delayed_refs->run_delayed_start = 0; - head = find_first_ref_head(delayed_refs); + start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); + xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { + if (!head->processing) { + found_head = true; + break; + } } - if (!head) - return NULL; - - while (head->processing) { - struct rb_node *node; - - node = rb_next(&head->href_node); - if (!node) { - if (delayed_refs->run_delayed_start == 0) - return NULL; - delayed_refs->run_delayed_start = 0; - goto again; + if (!found_head) { + if (delayed_refs->run_delayed_start == 0) { + spin_unlock(&delayed_refs->lock); + return NULL; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); + delayed_refs->run_delayed_start = 0; + goto again; } head->processing = true; @@ -592,23 +508,73 @@ again: delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; + + locked = btrfs_delayed_ref_lock(delayed_refs, head); + spin_unlock(&delayed_refs->lock); + + /* + * We may have dropped the spin lock to get the head mutex lock, and + * that might have given someone else time to 
free the head. If that's + * true, it has been removed from our list and we can move on. + */ + if (!locked) + return ERR_PTR(-EAGAIN); + return head; } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = false; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); + xa_erase(&delayed_refs->head_refs, index); + head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; } +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + + lockdep_assert_held(&head->mutex); + lockdep_assert_held(&head->lock); + + if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) + return NULL; + + /* + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. + */ + if (!list_empty(&head->ref_add_list)) + return list_first_entry(&head->ref_add_list, + struct btrfs_delayed_ref_node, add_list); + + ref = rb_entry(rb_first_cached(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); + ASSERT(list_empty(&ref->add_list)); + return ref; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -629,7 +595,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; @@ -649,7 +614,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); - list_del(&exist->add_list); + list_del_init(&exist->add_list); } else { ASSERT(0); } @@ -813,7 +778,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); - RB_CLEAR_NODE(&head_ref->href_node); + head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); @@ -830,7 +795,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } @@ -840,6 +804,8 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, * helper function to actually insert a head node into the rbtree. 
* this does all the dirty work in terms of maintaining the correct * overall modification count. + * + * Returns an error pointer in case of an error. */ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, @@ -847,31 +813,48 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; + const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; + lockdep_assert_held(&delayed_refs->lock); + +#if BITS_PER_LONG == 32 + if (head_ref->bytenr >= MAX_LFS_FILESIZE) { + if (qrecord) + xa_release(&delayed_refs->dirty_extents, index); + btrfs_err_rl(fs_info, +"delayed ref head %llu is beyond 32bit page cache and xarray index limit", + head_ref->bytenr); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } +#endif /* Record qgroup extent info if provided */ if (qrecord) { int ret; - ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, + head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); + xa_release(&delayed_refs->dirty_extents, index); + /* Caller responsible for freeing qrecord on error. */ + if (ret < 0) + return ERR_PTR(ret); kfree(qrecord); } else { qrecord_inserted = true; } } - trace_add_delayed_ref_head(trans->fs_info, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); - existing = htree_insert(&delayed_refs->href_root, - &head_ref->href_node); + existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* @@ -881,6 +864,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); + if (xa_is_err(existing)) { + /* Memory was preallocated by the caller. */ + ASSERT(xa_err(existing) != -ENOMEM); + return ERR_PTR(xa_err(existing)); + } else if (WARN_ON(existing)) { + /* + * Shouldn't happen we just did a lookup before under + * delayed_refs->lock. 
+ */ + return ERR_PTR(-EEXIST); + } + head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs @@ -890,12 +886,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += - btrfs_csum_bytes_to_leaves(trans->fs_info, - head_ref->num_bytes); + btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; - atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -1000,42 +994,67 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; + const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); + bool qrecord_reserved = false; bool qrecord_inserted; int action = generic_ref->action; bool merged; + int ret; node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) + if (!head_ref) { + ret = -ENOMEM; goto free_node; + } + + delayed_refs = &trans->transaction->delayed_refs; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) + if (!record) { + ret = -ENOMEM; goto free_head_ref; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, - generic_ref->bytenr, GFP_NOFS)) + } + if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { + ret = -ENOMEM; goto free_record; + } + qrecord_reserved = true; + } + + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + if (qrecord_reserved) + xa_release(&delayed_refs->dirty_extents, index); + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; - delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); /* * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); + new_head_ref = add_delayed_ref_head(trans, head_ref, record, + action, &qrecord_inserted); + if (IS_ERR(new_head_ref)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); + ret = PTR_ERR(new_head_ref); + goto free_record; + } + head_ref = new_head_ref; merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); @@ -1054,7 +1073,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: @@ -1063,7 +1082,7 @@ free_head_ref: kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); free_node: kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - return -ENOMEM; + return ret; } /* @@ -1093,7 +1112,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 
num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { + const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_ref generic_ref = { .type = BTRFS_REF_METADATA, @@ -1102,6 +1123,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, .num_bytes = num_bytes, .tree_ref.level = level, }; + int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) @@ -1111,11 +1133,22 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL); + ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); + if (ret) { + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return ret; + } + spin_lock(&delayed_refs->lock); + head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, + BTRFS_UPDATE_DELAYED_HEAD, NULL); + if (IS_ERR(head_ref_ret)) { + xa_release(&delayed_refs->head_refs, index); + spin_unlock(&delayed_refs->lock); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); + return PTR_ERR(head_ref_ret); + } spin_unlock(&delayed_refs->lock); /* @@ -1139,11 +1172,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 bytenr) { + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); + lockdep_assert_held(&delayed_refs->lock); - return find_ref_head(delayed_refs, bytenr, false); + return xa_load(&delayed_refs->head_refs, index); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) @@ -1213,6 +1250,84 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, return found; } +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + bool testing = btrfs_is_testing(fs_info); + + spin_lock(&delayed_refs->lock); + while (true) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; + bool pin_bytes = false; + + head = find_first_ref_head(delayed_refs); + if (!head) + break; + + if (!btrfs_delayed_ref_lock(delayed_refs, head)) + continue; + + spin_lock(&head->lock); + while ((n = rb_first_cached(&head->ref_tree)) != NULL) { + struct btrfs_delayed_ref_node *ref; + + ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); + drop_delayed_ref(fs_info, delayed_refs, head, ref); + } + if (head->must_insert_reserved) + pin_bytes = true; + btrfs_free_delayed_extent_op(head->extent_op); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + mutex_unlock(&head->mutex); + + if (!testing && pin_bytes) { + struct btrfs_block_group *bg; + + bg = btrfs_lookup_block_group(fs_info, head->bytenr); + if (WARN_ON_ONCE(bg == NULL)) { + /* + * Unexpected and there's nothing we can do here + * because we are in a transaction abort path, + * so any errors can only be ignored or reported + * while attempting to cleanup all resources. 
+ */ + btrfs_err(fs_info, +"block group for delayed ref at %llu was not found while destroying ref head", + head->bytenr); + } else { + spin_lock(&bg->space_info->lock); + spin_lock(&bg->lock); + bg->pinned += head->num_bytes; + btrfs_space_info_update_bytes_pinned(bg->space_info, + head->num_bytes); + bg->reserved -= head->num_bytes; + bg->space_info->bytes_reserved -= head->num_bytes; + spin_unlock(&bg->lock); + spin_unlock(&bg->space_info->lock); + + btrfs_put_block_group(bg); + } + + btrfs_error_unpin_extent_range(fs_info, head->bytenr, + head->bytenr + head->num_bytes - 1); + } + if (!testing) + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + btrfs_put_delayed_ref_head(head); + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + if (!testing) + btrfs_qgroup_destroy_extent_records(trans); + + spin_unlock(&delayed_refs->lock); +} + void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 085f30968aba..a35067cebb97 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the - * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + * refs rbtree in the corresponding delayed ref head + * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; @@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; /* - * For insertion into struct btrfs_delayed_ref_root::href_root. - * Keep it in the same cache line as 'bytenr' for more efficient - * searches in the rbtree. - */ - struct rb_node href_node; - /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ @@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head { bool is_data; bool is_system; bool processing; + /* + * Indicate if it's currently in the data structure that tracks head + * refs (struct btrfs_delayed_ref_root::head_refs). + */ + bool tracked; }; enum btrfs_delayed_ref_flags { @@ -199,30 +199,52 @@ enum btrfs_delayed_ref_flags { }; struct btrfs_delayed_ref_root { - /* head ref rbtree */ - struct rb_root_cached href_root; + /* + * Track head references. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits. This is both to get a more + * dense index space (optimizes xarray structure) and because indexes in + * xarrays are of "unsigned long" type, meaning they are 32 bits wide on + * 32 bits platforms, limiting the extent range to 4G which is too low + * and makes it unusable (truncated index values) on 32 bits platforms. + * Protected by the spinlock 'lock' defined below. + */ + struct xarray head_refs; - /* Track dirty extent records. */ + /* + * Track dirty extent records. + * The keys correspond to the logical address of the extent ("bytenr") + * right shifted by fs_info->sectorsize_bits, for same reasons as above. + */ struct xarray dirty_extents; - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; - - /* how many delayed ref updates we've queued, used by the - * throttling code + /* + * Protects the xarray head_refs, its entries and the following fields: + * num_heads, num_heads_ready, pending_csums and run_delayed_start. 
*/ - atomic_t num_entries; + spinlock_t lock; - /* total number of head nodes in tree */ + /* Total number of head refs, protected by the spinlock 'lock'. */ unsigned long num_heads; - /* total number of head nodes ready for processing */ + /* + * Total number of head refs ready for processing, protected by the + * spinlock 'lock'. + */ unsigned long num_heads_ready; + /* + * Track space reserved for deleting csums of data extents. + * Protected by the spinlock 'lock'. + */ u64 pending_csums; unsigned long flags; + /* + * Track from which bytenr to start searching ref heads. + * Protected by the spinlock 'lock'. + */ u64 run_delayed_start; /* @@ -364,19 +386,23 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } -void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, +void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( + const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); +void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); @@ -391,6 +417,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); +void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 83d5cdd77f29..f86fbea0b3de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -45,7 +45,7 @@ * * - Copy existing extents * - * This happens by re-using scrub facility, as scrub also iterates through + * This happens by reusing scrub facility, as scrub also iterates through * existing extents from commit root. 
* * Location: scrub_write_block_to_dev_replace() from @@ -440,9 +440,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); @@ -641,6 +638,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; down_write(&dev_replace->rwsem); + dev_replace->replace_task = current; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -994,6 +992,7 @@ error: list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); fs_devices->rw_devices++; + dev_replace->replace_task = NULL; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 001c0c2f872c..ccf91de29f80 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; @@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); @@ -93,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -153,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -190,7 +187,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_match_dir_item_name(path, name, name_len); } /* @@ -341,14 +338,13 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root->fs_info, path, - name->name, name->len); + di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ - if (ret > 0) - ret = 0; + if (ret >= 0) + ret = -ENOENT; return ERR_PTR(ret); } @@ -378,8 +374,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. 
*/ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 5f6dfafc91f1..28d69970bc70 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - const struct btrfs_path *path, +struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index bd38df5647e3..8567af46e16f 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); block_start = extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, - &file_extent, false, false) == 1) { + if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -834,7 +833,7 @@ relock: return ret; } - ret = btrfs_write_check(iocb, from, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ad5db619b00..f09db62e61a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -226,7 +226,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -917,8 +917,7 @@ fail: return ERR_PTR(ret); } -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, { struct btrfs_root *log_root; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item; int ret; - log_root = alloc_log_tree(trans, fs_info); + log_root = alloc_log_tree(fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); @@ -1259,6 +1258,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); @@ -1959,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2328,6 +2328,71 @@ out: return ret; } +static int 
validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, + const struct btrfs_super_block *sb) +{ + unsigned int cur = 0; /* Offset inside the sys chunk array */ + /* + * At sb read time, fs_info is not fully initialized. Thus we have + * to use super block sectorsize, which should have been validated. + */ + const u32 sectorsize = btrfs_super_sectorsize(sb); + u32 sys_array_size = btrfs_super_sys_array_size(sb); + + if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + return -EUCLEAN; + } + + while (cur < sys_array_size) { + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + struct btrfs_key key; + u64 type; + u16 num_stripes; + u32 len; + int ret; + + disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur); + len = sizeof(*disk_key); + + if (cur + len > sys_array_size) + goto short_read; + cur += len; + + btrfs_disk_key_to_cpu(&key, disk_key); + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + key.type, cur); + return -EUCLEAN; + } + chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size) + goto short_read; + type = btrfs_stack_chunk_type(chunk); + if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur); + return -EUCLEAN; + } + ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset, + sectorsize); + if (ret < 0) + return ret; + cur += btrfs_chunk_item_size(num_stripes); + } + return 0; +short_read: + btrfs_err(fs_info, + "super block sys chunk array short read, cur=%u sys_array_size=%u", + cur, sys_array_size); + return -EUCLEAN; +} + /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. 
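/*
 * Illustrative sketch (editor's note, not part of the patch above): the new
 * validate_sys_chunk_array() walks super_block->sys_chunk_array, which is a
 * packed sequence of (struct btrfs_disk_key, struct btrfs_chunk) pairs whose
 * per-entry size depends on each chunk's stripe count. Assuming the usual
 * btrfs_chunk_item_size() definition, advancing the cursor over one entry
 * amounts to:
 *
 *	cur += sizeof(struct btrfs_disk_key);
 *	chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
 *	num_stripes = btrfs_stack_chunk_num_stripes(chunk);
 *	cur += sizeof(struct btrfs_chunk) +
 *	       (num_stripes - 1) * sizeof(struct btrfs_stripe);
 *
 * which is why each step in the function bounds-checks "cur + len" against
 * btrfs_super_sys_array_size(sb) before dereferencing the next entry.
 */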
@@ -2496,6 +2561,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + ret = validate_sys_chunk_array(fs_info, sb); + /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk @@ -2786,6 +2853,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_scrub(fs_info); btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); + btrfs_init_extent_map_shrinker_work(fs_info); rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; @@ -2852,12 +2920,14 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; - spin_lock_init(&fs_info->extent_map_shrinker_lock); - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL); + if (ret) + return ret; + fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -3202,8 +3272,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) return 0; } -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options) +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { u32 sectorsize; u32 nodesize; @@ -3324,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; + fs_info->fs_devices->fs_info = fs_info; /* * Handle the space caching options appropriately now that we have the @@ -4186,7 +4256,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "transaction %llu (with %llu dirty metadata bytes) is not committed", trans->transid, dirty_bytes); - btrfs_cleanup_one_transaction(trans, fs_info); + btrfs_cleanup_one_transaction(trans); if (trans == fs_info->running_transaction) fs_info->running_transaction = NULL; @@ -4265,6 +4335,15 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * already the cleaner, but below we run all pending delayed iputs. */ btrfs_flush_workqueue(fs_info->fixup_workers); + /* + * Similar case here, we have to wait for delalloc workers before we + * proceed below and stop the cleaner kthread, otherwise we trigger a + * use-after-tree on the cleaner kthread task_struct when a delalloc + * worker running submit_compressed_extents() adds a delayed iput, which + * does a wake up on the cleaner kthread, which was already freed below + * when we call kthread_stop(). 
+ */ + btrfs_flush_workqueue(fs_info->delalloc_workers); /* * After we parked the cleaner kthread, ordered extents may have @@ -4294,6 +4373,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->em_shrinker_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4531,75 +4611,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } -static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; - struct btrfs_delayed_ref_node *ref; - - spin_lock(&delayed_refs->lock); - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { - struct btrfs_delayed_ref_head *head; - struct rb_node *n; - bool pin_bytes = false; - - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_lock(delayed_refs, head)) - continue; - - spin_lock(&head->lock); - while ((n = rb_first_cached(&head->ref_tree)) != NULL) { - ref = rb_entry(n, struct btrfs_delayed_ref_node, - ref_node); - rb_erase_cached(&ref->ref_node, &head->ref_tree); - RB_CLEAR_NODE(&ref->ref_node); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - atomic_dec(&delayed_refs->num_entries); - btrfs_put_delayed_ref(ref); - btrfs_delayed_refs_rsv_release(fs_info, 1, 0); - } - if (head->must_insert_reserved) - pin_bytes = true; - btrfs_free_delayed_extent_op(head->extent_op); - btrfs_delete_ref_head(delayed_refs, head); - spin_unlock(&head->lock); - spin_unlock(&delayed_refs->lock); - mutex_unlock(&head->mutex); - - if (pin_bytes) { - struct btrfs_block_group *cache; - - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - BUG_ON(!cache); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += head->num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, - cache->space_info, head->num_bytes); - cache->reserved -= head->num_bytes; - cache->space_info->bytes_reserved -= head->num_bytes; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - btrfs_put_block_group(cache); - - btrfs_error_unpin_extent_range(fs_info, head->bytenr, - head->bytenr + head->num_bytes - 1); - } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); - btrfs_put_delayed_ref_head(head); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - btrfs_qgroup_destroy_extent_records(trans); - - spin_unlock(&delayed_refs->lock); -} - static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; @@ -4805,9 +4816,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->fs_roots_radix_lock); } -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) { + struct btrfs_fs_info *fs_info = cur_trans->fs_info; struct btrfs_device *dev, *tmp; btrfs_cleanup_dirty_bgs(cur_trans, fs_info); @@ -4819,7 +4830,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, list_del_init(&dev->post_commit_list); } - btrfs_destroy_delayed_refs(cur_trans, fs_info); + btrfs_destroy_delayed_refs(cur_trans); cur_trans->state = 
TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); @@ -4865,7 +4876,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } else { spin_unlock(&fs_info->trans_lock); } - btrfs_cleanup_one_transaction(t, fs_info); + btrfs_cleanup_one_transaction(t); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 99af64d3f277..587842991b24 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); -int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, - const char *options); +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); @@ -97,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. - * - * If you want to ensure the whole tree is safe, you should use - * fs_info->subvol_srcu */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { @@ -127,8 +123,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); +void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5966324607d..3014a1a23efd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -182,7 +182,7 @@ search_again: delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -570,7 +570,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -618,7 +617,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -795,7 +793,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; - path->keep_locks = 1; } else extra_size = -1; @@ -946,6 +943,25 @@ again: ret = -EAGAIN; goto out; } + + if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key tmp_key; + + btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); + if (tmp_key.objectid == bytenr && + tmp_key.type < 
BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = -EAGAIN; + goto out; + } + goto out_no_entry; + } + + if (!path->keep_locks) { + btrfs_release_path(path); + path->keep_locks = 1; + goto again; + } + /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. @@ -959,13 +975,15 @@ again: goto out; } } +out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert) { + if (path->keep_locks) { path->keep_locks = 0; - path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } + if (insert) + path->search_for_extension = 0; return ret; } @@ -1030,7 +1048,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1175,7 +1192,6 @@ static noinline_for_stack int update_inline_extent_backref( item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1240,12 +1256,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); + u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; - len = round_down(len, 1 << SECTOR_SHIFT); + len = round_down(len, SECTOR_SIZE); start = aligned_start; } @@ -1300,13 +1316,29 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, bytes_left = end - start; } - if (bytes_left) { + while (bytes_left) { + u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - bytes_left >> SECTOR_SHIFT, + bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); - if (!ret) - *discarded_bytes += bytes_left; + + if (ret) { + if (ret != -EOPNOTSUPP) + break; + continue; + } + + start += bytes_to_discard; + bytes_left -= bytes_to_discard; + *discarded_bytes += bytes_to_discard; + + if (btrfs_trim_interrupted()) { + ret = -ERESTARTSYS; + break; + } } + return ret; } @@ -1491,7 +1523,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ @@ -1675,8 +1706,6 @@ again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1767,40 +1796,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static inline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_node *ref; - - if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) - return NULL; - - /* - * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. - * This is to prevent a ref count from going down to zero, which deletes - * the extent item from the extent tree, when there still are references - * to add, which would fail because they would not find the extent item. 
- */ - if (!list_empty(&head->ref_add_list)) - return list_first_entry(&head->ref_add_list, - struct btrfs_delayed_ref_node, add_list); - - ref = rb_entry(rb_first_cached(&head->ref_tree), - struct btrfs_delayed_ref_node, ref_node); - ASSERT(list_empty(&ref->add_list)); - return ref; -} - -static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - spin_lock(&delayed_refs->lock); - head->processing = false; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(head); -} - static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { @@ -1875,7 +1870,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { - unselect_delayed_ref_head(delayed_refs, head); + btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { @@ -1894,7 +1889,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -1917,39 +1912,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, return ret; } -static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( - struct btrfs_trans_handle *trans) -{ - struct btrfs_delayed_ref_root *delayed_refs = - &trans->transaction->delayed_refs; - struct btrfs_delayed_ref_head *head = NULL; - int ret; - - spin_lock(&delayed_refs->lock); - head = btrfs_select_ref_head(delayed_refs); - if (!head) { - spin_unlock(&delayed_refs->lock); - return head; - } - - /* - * Grab the lock that says we are going to process all the refs for - * this head - */ - ret = btrfs_delayed_ref_lock(delayed_refs, head); - spin_unlock(&delayed_refs->lock); - - /* - * We may have dropped the spin lock to get the head mutex lock, and - * that might have given someone else time to free the head. If that's - * true, it has been removed from our list and we can move on. 
- */ - if (ret == -EAGAIN) - head = ERR_PTR(-EAGAIN); - - return head; -} - static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) @@ -1966,11 +1928,11 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); - while ((ref = select_delayed_ref(locked_ref))) { + while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } @@ -1993,7 +1955,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - atomic_dec(&delayed_refs->num_entries); /* * Record the must_insert_reserved flag before we drop the @@ -2019,7 +1980,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - unselect_delayed_ref_head(delayed_refs, locked_ref); + btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } @@ -2057,7 +2018,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, do { if (!locked_ref) { - locked_ref = btrfs_obtain_ref_head(trans); + locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; @@ -2204,7 +2165,7 @@ again: btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { + if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } @@ -2238,10 +2199,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return ret; } -static noinline int check_delayed_ref(struct btrfs_root *root, +static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -2259,7 +2221,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); @@ -2315,7 +2277,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, * then we have a cross reference. */ if (ref->ref_root != btrfs_root_id(root) || - ref_owner != objectid || ref_offset != offset) { + ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } @@ -2326,11 +2288,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root, return ret; } -static noinline int check_committed_ref(struct btrfs_root *root, +/* + * Check if there are references for a data extent other than the one belonging + * to the given inode and offset. + * + * @inode: The only inode we expect to find associated with the data extent. + * @path: A path to use for searching the extent tree. + * @offset: The only offset we expect to find associated with the data extent. + * @bytenr: The logical address of the data extent. 
+ * + * When the extent does not have any other references other than the one we + * expect to find, we always return a value of 0 with the path having a locked + * leaf that contains the extent's extent item - this is necessary to ensure + * we don't race with a task running delayed references, and our caller must + * have such a path when calling check_delayed_ref() - it must lock a delayed + * ref head while holding the leaf locked. In case the extent item is not found + * in the extent tree, we return -ENOENT with the path having the leaf (locked) + * where the extent item should be, in order to prevent races with another task + * running delayed references, so that we don't miss any reference when calling + * check_delayed_ref(). + * + * Note: this may return false positives, and this is because we want to be + * quick here as we're called in write paths (when flushing delalloc and + * in the direct IO write path). For example we can have an extent with + * a single reference but that reference is not inlined, or we may have + * many references in the extent tree but we also have delayed references + * that cancel all the reference except the one for our inode and offset, + * but it would be expensive to do such checks and complex due to all + * locking to avoid races between the checks and flushing delayed refs, + * plus non-inline references may be located on leaves other than the one + * that contains the extent item in the extent tree. The important thing + * here is to not return false negatives and that the false positives are + * not very common. + * + * Returns: 0 if there are no cross references and with the path having a locked + * leaf from the extent tree that contains the extent's extent item. + * + * 1 if there are cross references (false positives can happen). + * + * < 0 in case of an error. In case of -ENOENT the leaf in the extent + * tree where the extent item should be located at is read locked and + * accessible in the given path. + */ +static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr, - bool strict) + u64 offset, u64 bytenr) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; @@ -2349,35 +2353,32 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = -ENOENT; if (path->slots[0] == 0) - goto out; + return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; + return -ENOENT; - ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) - goto out; + return 1; /* Check for an owner ref; skip over it to the real inline refs. 
*/ iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2385,56 +2386,69 @@ static noinline int check_committed_ref(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) - goto out; - - /* - * If extent created before last snapshot => it's shared unless the - * snapshot has been deleted. Use the heuristic if strict is false. - */ - if (!strict && - (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item))) - goto out; + return 1; /* If this extent has SHARED_DATA_REF then it's shared */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) - goto out; + return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; + return 1; - ret = 0; -out: - return ret; + return 0; } -int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict, struct btrfs_path *path) +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, + u64 bytenr, struct btrfs_path *path) { int ret; do { - ret = check_committed_ref(root, path, objectid, - offset, bytenr, strict); + ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; - ret = check_delayed_ref(root, path, objectid, offset, bytenr); - } while (ret == -EAGAIN); + /* + * The path must have a locked leaf from the extent tree where + * the extent item for our extent is located, in case it exists, + * or where it should be located in case it doesn't exist yet + * because it's new and its delayed ref was not yet flushed. + * We need to lock the delayed ref head at check_delayed_ref(), + * if one exists, while holding the leaf locked in order to not + * race with delayed ref flushing, missing references and + * incorrectly reporting that the extent is not shared. 
+ */ + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + struct extent_buffer *leaf = path->nodes[0]; + + ASSERT(leaf != NULL); + btrfs_assert_tree_read_locked(leaf); + + if (ret != -ENOENT) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.objectid == bytenr); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); + } + } + + ret = check_delayed_ref(inode, path, offset, bytenr); + } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); - if (btrfs_is_data_reloc_root(root)) + if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } @@ -2579,13 +2593,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = cache->fs_info; - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, - num_bytes); + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -2732,15 +2743,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; - u64 len; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { + u64 len; + readonly = false; if (!cache || start >= cache->start + cache->length) { @@ -2786,37 +2797,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ - btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, - len); + btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); - if (!readonly && return_free_space && - global_rsv->space_info == space_info) { - spin_lock(&global_rsv->lock); - if (!global_rsv->full) { - u64 to_add = min(len, global_rsv->size - - global_rsv->reserved); - - global_rsv->reserved += to_add; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, to_add); - if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; - len -= to_add; - } - spin_unlock(&global_rsv->lock); - } - /* Add to any tickets we may have */ - if (!readonly && return_free_space && len) - btrfs_try_granting_tickets(fs_info, space_info); + if (!readonly && return_free_space) + btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } @@ -3128,7 +3121,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, break; } - /* Quick path didn't find the EXTEMT/METADATA_ITEM */ + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -3267,7 +3260,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3361,13 +3353,14 @@ out: static noinline int 
check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; @@ -3385,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - btrfs_delete_ref_head(delayed_refs, head); + btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); @@ -3395,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; - btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); + btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -4834,7 +4827,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4909,7 +4901,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -5254,7 +5245,7 @@ struct walk_control { * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, - struct extent_buffer *eb, u64 refs, u64 flags, int slot) + struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; @@ -5292,7 +5283,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control * * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); - if (!wc->update_ref || generation <= root->root_key.offset) + if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* @@ -5347,7 +5338,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, goto reada; if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ @@ -5368,7 +5359,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* If we don't need to visit this node don't reada. 
*/ - if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) + if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); @@ -5502,7 +5493,7 @@ again: */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { @@ -5690,7 +5681,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { + generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } @@ -5721,8 +5712,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], - wc->refs[level - 1], wc->flags[level - 1], - path->slots[level])) + wc->flags[level - 1], path->slots[level])) goto skip; /* @@ -6459,7 +6449,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 2ad51130c037..cfa52264f678 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, +int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -163,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39c9677c47d5..d9f856358704 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) - btrfs_folio_end_writer_lock(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } static void __process_folios_contig(struct address_space *mapping, @@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping, u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; struct folio_batch fbatch; int i; @@ -221,7 +220,7 @@ static void __process_folios_contig(struct address_space *mapping, } } -static noinline void __unlock_for_delalloc(const struct inode 
*inode, +static noinline void unlock_delalloc_folio(const struct inode *inode, const struct folio *locked_folio, u64 start, u64 end) { @@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; - pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; - pgoff_t index = start_index; u64 processed_end = start; struct folio_batch fbatch; @@ -262,22 +260,23 @@ static noinline int lock_delalloc_folios(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - u32 len = end + 1 - start; + u64 range_start; + u32 range_len; if (folio == locked_folio) continue; - if (btrfs_folio_start_writer_lock(fs_info, folio, start, - len)) - goto out; - + folio_lock(folio); if (!folio_test_dirty(folio) || folio->mapping != mapping) { - btrfs_folio_end_writer_lock(fs_info, folio, start, - len); + folio_unlock(folio); goto out; } + range_start = max_t(u64, folio_pos(folio), start); + range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + end + 1) - range_start; + btrfs_folio_set_lock(fs_info, folio, range_start, range_len); - processed_end = folio_pos(folio) + folio_size(folio) - 1; + processed_end = range_start + range_len - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -287,8 +286,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_folio, start, - processed_end); + unlock_delalloc_folio(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -389,7 +387,7 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_folio, delalloc_start, + unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -437,7 +435,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le if (!btrfs_is_subpage(fs_info, folio->mapping)) folio_unlock(folio); else - btrfs_subpage_end_reader(fs_info, folio, start, len); + btrfs_folio_end_lock(fs_info, folio, start, len); } /* @@ -494,7 +492,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -709,6 +707,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; bbio->inode = inode; bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; @@ -785,7 +784,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -861,11 +860,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, return ret; } -int set_page_extent_mapped(struct page *page) -{ - return set_folio_extent_mapped(page_folio(page)); -} - int set_folio_extent_mapped(struct folio *folio) { struct btrfs_fs_info *fs_info; @@ -900,9 +894,9 @@ void clear_folio_extent_mapped(struct folio *folio) folio_detach_private(folio); } 
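/*
 * Worked example (editor's note, not part of the patch above): with the
 * lock_delalloc_folios() change earlier in this hunk, only the part of each
 * folio that intersects [start, end] gets its subpage lock bits set. Assuming
 * the typical subpage setup (64K folio, 4K sectors), a folio at file offset 0
 * and a delalloc range of start = 12288, end = 45055 gives:
 *
 *	range_start = max_t(u64, folio_pos(folio), start)
 *		    = max(0, 12288) = 12288
 *	range_len   = min_t(u64, folio_pos(folio) + folio_size(folio), end + 1)
 *			- range_start
 *		    = min(65536, 45056) - 12288 = 32768
 *
 * so btrfs_folio_set_lock() covers bytes 12288..45055 of that folio (eight 4K
 * sectors) and processed_end advances to 45055.
 */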
-static struct extent_map *__get_extent_map(struct inode *inode, - struct folio *folio, u64 start, - u64 len, struct extent_map **em_cached) +static struct extent_map *get_extent_map(struct btrfs_inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; struct extent_state *cached_state = NULL; @@ -921,14 +915,14 @@ static struct extent_map *__get_extent_map(struct inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); - em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); + btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); + em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -984,8 +978,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, folio, cur, end - cur + 1, - em_cached); + em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); @@ -1101,15 +1094,59 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } +static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, + u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + unsigned int start_bit; + unsigned int nbits; + + ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + nbits = len >> fs_info->sectorsize_bits; + ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); + bitmap_set(delalloc_bitmap, start_bit, nbits); +} + +static bool find_next_delalloc_bitmap(struct folio *folio, + unsigned long *delalloc_bitmap, u64 start, + u64 *found_start, u32 *found_len) +{ + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + const u64 folio_start = folio_pos(folio); + const unsigned int bitmap_size = fs_info->sectors_per_page; + unsigned int start_bit; + unsigned int first_zero; + unsigned int first_set; + + ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + + start_bit = (start - folio_start) >> fs_info->sectorsize_bits; + first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); + if (first_set >= bitmap_size) + return false; + + *found_start = folio_start + (first_set << fs_info->sectorsize_bits); + first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); + *found_len = (first_zero - first_set) << fs_info->sectorsize_bits; + return true; +} + /* - * helper for extent_writepage(), doing all of the delayed allocation setup. + * Do all of the delayed allocation setup. + * + * Return >0 if all the dirty blocks are submitted async (compression) or inlined. + * The @folio should no longer be touched (treat it as already unlocked). * - * This returns 1 if btrfs_run_delalloc_range function did all the work required - * to write the page (copy into inline extent). In this case the IO has - * been started and the page is already unlocked. + * Return 0 if there is still dirty block that needs to be submitted through + * extent_writepage_io(). 
+ * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be + * submitted, and @folio is still kept locked. - * - * This returns 0 if all went well (page still locked) - * This returns < 0 if there were errors (page still locked) + * Return <0 if any error is hit. + * Any allocated ordered extent range covering this folio will be marked + * finished (IOERR), and @folio is still kept locked. */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, @@ -1120,16 +1157,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the * last delalloc end. */ u64 last_delalloc_end = 0; + /* + * The range end (exclusive) of the last successfully finished delalloc + * range. + * Any range covered by an ordered extent must either be manually marked + * finished (error handling), or have IO submitted (which finishes the + * ordered extent normally). + * + * This records the end of ordered extent cleanup if we hit an error. + */ + u64 last_finished_delalloc_end = page_start; u64 delalloc_start = page_start; u64 delalloc_end = page_end; u64 delalloc_to_write = 0; int ret = 0; + int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { @@ -1139,6 +1188,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, bio_ctrl->submit_bitmap = 1; } + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + u64 start = page_start + (bit << fs_info->sectorsize_bits); + + btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1147,9 +1202,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = delalloc_end + 1; continue; } - btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, - min(delalloc_end, page_end) + 1 - - delalloc_start); + set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, + min(delalloc_end, page_end) + 1 - delalloc_start); last_delalloc_end = delalloc_end; delalloc_start = delalloc_end + 1; } @@ -1174,7 +1228,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; found = true; } else { - found = btrfs_subpage_find_writer_locked(fs_info, folio, + found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, &found_start, &found_len); } if (!found) @@ -1188,11 +1242,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, found_len = last_delalloc_end + 1 - found_start; if (ret >= 0) { + /* + * Some delalloc range may be created by previous folios. + * Thus we still need to clean up this range during error + * handling. + */ + last_finished_delalloc_end = found_start; /* No errors hit so far, run the current delalloc range.
*/ ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); + if (ret >= 0) + last_finished_delalloc_end = found_start + found_len; + if (unlikely(ret < 0)) + btrfs_err_rl(fs_info, +"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d", + btrfs_root_id(inode->root), + btrfs_ino(inode), + folio_pos(folio), + fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, + found_start, found_len, ret); } else { /* * We've hit an error during previous delalloc range, @@ -1200,7 +1271,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, folio, + unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } @@ -1227,8 +1298,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, delalloc_start = found_start + found_len; } - if (ret < 0) + /* + * It's possible we had some ordered extents created before we hit + * an error, cleanup non-async successfully created delalloc ranges. + */ + if (unlikely(ret < 0)) { + unsigned int bitmap_size = min( + (last_finished_delalloc_end - page_start) >> + fs_info->sectorsize_bits, + fs_info->sectors_per_page); + + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) + btrfs_mark_ordered_io_finished(inode, folio, + page_start + (bit << fs_info->sectorsize_bits), + fs_info->sectorsize, false); return ret; + } out: if (last_delalloc_end) delalloc_end = last_delalloc_end; @@ -1288,7 +1373,7 @@ static int submit_one_sector(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, filepos, sectorsize); if (IS_ERR(em)) - return PTR_ERR_OR_ZERO(em); + return PTR_ERR(em); extent_offset = filepos - em->start; em_end = extent_map_end(em); @@ -1306,7 +1391,14 @@ static int submit_one_sector(struct btrfs_inode *inode, free_extent_map(em); em = NULL; - btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * a folio for a range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); /* * Above call should set the whole folio with writeback flag, even * just for a single subpage sector. @@ -1315,13 +1407,6 @@ static int submit_one_sector(struct btrfs_inode *inode, */ ASSERT(folio_test_writeback(folio)); - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * folio for range already written to disk. 
- */ - btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); submit_extent_folio(bio_ctrl, disk_bytenr, folio, sectorsize, filepos - folio_pos(folio)); return 0; @@ -1344,6 +1429,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; bool submitted_io = false; + bool error = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; @@ -1386,13 +1472,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) { + /* + * bio_ctrl may contain a bio crossing several folios. + * Submit it immediately so that the bio has a chance + * to finish normally, other than marked as error. + */ + submit_one_bio(bio_ctrl); + /* + * Failed to grab the extent map which should be very rare. + * Since there is no bio submitted to finish the ordered + * extent, we have to manually finish this sector. + */ + btrfs_mark_ordered_io_finished(inode, folio, cur, + fs_info->sectorsize, false); + error = true; + continue; + } submitted_io = true; } - btrfs_folio_assert_not_dirty(fs_info, folio, start, len); -out: /* * If we didn't submitted any sector (>= i_size), folio dirty get * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared @@ -1400,8 +1499,11 @@ out: * * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. + * + * If we hit any error, the corresponding sector will still be dirty + * thus no need to clear PAGECACHE_TAG_DIRTY. */ - if (!submitted_io) { + if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } @@ -1419,15 +1521,14 @@ out: */ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct inode *inode = folio->mapping->host; - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - const u64 page_start = folio_pos(folio); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; size_t pg_offset; - loff_t i_size = i_size_read(inode); + loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace_extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1451,41 +1552,37 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); + ret = writepage_delalloc(inode, folio, bio_ctrl); if (ret == 1) return 0; if (ret) goto done; - ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + ret = extent_writepage_io(inode, folio, folio_pos(folio), PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; + if (ret < 0) + btrfs_err_rl(fs_info, +"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", + btrfs_root_id(inode->root), btrfs_ino(inode), + folio_pos(folio), fs_info->sectors_per_page, + &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; done: - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - page_start, PAGE_SIZE, !ret); + if (ret < 0) mapping_set_error(folio->mapping, ret); - } - /* * Only unlock ranges that are submitted. 
As there can be some async * submitted ranges inside the folio. */ - btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); + btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } -void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, - TASK_UNINTERRUPTIBLE); -} - /* * Lock extent buffer status and pages for writeback. * @@ -1626,11 +1723,10 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - bool uptodate = !bbio->bio.bi_status; struct folio_iter fi; u32 bio_offset = 0; - if (!uptodate) + if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { @@ -1707,7 +1803,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1721,8 +1817,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -2115,7 +2210,27 @@ retry: continue; } - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * For subpage case, compression can lead to mixed + * writeback and dirty flags, e.g: + * 0 32K 64K 96K 128K + * | |//////||/////| |//| + * + * In above case, [32K, 96K) is asynchronously submitted + * for compression, and [124K, 128K) needs to be written back. + * + * If we didn't wait for writeback on page 64K, [128K, 128K) + * won't be submitted as the page still has writeback flag + * and will be skipped in the next check. + * + * This mixed writeback and dirty case is only possible for + * subpage case. + * + * TODO: Remove this check after migrating compression to + * regular submission. + */ + if (wbc->sync_mode != WB_SYNC_NONE || + btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2200,7 +2315,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u32 cur_len = cur_end + 1 - cur; struct folio *folio; - folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); + folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); /* * This shouldn't happen, the pages are pinned and locked, this @@ -2228,12 +2343,9 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (ret == 1) goto next_page; - if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, - cur, cur_len, !ret); + if (ret) mapping_set_error(mapping, ret); - } - btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); + btrfs_folio_end_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2317,7 +2429,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page.
*/ static bool try_release_extent_state(struct extent_io_tree *tree, - struct folio *folio, gfp_t mask) + struct folio *folio) { u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; @@ -2428,12 +2540,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, folio, mask); -} - -static void __free_extent_buffer(struct extent_buffer *eb) -{ - kmem_cache_free(extent_buffer_cache, eb); + return try_release_extent_state(io_tree, folio); } static int extent_buffer_under_io(const struct extent_buffer *eb) @@ -2442,7 +2549,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) +static bool folio_range_has_eb(struct folio *folio) { struct btrfs_subpage *subpage; @@ -2452,12 +2559,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; - /* - * Even there is no eb refs here, we may still have - * end_folio_read() call relying on page::private. - */ - if (atomic_read(&subpage->readers)) - return true; } return false; } @@ -2516,14 +2617,14 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!folio_range_has_eb(fs_info, folio)) + if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio); spin_unlock(&folio->mapping->i_private_lock); } -/* Release all pages attached to the extent buffer */ -static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) +/* Release all folios attached to the extent buffer */ +static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) { ASSERT(!extent_buffer_under_io(eb)); @@ -2545,9 +2646,9 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb) */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_pages(eb); + btrfs_release_extent_buffer_folios(eb); btrfs_leak_debug_del_eb(eb); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static struct extent_buffer * @@ -2645,7 +2746,7 @@ err: folio_put(eb->folios[i]); } } - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return NULL; } @@ -2772,13 +2873,12 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer( - struct btrfs_fs_info *fs_info, struct page *page) +static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, + struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *exists; - lockdep_assert_held(&page->mapping->i_private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); /* * For subpage case, we completely rely on radix tree to ensure we @@ -2793,7 +2893,7 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* - * We could have already allocated an eb for this page and attached one + * We could have already allocated an eb for this folio and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can * just overwrite folio private. 
@@ -2802,16 +2902,19 @@ static struct extent_buffer *grab_extent_buffer( if (atomic_inc_not_zero(&exists->refs)) return exists; - WARN_ON(PageDirty(page)); + WARN_ON(folio_test_dirty(folio)); folio_detach_private(folio); return NULL; } -static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +/* + * Validate alignment constraints of eb at logical address @start. + */ +static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) { if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); - return -EINVAL; + return true; } if (fs_info->nodesize < PAGE_SIZE && @@ -2819,14 +2922,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) btrfs_err(fs_info, "tree block crosses page boundary, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (fs_info->nodesize >= PAGE_SIZE && !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); - return -EINVAL; + return true; } if (!IS_ALIGNED(start, fs_info->nodesize) && !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { @@ -2834,10 +2937,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) "tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", start, fs_info->nodesize); } - return 0; + return false; } - /* * Return 0 if eb->folios[i] is attached to btree inode successfully. * Return >0 if there is already another extent buffer for the range, @@ -2893,8 +2995,7 @@ finish: } else if (existing_folio) { struct extent_buffer *existing_eb; - existing_eb = grab_extent_buffer(fs_info, - folio_page(existing_folio, 0)); + existing_eb = grab_extent_buffer(fs_info, existing_folio); if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; @@ -3091,7 +3192,7 @@ again: * live buffer and won't free them prematurely. */ for (int i = 0; i < num_folios; i++) - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); return eb; out: @@ -3115,13 +3216,13 @@ out: for (int i = 0; i < attached; i++) { ASSERT(eb->folios[i]); detach_extent_buffer_folio(eb, eb->folios[i]); - unlock_page(folio_page(eb->folios[i], 0)); + folio_unlock(eb->folios[i]); folio_put(eb->folios[i]); eb->folios[i] = NULL; } /* * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utlizing page->mapping. + * so it can be cleaned up without utilizing page->mapping. */ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -3137,7 +3238,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) struct extent_buffer *eb = container_of(head, struct extent_buffer, rcu_head); - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); } static int release_extent_buffer(struct extent_buffer *eb) @@ -3161,11 +3262,11 @@ static int release_extent_buffer(struct extent_buffer *eb) } btrfs_leak_debug_del_eb(eb); - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_pages(eb); + /* Should be safe to release folios at this point. */ + btrfs_release_extent_buffer_folios(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { - __free_extent_buffer(eb); + kmem_cache_free(extent_buffer_cache, eb); return 1; } #endif @@ -3324,12 +3425,12 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. 
*/ if (subpage) - lock_page(folio_page(eb->folios[0], 0)); + folio_lock(eb->folios[0]); for (int i = 0; i < num_folios; i++) btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], eb->start, eb->len); if (subpage) - unlock_page(folio_page(eb->folios[0], 0)); + folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); @@ -3439,8 +3540,8 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_put(&bbio->bio); } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - const struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; bool ret; @@ -3458,7 +3559,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, /* Someone else is already reading the buffer, just wait for it. */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) - goto done; + return 0; /* * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above @@ -3498,14 +3599,21 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } } btrfs_submit_bbio(bbio, mirror_num); + return 0; +} -done: - if (wait == WAIT_COMPLETE) { - wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - } +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check *check) +{ + int ret; + ret = read_extent_buffer_pages_nowait(eb, mirror_num, check); + if (ret < 0) + return ret; + + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return -EIO; return 0; } @@ -4221,7 +4329,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { struct btrfs_tree_parent_check check = { - .has_first_key = 0, .level = level, .transid = gen }; @@ -4237,7 +4344,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); + ret = read_extent_buffer_pages_nowait(eb, 0, &check); if (ret < 0) free_extent_buffer_stale(eb); else diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8a36117ed453..6c5328bfabc2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,7 +248,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); -int set_page_extent_mapped(struct page *page); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -262,12 +261,17 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *parent_check); -void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, + const struct btrfs_tree_parent_check 
*parent_check); + +static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +{ + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); +} + void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 25d191f1ac10..67ce85ff0ae2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) return start + len; } -static void dec_evictable_extent_maps(struct btrfs_inode *inode) +static void remove_em(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + rb_erase(&em->rb_node, &inode->extent_tree.root); + RB_CLEAR_NODE(&em->rb_node); + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -230,7 +233,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma if (extent_map_end(prev) != next->start) return false; - if (prev->flags != next->flags) + /* + * The merged flag is not an on-disk flag, it just indicates we had the + * extent maps of 2 (or more) adjacent extents merged, so factor it out. + */ + if ((prev->flags & ~EXTENT_FLAG_MERGED) != + (next->flags & ~EXTENT_FLAG_MERGED)) return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) @@ -243,13 +251,19 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma /* * Handle the on-disk data extents merge for @prev and @next. * + * @prev: left extent to merge + * @next: right extent to merge + * @merged: the extent we will not discard after the merge; updated with new values + * + * After this, one of the two extents is the new merged extent and the other is + * removed from the tree and likely freed. Note that @merged is one of @prev/@next + * so there is const/non-const aliasing occurring here. + * * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes. * For now only uncompressed regular extent can be merged. - * - * @prev and @next will be both updated to point to the new merged range. - * Thus one of them should be removed by the caller. 
*/ -static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next) +static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next, + struct extent_map *merged) { u64 new_disk_bytenr; u64 new_disk_num_bytes; @@ -284,15 +298,10 @@ static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *nex new_disk_bytenr; new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr; - prev->disk_bytenr = new_disk_bytenr; - prev->disk_num_bytes = new_disk_num_bytes; - prev->ram_bytes = new_disk_num_bytes; - prev->offset = new_offset; - - next->disk_bytenr = new_disk_bytenr; - next->disk_num_bytes = new_disk_num_bytes; - next->ram_bytes = new_disk_num_bytes; - next->offset = new_offset; + merged->disk_bytenr = new_disk_bytenr; + merged->disk_num_bytes = new_disk_num_bytes; + merged->ram_bytes = new_disk_num_bytes; + merged->offset = new_offset; } static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix, @@ -333,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -361,14 +369,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(merge, em); + merge_ondisk_extents(merge, em, em); em->flags |= EXTENT_FLAG_MERGED; validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -378,14 +384,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - merge_ondisk_extents(em, merge); + merge_ondisk_extents(em, merge, em); validate_extent_map(fs_info, em); - rb_erase(&merge->rb_node, &tree->root); - RB_CLEAR_NODE(&merge->rb_node); em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; + remove_em(inode, merge); free_extent_map(merge); - dec_evictable_extent_maps(inode); } } @@ -582,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); - rb_erase(&em->rb_node, &tree->root); if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - RB_CLEAR_NODE(&em->rb_node); - dec_evictable_extent_maps(inode); + remove_em(inode, em); } static void replace_extent_mapping(struct btrfs_inode *inode, @@ -1116,13 +1118,12 @@ out_free_pre: struct btrfs_em_shrink_ctx { long nr_to_scan; long scanned; - u64 last_ino; - u64 last_root; }; static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) { - const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); struct extent_map_tree *tree = &inode->extent_tree; long nr_dropped = 0; struct rb_node *node; @@ -1195,7 +1196,8 @@ next: * lock. This is to avoid slowing other tasks trying to take the * lock. 
*/ - if (need_resched() || rwlock_needbreak(&tree->lock)) + if (need_resched() || rwlock_needbreak(&tree->lock) || + btrfs_fs_closing(fs_info)) break; node = next; } @@ -1207,19 +1209,21 @@ next: static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_inode *inode; long nr_dropped = 0; - u64 min_ino = ctx->last_ino + 1; + u64 min_ino = fs_info->em_shrinker_last_ino + 1; inode = btrfs_find_first_inode(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); min_ino = btrfs_ino(inode) + 1; - ctx->last_ino = btrfs_ino(inode); + fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan) + if (ctx->scanned >= ctx->nr_to_scan || + btrfs_fs_closing(inode->root->fs_info)) break; cond_resched(); @@ -1235,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx * inode if there is one or we will find out this was the last * one and move to the next root. */ - ctx->last_root = btrfs_root_id(root); + fs_info->em_shrinker_last_root = btrfs_root_id(root); } else { /* * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * that when processing the next root we start from its first inode. */ - ctx->last_ino = 0; - ctx->last_root = btrfs_root_id(root) + 1; + fs_info->em_shrinker_last_ino = 0; + fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; } return nr_dropped; } -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +static void btrfs_extent_map_shrinker_worker(struct work_struct *work) { + struct btrfs_fs_info *fs_info; struct btrfs_em_shrink_ctx ctx; u64 start_root_id; u64 next_root_id; bool cycled = false; long nr_dropped = 0; - ctx.scanned = 0; - ctx.nr_to_scan = nr_to_scan; + fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); - /* - * In case we have multiple tasks running this shrinker, make the next - * one start from the next inode in case it starts before we finish. 
- */ - spin_lock(&fs_info->extent_map_shrinker_lock); - ctx.last_ino = fs_info->extent_map_shrinker_last_ino; - fs_info->extent_map_shrinker_last_ino++; - ctx.last_root = fs_info->extent_map_shrinker_last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); + ctx.scanned = 0; + ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); - start_root_id = ctx.last_root; - next_root_id = ctx.last_root; + start_root_id = fs_info->em_shrinker_last_root; + next_root_id = fs_info->em_shrinker_last_root; if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); } - while (ctx.scanned < ctx.nr_to_scan) { + while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { struct btrfs_root *root; unsigned long count; @@ -1294,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) spin_unlock(&fs_info->fs_roots_radix_lock); if (start_root_id > 0 && !cycled) { next_root_id = 0; - ctx.last_root = 0; - ctx.last_ino = 0; + fs_info->em_shrinker_last_root = 0; + fs_info->em_shrinker_last_ino = 0; cycled = true; continue; } @@ -1314,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) btrfs_put_root(root); } - /* - * In case of multiple tasks running this extent map shrinking code this - * isn't perfect but it's simple and silences things like KCSAN. It's - * not possible to know which task made more progress because we can - * cycle back to the first root and first inode if it's not the first - * time the shrinker ran, see the above logic. Also a task that started - * later may finish ealier than another task and made less progress. So - * make this simple and update to the progress of the last task that - * finished, with the occasional possiblity of having two consecutive - * runs of the shrinker process the same inodes. - */ - spin_lock(&fs_info->extent_map_shrinker_lock); - fs_info->extent_map_shrinker_last_ino = ctx.last_ino; - fs_info->extent_map_shrinker_last_root = ctx.last_root; - spin_unlock(&fs_info->extent_map_shrinker_lock); - if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); - trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, - nr, ctx.last_root, - ctx.last_ino); + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); } - return nr_dropped; + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); +} + +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + /* + * Do nothing if the shrinker is already running. In case of high memory + * pressure we can have a lot of tasks calling us and all passing the + * same nr_to_scan value, but in reality we may need only to free + * nr_to_scan extent maps (or less). In case we need to free more than + * that, we will be called again by the fs shrinker, so no worries about + * not doing enough work to reclaim memory from extent maps. + * We can also be repeatedly called with the same nr_to_scan value + * simply because the shrinker runs asynchronously and multiple calls + * to this function are made before the shrinker does enough progress. + * + * That's why we set the atomic counter to nr_to_scan only if its + * current value is zero, instead of incrementing the counter by + * nr_to_scan. 
+ */ + if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) + return; + + queue_work(system_unbound_wq, &fs_info->em_shrinker_work); +} + +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) +{ + atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); + INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 5154a8f1d26c..cd123b266b64 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); -long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); +void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index df7f09f3b02e..b80c07ad8c5e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * we have in the cache is the last delalloc range we * found while the file extent item we found can be * either for a whole delalloc range we previously - * emmitted or only a part of that range. + * emitted or only a part of that range. * * We have two cases here: * @@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * cached extent's end. In this case just ignore the * current file extent item because we don't want to * overlap with previous ranges that may have been - * emmitted already; + * emitted already; * * 2) The file extent item starts behind the currently * cached extent but its end offset goes beyond the * end offset of the cached extent. We don't want to * overlap with a previous range that may have been - * emmitted already, so we emit the currently cached + * emitted already, so we emit the currently cached * extent and then partially store the current file * extent item's range in the cache, for the subrange * going the cached extent's end to the end of the diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 886749b39672..d04a3b47b1fb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -190,8 +190,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -1259,7 +1257,6 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b06..36f51c311bb1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,100 +36,42 @@ #include "ioctl.h" #include "file.h" #include "super.h" - -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. 
- */ -static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int pg = 0; - int offset = offset_in_page(pos); - - while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; - /* - * Copy data from userspace to the current page - */ - copied = copy_page_from_iter_atomic(page, offset, count, i); - - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - - /* - * if we get a partial write, we can end up with - * partially up to date pages. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (unlikely(copied < count)) { - if (!PageUptodate(page)) { - iov_iter_revert(i, copied); - copied = 0; - } - if (!copied) - break; - } - - write_bytes -= copied; - total_copied += copied; - offset += copied; - if (offset == PAGE_SIZE) { - pg++; - offset = 0; - } - } - return total_copied; -} +#include "print-tree.h" /* - * unlocks pages after btrfs_file_write is done with them + * Unlock folio after btrfs_file_write() is done with it. */ -static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, - struct page **pages, size_t num_pages, +static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { - size_t i; u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here. There should be no need to mark the pages - * accessed as prepare_pages should have marked them accessed - * in prepare_pages via find_or_create_page() - */ - btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), - block_start, block_len); - unlock_page(pages[i]); - put_page(pages[i]); - } + /* + * Folio checked is some magic around finding folios that have been + * modified without going through btrfs_dirty_folio(). Clear it here. + * There should be no need to mark the pages accessed as + * prepare_one_folio() should have marked them accessed in + * prepare_one_folio() via find_or_create_page() + */ + btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); + folio_unlock(folio); + folio_put(folio); } /* - * After btrfs_copy_from_user(), update the following things for delalloc: - * - Mark newly dirtied pages as DELALLOC in the io tree. + * After copy_folio_from_iter_atomic(), update the following things for delalloc: + * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. 
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve) +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; - int i; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -147,6 +89,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); + ASSERT(folio_pos(folio) <= pos && + folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -163,16 +107,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, if (ret) return ret; - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - - btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), - start_pos, num_bytes); - btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), - start_pos, num_bytes); - } + btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated @@ -242,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); - if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); @@ -263,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } ret = btrfs_next_leaf(root, path); if (ret < 0) break; @@ -339,7 +280,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ -369,7 +314,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { @@ -415,7 +359,6 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -427,7 +370,11 @@ next_slot: * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { - BUG_ON(del_nr > 0); + if (WARN_ON(del_nr > 0)) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; @@ 
-435,7 +382,6 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -455,7 +401,11 @@ delete_extent_item: del_slot = path->slots[0]; del_nr = 1; } else { - BUG_ON(del_slot + del_nr != path->slots[0]); + if (WARN_ON(del_slot + del_nr != path->slots[0])) { + btrfs_print_leaf(leaf); + ret = -EINVAL; + break; + } del_nr++; } @@ -686,7 +636,6 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -715,7 +664,6 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -749,7 +697,6 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(trans, leaf); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; @@ -828,7 +775,6 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -837,7 +783,6 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -851,53 +796,47 @@ out: } /* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 + * On error return an unlocked folio and the error value + * On success return a locked folio and 0 */ -static int prepare_uptodate_page(struct inode *inode, - struct page *page, u64 pos, - bool force_uptodate) +static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, + u64 len, bool force_uptodate) { - struct folio *folio = page_folio(page); + u64 clamp_start = max_t(u64, pos, folio_pos(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); int ret = 0; - if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_read_folio(NULL, folio); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - - /* - * Since btrfs_read_folio() will unlock the folio before it - * returns, there is a window where btrfs_release_folio() can be - * called to release the page. Here we check both inode - * mapping and PagePrivate() to make sure the page was not - * released. - * - * The private flag check is essential for subpage as we need - * to store extra bitmap using folio private. 
- */ - if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { - unlock_page(page); - return -EAGAIN; - } - } - return 0; -} + if (folio_test_uptodate(folio)) + return 0; -static fgf_t get_prepare_fgp_flags(bool nowait) -{ - fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + if (!force_uptodate && + IS_ALIGNED(clamp_start, PAGE_SIZE) && + IS_ALIGNED(clamp_end, PAGE_SIZE)) + return 0; - if (nowait) - fgp_flags |= FGP_NOWAIT; + ret = btrfs_read_folio(NULL, folio); + if (ret) + return ret; + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + return -EIO; + } - return fgp_flags; + /* + * Since btrfs_read_folio() will unlock the folio before it returns, + * there is a window where btrfs_release_folio() can be called to + * release the page. Here we check both inode mapping and page + * private to make sure the page was not released. + * + * The private flag check is essential for subpage as we need to store + * extra bitmap using folio private. + */ + if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { + folio_unlock(folio); + return -EAGAIN; + } + return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) @@ -914,89 +853,68 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) } /* - * this just gets pages into the page cache and locks them down. + * Get folio into the page cache and lock it. */ -static noinline int prepare_pages(struct inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, bool force_uptodate, - bool nowait) +static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, + loff_t pos, size_t write_bytes, + bool force_uptodate, bool nowait) { - int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + struct folio *folio; int ret = 0; - int faili; - for (i = 0; i < num_pages; i++) { again: - pages[i] = pagecache_get_page(inode->i_mapping, index + i, - fgp_flags, mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - if (nowait) - ret = -EAGAIN; - else - ret = -ENOMEM; - goto fail; - } - - ret = set_page_extent_mapped(pages[i]); - if (ret < 0) { - faili = i; - goto fail; - } - - if (i == 0) - ret = prepare_uptodate_page(inode, pages[i], pos, - force_uptodate); - if (!ret && i == num_pages - 1) - ret = prepare_uptodate_page(inode, pages[i], - pos + write_bytes, false); - if (ret) { - put_page(pages[i]); - if (!nowait && ret == -EAGAIN) { - ret = 0; - goto again; - } - faili = i - 1; - goto fail; + folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); + if (IS_ERR(folio)) { + if (nowait) + ret = -EAGAIN; + else + ret = PTR_ERR(folio); + return ret; + } + folio_wait_writeback(folio); + /* Only support page sized folio yet. */ + ASSERT(folio_order(folio) == 0); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + folio_unlock(folio); + folio_put(folio); + return ret; + } + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + if (ret) { + /* The folio is already unlocked. 
*/ + folio_put(folio); + if (!nowait && ret == -EAGAIN) { + ret = 0; + goto again; } - wait_on_page_writeback(pages[i]); + return ret; } - + *folio_ret = folio; return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - put_page(pages[faili]); - faili--; - } - return ret; - } /* - * This function locks the extent and properly waits for data=ordered extents - * to finish before allowing the pages to be modified if need. + * Locks the extent and properly waits for data=ordered extents to finish + * before allowing the folios to be modified if need. * - * The return value: + * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK - * -EAGAIN - need re-prepare the pages - * the other < 0 number - Something wrong happens + * -EAGAIN - need to prepare the folios again */ static noinline int -lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, - size_t write_bytes, +lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, + loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; - int i; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); @@ -1008,12 +926,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, if (nowait) { if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - pages[i] = NULL; - } - + folio_unlock(folio); + folio_put(folio); return -EAGAIN; } } else { @@ -1027,10 +941,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered->file_offset <= last_pos) { unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); - } + folio_unlock(folio); + folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; @@ -1044,11 +956,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * We should be called after prepare_pages() which should have locked + * We should be called after prepare_one_folio() which should have locked * all pages in the range. 
*/ - for (i = 0; i < num_pages; i++) - WARN_ON(!PageLocked(pages[i])); + WARN_ON(!folio_test_locked(folio)); return ret; } @@ -1104,7 +1015,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait, false); + NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else @@ -1120,27 +1031,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) +int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1170,7 +1061,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); @@ -1192,20 +1086,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; u64 lockend; size_t num_written = 0; - int nrptrs; ssize_t ret; - bool only_release_metadata = false; - bool force_page_uptodate = false; loff_t old_isize = i_size_read(inode); unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? 
BDP_ASYNC : 0); + bool only_release_metadata = false; if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; @@ -1218,38 +1109,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret <= 0) goto out; - ret = btrfs_write_check(iocb, i, ret); + ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; - nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), - PAGE_SIZE / (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out; - } - while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_SIZE - - offset); - size_t num_pages; + size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); size_t reserve_bytes; - size_t dirty_pages; size_t copied; size_t dirty_sectors; size_t num_sectors; + struct folio *folio = NULL; int extents_locked; + bool force_page_uptodate = false; /* - * Fault pages before locking them in prepare_pages + * Fault pages before locking them in prepare_one_folio() * to avoid recursive lock */ if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { @@ -1288,8 +1167,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) only_release_metadata = true; } - num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); - WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); @@ -1317,23 +1194,17 @@ again: break; } - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(inode, pages, num_pages, - pos, write_bytes, force_page_uptodate, false); + ret = prepare_one_folio(inode, &folio, pos, write_bytes, + force_page_uptodate, false); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); break; } - extents_locked = lock_and_cleanup_extent_if_need( - BTRFS_I(inode), pages, - num_pages, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); + extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), + folio, pos, write_bytes, &lockstart, + &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; @@ -1344,28 +1215,34 @@ again: break; } - copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + copied = copy_folio_from_iter_atomic(folio, + offset_in_folio(folio, pos), write_bytes, i); + flush_dcache_folio(folio); + + /* + * If we get a partial write, we can end up with partially + * uptodate page. Although if sector size < page size we can + * handle it, but if it's not sector aligned it can cause + * a lot of complexity, so make sure they don't happen by + * forcing retry this copy. 
+ */ + if (unlikely(copied < write_bytes)) { + if (!folio_test_uptodate(folio)) { + iov_iter_revert(i, copied); + copied = 0; + } + } num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, fs_info->sectorsize); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - if (copied == 0) { force_page_uptodate = true; dirty_sectors = 0; - dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = DIV_ROUND_UP(copied + offset, - PAGE_SIZE); } if (num_sectors > dirty_sectors) { @@ -1375,13 +1252,10 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - u64 __pos; - - __pos = round_down(pos, - fs_info->sectorsize) + - (dirty_pages << PAGE_SHIFT); + u64 release_start = round_up(pos + copied, + fs_info->sectorsize); btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, __pos, + data_reserved, release_start, release_bytes, true); } } @@ -1389,15 +1263,14 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, + ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's * start offset is >= i_size, we might still have a non-NULL * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_pages(). Therefore free any + * as delalloc through btrfs_dirty_page(). Therefore free any * possible cached extent state to avoid a memory leak. */ if (extents_locked) @@ -1408,7 +1281,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); break; } @@ -1416,7 +1289,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); + btrfs_drop_folio(fs_info, folio, pos, copied); cond_resched(); @@ -1424,8 +1297,6 @@ again: num_written += copied; } - kfree(pages); - if (release_bytes) { if (only_release_metadata) { btrfs_check_nocow_unlock(BTRFS_I(inode)); @@ -1470,7 +1341,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (ret || encoded->len == 0) goto out; - ret = btrfs_write_check(iocb, from, encoded->len); + ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; @@ -2137,7 +2008,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2154,7 +2024,6 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2302,7 +2171,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = 
btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -3802,6 +3670,7 @@ const struct file_operations btrfs_file_operations = { .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, + .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index 912254e653cf..de89e644be29 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); +int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, + size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); @@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); +int btrfs_write_check(struct kiocb *iocb, size_t count); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index eaa1dbd31352..d42b6f882f57 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,7 +11,8 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> -#include "ctree.h" +#include <linux/string_choices.h> +#include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" @@ -197,7 +198,6 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -215,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -462,7 +461,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) return -ENOMEM; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(page_folio(page)); if (ret < 0) { unlock_page(page); put_page(page); @@ -1188,7 +1187,6 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1387,6 +1385,7 @@ static int __btrfs_write_out_cache(struct inode *inode, int bitmaps = 0; int ret; int must_iput = 0; + int i_size; if (!i_size_read(inode)) return -EIO; @@ -1457,11 +1456,16 @@ static int 
__btrfs_write_out_cache(struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, - io_ctl->num_pages, 0, i_size_read(inode), - &cached_state, false); - if (ret) - goto out_nospc; + i_size = i_size_read(inode); + for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { + u64 dirty_start = i * PAGE_SIZE; + u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; + + ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), + dirty_start, dirty_len, &cached_state, false); + if (ret < 0) + goto out_nospc; + } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -2936,12 +2940,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", - info->offset, info->bytes, - (info->bitmap) ? "yes" : "no"); + info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", - list_empty(&block_group->cluster_list) ? "no" : "yes"); + str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); @@ -3809,7 +3812,7 @@ next: if (async && *total_trimmed) break; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } @@ -4000,7 +4003,7 @@ next: } block_group->discard_cursor = start; - if (fatal_signal_pending(current)) { + if (btrfs_trim_interrupted()) { if (start != offset) reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 83774bfd7b3b..9f1dbfdee8ca 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -10,6 +10,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/freezer.h> #include "fs.h" struct inode; @@ -56,6 +57,11 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +static inline bool btrfs_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + /* * Deltas are an effective way to populate global statistics. Give macro names * to make it clear what we're doing. 
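To make the per-iteration maths of the __btrfs_write_out_cache() dirtying loop above concrete, a worked example assuming PAGE_SIZE of 4096 and an i_size of 9000 bytes (both values chosen purely for illustration):

	/*
	 * round_up(9000, 4096) / 4096 = 3 iterations:
	 *   i = 0: dirty_start = 0,    dirty_len = min(4096,  9000) - 0    = 4096
	 *   i = 1: dirty_start = 4096, dirty_len = min(8192,  9000) - 4096 = 4096
	 *   i = 2: dirty_start = 8192, dirty_len = min(12288, 9000) - 8192 = 808
	 */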
An example is discard_extents in diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 7ba50e133921..cae540ec15ed 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -1350,6 +1345,12 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_end_transaction(trans); return ret; } + if (btrfs_should_end_transaction(trans)) { + btrfs_end_transaction(trans); + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + } node = rb_next(node); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 31c1648bc0b4..09cfb43580cb 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -4,6 +4,136 @@ #include "ctree.h" #include "fs.h" #include "accessors.h" +#include "volumes.h" + +static const struct btrfs_csums { + u16 size; + const char name[10]; + const char driver[12]; +} btrfs_csums[] = { + [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, +}; + +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + +int btrfs_super_csum_size(const struct btrfs_super_block *s) +{ + u16 t = btrfs_super_csum_type(s); + + /* csum type is validated at mount time. */ + return btrfs_csum_type_size(t); +} + +const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time. */ + return btrfs_csums[csum_type].name; +} + +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name. + */ +const char *btrfs_super_csum_driver(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csums[csum_type].driver[0] ? 
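A minimal sketch of how the csum name/driver lookup helpers in this hunk are typically consumed when setting up the checksum transform; the crypto shash allocation below is an assumption about the usual consumer, not code quoted from this patch:

	#include <crypto/hash.h>

	/* Sketch: allocate the hash transform for an already validated csum_type. */
	static struct crypto_shash *alloc_csum_shash(u16 csum_type)
	{
		const char *driver = btrfs_super_csum_driver(csum_type);

		/* e.g. "crc32c" or "blake2b-256"; digest size is btrfs_csum_type_size(). */
		return crypto_alloc_shash(driver, 0, 0);
	}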
+ btrfs_csums[csum_type].driver : + btrfs_csums[csum_type].name; +} + +size_t __attribute_const__ btrfs_get_num_csums(void) +{ + return ARRAY_SIZE(btrfs_csums); +} + +/* + * Start exclusive operation @type, return true on success. + */ +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock() + * and btrfs_exclop_finish(). + * + * Compatibility: + * - the same type is already running + * - when trying to add a device and balance has been paused + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type || + (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && + type == BTRFS_EXCLOP_DEV_ADD)) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->super_lock); + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op) +{ + switch (op) { + case BTRFS_EXCLOP_BALANCE_PAUSED: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; + spin_unlock(&fs_info->super_lock); + break; + case BTRFS_EXCLOP_BALANCE: + spin_lock(&fs_info->super_lock); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; + spin_unlock(&fs_info->super_lock); + break; + default: + btrfs_warn(fs_info, + "invalid exclop balance operation %d requested", op); + } +} void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 79f64e383edd..b572d6b9730b 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -14,10 +14,10 @@ #include <linux/lockdep.h> #include <linux/spinlock.h> #include <linux/mutex.h> -#include <linux/rwlock_types.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/list.h> +#include <linux/pagemap.h> #include <linux/radix-tree.h> #include <linux/workqueue.h> #include <linux/wait.h> @@ -263,10 +263,10 @@ enum { BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under developmen like Extent tree v2 support is enabled - * only under CONFIG_BTRFS_DEBUG. 
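The exclusive-operation helpers above are meant to bracket one whole user-visible operation; a minimal usage sketch, where BTRFS_EXCLOP_RESIZE, the hypothetical do_resize() helper and the -EBUSY return are chosen only for illustration:

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE))
		return -EBUSY;		/* some other exclusive op is running */

	ret = do_resize(fs_info);	/* hypothetical operation */

	btrfs_exclop_finish(fs_info);
	return ret;

The try_lock variant is the same idea for the compatible cases listed in its comment: on success it returns with super_lock still held and must be paired with btrfs_exclop_start_unlock().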
+ * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ @@ -317,6 +317,8 @@ struct btrfs_dev_replace { struct percpu_counter bio_counter; wait_queue_head_t replace_wait; + + struct task_struct *replace_task; }; /* @@ -625,6 +627,9 @@ struct btrfs_fs_info { struct kobject *qgroups_kobj; struct kobject *discard_kobj; + /* Track the number of blocks (sectors) read by the filesystem. */ + struct percpu_counter stats_read_blocks; + /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -633,9 +638,10 @@ struct btrfs_fs_info { s32 delalloc_batch; struct percpu_counter evictable_extent_maps; - spinlock_t extent_map_shrinker_lock; - u64 extent_map_shrinker_last_root; - u64 extent_map_shrinker_last_ino; + u64 em_shrinker_last_root; + u64 em_shrinker_last_ino; + atomic64_t em_shrinker_nr_to_scan; + struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; @@ -876,17 +882,19 @@ struct btrfs_fs_info { #endif }; -#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \ - struct page *: (_page))->mapping->host)) #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) -#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_constraint(mapping, ~__GFP_FS); +} + static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); @@ -953,6 +961,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) +#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) + static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; @@ -982,6 +992,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); +u16 btrfs_csum_type_size(u16 type); +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +static inline bool btrfs_is_empty_uuid(const u8 *uuid) +{ + return uuid_is_null((const uuid_t *)uuid); +} + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -1058,6 +1079,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +/* + * We use folio flag owner_2 to indicate there is an ordered extent with + * unfinished IO. 
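A quick worked example for the BTRFS_BYTES_TO_BLKS() helper introduced above, assuming a 4 KiB sector size (sectorsize_bits == 12); callers such as the buffered-write path round byte counts up to a sector boundary before converting:

	/*
	 * BTRFS_BYTES_TO_BLKS(fs_info, 16384) == 16384 >> 12 == 4
	 * BTRFS_BYTES_TO_BLKS(fs_info, 4095)  == 4095  >> 12 == 0
	 *   (hence the round_up() to the sector size before the conversion)
	 */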
+ */ +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 29572dfaf878..448aa1a682d6 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -298,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); return ret; @@ -363,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - out: btrfs_free_path(path); @@ -590,7 +586,6 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5618ca02934a..fe2c810335ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -393,35 +393,14 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. 
- */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { - index++; - continue; - } - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); index++; if (IS_ERR(folio)) continue; @@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -556,8 +518,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { struct folio *folio; - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, - 0, 0, 0); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_folio(folio, 0); @@ -565,7 +526,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -646,7 +606,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ -static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, @@ -736,7 +696,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return 1; lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, offset, size, compressed_size, + ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { @@ -832,32 +792,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } /* - * Special check for subpage. - * - * We lock the full page then run each delalloc range in the page, thus - * for the following case, we will hit some subpage specific corner case: + * Only enable sector perfect compression for experimental builds. * - * 0 32K 64K - * | |///////| |///////| - * \- A \- B + * This is a big feature change for subpage cases, and can hit + * different corner cases, so only limit this feature for + * experimental build for now. * - * In above case, both range A and range B will try to unlock the full - * page [0, 64K), causing the one finished later will have page - * unlocked already, triggering various page lock requirement BUG_ON()s. - * - * So here we add an artificial limit that subpage compression can only - * if the range is fully page aligned. - * - * In theory we only need to ensure the first page is fully covered, but - * the tailing partial page will be locked until the full compression - * finishes, delaying the write of other range. 
- * - * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range - * first to prevent any submitted async extent to unlock the full page. - * By this, we can ensure for subpage case that only the last async_cow - * will unlock the full page. + * ETA for moving this out of experimental builds is 6.15. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->sectorsize < PAGE_SIZE && + !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; @@ -896,13 +840,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + folio = filemap_get_folio(inode->i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - folio_clear_dirty_for_io(folio); + btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + end + 1 - start); folio_put(folio); } return ret; @@ -1001,17 +946,6 @@ again: (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; - /* - * For subpage case, we require full page alignment for the sector - * aligned range. - * Thus we must also check against @actual_end, not just @end. - */ - if (blocksize < PAGE_SIZE) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(round_up(actual_end, blocksize))) - goto cleanup_and_bail_uncompressed; - } - total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -1156,19 +1090,14 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1359,7 +1288,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; - unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; @@ -1367,7 +1295,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct extent_map *em; unsigned clear_bits; unsigned long page_ops; - bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { @@ -1402,6 +1329,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the @@ -1421,8 +1359,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; - cur_alloc_size = num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { @@ -1453,9 +1390,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; - extent_reserved = true; - ram_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; @@ -1463,14 +1398,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + ram_size - 1, + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1480,7 +1419,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, 1 << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { unlock_extent(&inode->io_tree, start, - start + ram_size - 1, &cached); + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1501,35 +1440,20 @@ static noinline int cow_file_range(struct btrfs_inode *inode, */ if (ret) btrfs_drop_extent_map_range(inode, start, - start + ram_size - 1, + start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered (Private2) bit so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 
0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; - extent_reserved = false; + cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased @@ -1539,13 +1463,15 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: - btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); + btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1559,35 +1485,30 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * btrfs_run_delalloc_range(). + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the cleanup function. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. - */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range @@ -1599,13 +1520,12 @@ out_unlock: * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 
*/ - if (extent_reserved) { + if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); - start += cur_alloc_size; } /* @@ -1614,12 +1534,18 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - if (start < end) { + if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_folio, + extent_clear_unlock_delalloc(inode, start + cur_alloc_size, + end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); + btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, + end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1729,7 +1655,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; @@ -1840,7 +1766,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) @@ -1868,7 +1794,6 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. @@ -1923,8 +1848,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; @@ -1953,9 +1877,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->file_extent.offset, - args->file_extent.disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -2002,6 +1925,53 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. 
+ */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -2016,6 +1986,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2176,6 +2151,7 @@ must_cow: found_key.offset - 1); cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } @@ -2249,11 +2225,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } } btrfs_free_path(path); @@ -2261,12 +2238,41 @@ must_cow: error: /* + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Only need to clear the dirty flag as they will never be submitted. + * Ordered extent and extent maps are handled by + * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * + * 2) Failed with error from fallback_to_cow() + * start cur_offset cow_end end + * |/////////////|-----------| | + * + * For range [start, cur_offset) it's the same as case 1). + * But for range [cur_offset, cow_end), the folios have dirty flag + * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * + * Thus we should not call extent_clear_unlock_delalloc() on range + * [cur_offset, cow_end), as the folios are already unlocked. + * + * So clear the folio dirty flags for [start, cur_offset) first. + */ + if (cur_offset > start) + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + + /* * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. 
+ * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. */ - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (cow_end) + cur_offset = cow_end + 1; /* * We need to lock the extent here because we're clearing DELALLOC and @@ -2286,6 +2292,10 @@ error: btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2336,8 +2346,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2952,7 +2961,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -3094,34 +3102,19 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - - btrfs_inode_safe_disk_i_size_write(inode, 0); - if (freespace_inode) - trans = btrfs_join_transaction_spacecache(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, ret); - - ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) - btrfs_abort_transaction(trans, ret); - - goto out; + /* + * If it's a COW write we need to lock the extent range as we will be + * inserting/replacing file extent items and unpinning an extent map. + * This must be taken before joining a transaction, as it's a higher + * level lock (like the inode's VFS lock), otherwise we can run into an + * ABBA deadlock with other tasks (transactions work like a lock, + * depending on their current state). 
+ */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + clear_bits |= EXTENT_LOCKED; + lock_extent(io_tree, start, end, &cached_state); } - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3135,8 +3128,28 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + /* Logic error */ + ASSERT(list_empty(&ordered_extent->list)); + if (!list_empty(&ordered_extent->list)) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + btrfs_inode_safe_disk_i_size_write(inode, 0); + ret = btrfs_update_inode_fallback(trans, inode); + if (ret) { + /* -ENOMEM or corruption */ + btrfs_abort_transaction(trans, ret); + } + goto out; + } if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3791,14 +3804,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) return 0; } +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) +{ + struct btrfs_root *root = inode->root; + struct btrfs_inode *existing; + const u64 ino = btrfs_ino(inode); + int ret; + + if (inode_unhashed(&inode->vfs_inode)) + return 0; + + if (prealloc) { + ret = xa_reserve(&root->inodes, ino, GFP_NOFS); + if (ret) + return ret; + } + + existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + + if (xa_is_err(existing)) { + ret = xa_err(existing); + ASSERT(ret != -EINVAL); + ASSERT(ret != -ENOMEM); + return ret; + } else if (existing) { + WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + } + + return 0; +} + /* - * read an inode from the btree into the in-memory inode + * Read a locked inode from the btree into the in-memory inode and add it to + * its root list/tree. + * + * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, - struct btrfs_path *in_path) +static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3812,25 +3856,25 @@ static int btrfs_read_locked_inode(struct inode *inode, ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); if (ret) - return ret; + goto out; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; - if (!path) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } + ASSERT(path); btrfs_get_inode_key(BTRFS_I(inode), &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - if (path != in_path) - btrfs_free_path(path); - return ret; + /* + * ret > 0 can come from btrfs_search_slot called by + * btrfs_lookup_inode(), this means the inode was not found. 
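The reserve-then-store sequence in btrfs_add_inode_to_root() above is a general XArray idiom: reserve the slot with a sleepable allocation up front so the later store, done where only GFP_ATOMIC is safe, cannot fail for lack of memory. A self-contained sketch of the same idiom with hypothetical names:

	#include <linux/xarray.h>

	static int insert_preallocated(struct xarray *xa, unsigned long index,
				       void *entry)
	{
		void *old;
		int ret;

		/* Preallocate the slot while we may still sleep. */
		ret = xa_reserve(xa, index, GFP_NOFS);
		if (ret)
			return ret;

		/* The store can now use GFP_ATOMIC without risking -ENOMEM. */
		old = xa_store(xa, index, entry, GFP_ATOMIC);
		if (xa_is_err(old))
			return xa_err(old);

		return 0;
	}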
+ */ + if (ret > 0) + ret = -ENOENT; + goto out; } leaf = path->nodes[0]; @@ -3965,8 +4009,6 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } - if (path != in_path) - btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -3993,7 +4035,15 @@ cache_acl: } btrfs_sync_inode_flags_to_i_flags(inode); + + ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + if (ret) + goto out; + return 0; +out: + iget_failed(inode); + return ret; } /* @@ -4074,7 +4124,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -4368,11 +4417,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + if (IS_ERR(di)) { + ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } @@ -5505,35 +5551,7 @@ out: return err; } -static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) -{ - struct btrfs_root *root = inode->root; - struct btrfs_inode *existing; - const u64 ino = btrfs_ino(inode); - int ret; - - if (inode_unhashed(&inode->vfs_inode)) - return 0; - - if (prealloc) { - ret = xa_reserve(&root->inodes, ino, GFP_NOFS); - if (ret) - return ret; - } - - existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); - - if (xa_is_err(existing)) { - ret = xa_err(existing); - ASSERT(ret != -EINVAL); - ASSERT(ret != -ENOMEM); - return ret; - } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); - } - return 0; -} static void btrfs_del_inode_from_root(struct btrfs_inode *inode) { @@ -5595,10 +5613,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) } /* - * Get an inode object given its inode number and corresponding root. - * Path can be preallocated to prevent recursing back to iget through - * allocator. NULL is also valid but may require an additional allocation - * later. + * Get an inode object given its inode number and corresponding root. Path is + * preallocated to prevent recursing back to iget through allocator. */ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path) @@ -5614,30 +5630,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, return inode; ret = btrfs_read_locked_inode(inode, path); - /* - * ret > 0 can come from btrfs_search_slot called by - * btrfs_read_locked_inode(), this means the inode item was not found. - */ - if (ret > 0) - ret = -ENOENT; - if (ret < 0) - goto error; - - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); - if (ret < 0) - goto error; + if (ret) + return ERR_PTR(ret); unlock_new_inode(inode); - return inode; -error: - iget_failed(inode); - return ERR_PTR(ret); } +/* + * Get an inode object given its inode number and corresponding root. 
+ */ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(ino, root, NULL); + struct inode *inode; + struct btrfs_path *path; + int ret; + + inode = btrfs_iget_locked(ino, root); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) + return inode; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_read_locked_inode(inode, path); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + unlock_new_inode(inode); + return inode; } static struct inode *new_simple_dir(struct inode *dir, @@ -6026,7 +6052,7 @@ again: * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as - * they're returned by readdir. Until we re-use freed offsets + * they're returned by readdir. Until we reuse freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. @@ -6392,7 +6418,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6768,8 +6793,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct folio *folio) +static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; @@ -6967,7 +6991,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, folio); + ret = read_inline_extent(path, folio); if (ret < 0) goto out; goto insert; @@ -7024,8 +7048,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7037,7 +7059,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict) + bool nowait) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7090,7 +7112,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); @@ -7297,7 +7318,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. 
* @@ -7363,7 +7384,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -7436,7 +7457,7 @@ next: } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); @@ -8040,31 +8061,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8300,16 +8335,23 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -8317,18 +8359,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 
0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -8668,7 +8719,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -8975,28 +9025,6 @@ out_inode: return finish_open_simple(file, ret); } -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - while (index <= end_index) { - folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); - ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ - - /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(folio) == 0); - btrfs_folio_set_writeback(fs_info, folio, start, len); - folio_put(folio); - index++; - } -} - int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { @@ -9041,12 +9069,16 @@ static ssize_t btrfs_encoded_read_inline( unsigned long ptr; void *tmp; ssize_t ret; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } + + path->nowait = nowait; + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { @@ -9109,8 +9141,9 @@ out: } struct btrfs_encoded_read_private { - wait_queue_head_t wait; - atomic_t pending; + struct completion done; + void *uring_ctx; + refcount_t pending_refs; blk_status_t status; }; @@ -9129,26 +9162,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (!atomic_dec_return(&priv->pending)) - wake_up(&priv->wait); + if (refcount_dec_and_test(&priv->pending_refs)) { + int err = blk_status_to_errno(READ_ONCE(priv->status)); + + if (priv->uring_ctx) { + btrfs_uring_read_extent_endio(priv->uring_ctx, err); + kfree(priv); + } else { + complete(&priv->done); + } + } bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, struct page **pages) + u64 disk_bytenr, u64 disk_io_size, + struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private priv = { - .pending = ATOMIC_INIT(1), - }; + struct btrfs_encoded_read_private *priv; unsigned long i = 0; struct btrfs_bio *bbio; + int ret; + + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; - init_waitqueue_head(&priv.wait); + init_completion(&priv->done); + refcount_set(&priv->pending_refs, 1); + priv->status = 0; + priv->uring_ctx = uring_ctx; bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; @@ -9156,11 +9203,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], 
bytes, 0) < bytes) { - atomic_inc(&priv.pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, - btrfs_encoded_read_endio, &priv); + btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; @@ -9171,22 +9218,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv.pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); - if (atomic_dec_return(&priv.pending)) - io_wait_event(priv.wait, !atomic_read(&priv.pending)); - /* See btrfs_encoded_read_endio() for ordering. */ - return blk_status_to_errno(READ_ONCE(priv.status)); + if (uring_ctx) { + if (refcount_dec_and_test(&priv->pending_refs)) { + ret = blk_status_to_errno(READ_ONCE(priv->status)); + btrfs_uring_read_extent_endio(uring_ctx, ret); + kfree(priv); + return ret; + } + + return -EIOCBQUEUED; + } else { + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&priv->done); + /* See btrfs_encoded_read_endio() for ordering. */ + ret = blk_status_to_errno(READ_ONCE(priv->status)); + kfree(priv); + return ret; + } } -static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, - struct iov_iter *iter, - u64 start, u64 lockend, - struct extent_state **cached_state, - u64 disk_bytenr, u64 disk_io_size, - size_t count, bool compressed, - bool *unlocked) +ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state **cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; @@ -9206,8 +9264,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; } - ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, - disk_io_size, pages); + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, NULL); if (ret) goto out; @@ -9247,21 +9305,26 @@ out: } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded) + struct btrfs_ioctl_encoded_io_args *encoded, + struct extent_state **cached_state, + u64 *disk_bytenr, u64 *disk_io_size) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); - u64 start, lockend, disk_bytenr, disk_io_size; - struct extent_state *cached_state = NULL; + u64 start, lockend; struct extent_map *em; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); bool unlocked = false; file_accessed(iocb->ki_filp); - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = btrfs_inode_lock(inode, + BTRFS_ILOCK_SHARED | (nowait ? 
BTRFS_ILOCK_TRY : 0)); + if (ret) + return ret; if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9274,21 +9337,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; - for (;;) { + if (nowait) { struct btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, - lockend - start + 1); - if (ret) + if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, + start, lockend)) { + ret = -EAGAIN; goto out_unlock_inode; - lock_extent(io_tree, start, lockend, &cached_state); + } + + if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + ret = -EAGAIN; + goto out_unlock_inode; + } + ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); - if (!ordered) - break; - btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, &cached_state); - cond_resched(); + if (ordered) { + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + ret = -EAGAIN; + goto out_unlock_inode; + } + } else { + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(inode, start, + lockend - start + 1); + if (ret) + goto out_unlock_inode; + + lock_extent(io_tree, start, lockend, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + lockend - start + 1); + if (!ordered) + break; + btrfs_put_ordered_extent(ordered); + unlock_extent(io_tree, start, lockend, cached_state); + cond_resched(); + } } em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); @@ -9307,9 +9395,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, - &cached_state, extent_start, + cached_state, extent_start, count, encoded, &unlocked); - goto out; + goto out_unlock_extent; } /* @@ -9320,12 +9408,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { - disk_bytenr = EXTENT_MAP_HOLE; + *disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (extent_map_is_compressed(em)) { - disk_bytenr = em->disk_bytenr; + *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. @@ -9334,7 +9422,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = em->disk_num_bytes; + *disk_io_size = em->disk_num_bytes; count = em->disk_num_bytes; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); @@ -9344,47 +9432,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_em; encoded->compression = ret; } else { - disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. 
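The conversion of btrfs_encoded_read_regular_fill_pages() above, from an on-stack atomic counter plus waitqueue to a heap-allocated private with a refcount and a completion, follows a common biased-refcount pattern: the submitter holds one reference, every extra bio takes another, and whichever final drop hits zero either completes the synchronous waiter or, on the io_uring path, fires the endio callback and frees the private. The sketch below is a userspace analogy of that idea using C11 atomics and pthreads; the struct, thread functions, and timings are stand-ins, not the btrfs or kernel API.

/* Userspace analogy of the biased-refcount completion pattern used by
 * btrfs_encoded_read_regular_fill_pages(): submitter holds one reference,
 * each in-flight "bio" holds another, the last dropper signals completion. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct read_private {
    atomic_int pending;             /* plays the role of refcount_t pending_refs */
    pthread_mutex_t lock;
    pthread_cond_t done;            /* plays the role of struct completion */
    int finished;
};

static void put_private(struct read_private *priv)
{
    if (atomic_fetch_sub(&priv->pending, 1) == 1) {    /* dropped last reference */
        pthread_mutex_lock(&priv->lock);
        priv->finished = 1;
        pthread_cond_signal(&priv->done);
        pthread_mutex_unlock(&priv->lock);
    }
}

static void *fake_endio(void *arg)                     /* one per submitted "bio" */
{
    usleep(1000);                                      /* pretend I/O latency */
    put_private(arg);
    return NULL;
}

int main(void)
{
    struct read_private priv = { .pending = 1 };       /* submitter's bias ref */
    pthread_mutex_init(&priv.lock, NULL);
    pthread_cond_init(&priv.done, NULL);

    pthread_t tids[3];
    for (int i = 0; i < 3; i++) {                      /* "submit" three bios */
        atomic_fetch_add(&priv.pending, 1);
        pthread_create(&tids[i], NULL, fake_endio, &priv);
    }

    put_private(&priv);                                /* drop the bias reference */
    pthread_mutex_lock(&priv.lock);
    while (!priv.finished)
        pthread_cond_wait(&priv.done, &priv.lock);
    pthread_mutex_unlock(&priv.lock);

    for (int i = 0; i < 3; i++)
        pthread_join(tids[i], NULL);
    puts("all reads finished");
    return 0;
}

Moving the private off the stack is what allows the -EIOCBQUEUED return: the submitter can unwind while the endio path still owns a reference and performs the cleanup.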
*/ - disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; - count = start + disk_io_size - iocb->ki_pos; + *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; + count = start + *disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; - disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); + *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; - if (disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, &cached_state); + if (*disk_bytenr == EXTENT_MAP_HOLE) { + unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { - ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, - &cached_state, disk_bytenr, - disk_io_size, count, - encoded->compression, - &unlocked); + ret = -EIOCBQUEUED; + goto out_unlock_extent; } -out: - if (ret >= 0) - iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: - if (!unlocked) - unlock_extent(io_tree, start, lockend, &cached_state); + /* Leave inode and extent locked if we need to do a read. */ + if (!unlocked && ret != -EIOCBQUEUED) + unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: - if (!unlocked) + if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } @@ -9495,7 +9578,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); if (!folios) return -ENOMEM; for (i = 0; i < nr_folios; i++) { @@ -9559,7 +9642,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0 && can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { - ret = __cow_file_range_inline(inode, start, encoded->len, + ret = __cow_file_range_inline(inode, encoded->len, orig_count, compression, folios[0], true); if (ret <= 0) { @@ -9779,15 +9862,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. 
If the @@ -9796,22 +9889,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -9826,7 +9929,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9840,7 +9944,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9856,11 +9961,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); @@ -9868,24 +9975,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). 
+ */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9897,23 +10019,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + backref_ctx->curr_leaf_bytenr = leaf->start; + + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. 
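The rewritten btrfs_swap_activate() walks file extent items by key offset instead of extent maps, tracking prev_extent_end and clamping the final extent so the swap mapping never reaches past the sector-aligned i_size: when the extent end lands beyond isize, only isize - key.offset bytes are used. A small worked example of that clamping arithmetic, with a hypothetical 4 KiB sector size and made-up extent sizes (btrfs_file_extent_end() is approximated here as key.offset + num_bytes for regular extents):

/* Worked example of the extent-length clamp in btrfs_swap_activate():
 * use the full extent unless its end crosses the aligned i_size. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t sectorsize = 4096;                           /* assumption */
    const uint64_t isize = (10ULL * 1024 * 1024) & ~(sectorsize - 1); /* ALIGN_DOWN */

    /* (key.offset, num_bytes) pairs for two hypothetical regular extents. */
    const uint64_t extents[][2] = {
        { 0,                  8ULL * 1024 * 1024 },             /* inside i_size */
        { 8ULL * 1024 * 1024, 4ULL * 1024 * 1024 },             /* crosses i_size */
    };

    uint64_t prev_extent_end = 0;
    for (int i = 0; i < 2 && prev_extent_end < isize; i++) {
        uint64_t key_offset = extents[i][0];
        uint64_t num_bytes = extents[i][1];
        uint64_t len;

        prev_extent_end = key_offset + num_bytes;               /* extent end */
        if (prev_extent_end > isize)
            len = isize - key_offset;                           /* clamp to i_size */
        else
            len = num_bytes;

        printf("extent at %llu: map %llu bytes\n",
               (unsigned long long)key_offset, (unsigned long long)len);
    }
    return 0;
}

With these numbers the second extent ends at 12 MiB, so only 2 MiB of it is added to the swap map, matching the len = isize - key.offset branch in the patch.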
+ */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9948,7 +10092,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -9989,20 +10132,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); @@ -10015,6 +10161,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 226c91fe31a7..ae98269a5e3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -29,6 +29,7 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> +#include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -402,86 +403,6 @@ update_flags: return ret; } -/* - * Start exclusive operation @type, return true on success - */ -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - bool ret = false; - - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { - fs_info->exclusive_operation = type; - ret = true; - } - spin_unlock(&fs_info->super_lock); - - return ret; -} - -/* - * Conditionally allow to enter the exclusive operation in case it's compatible - * with the running one. This must be paired with btrfs_exclop_start_unlock and - * btrfs_exclop_finish. 
- * - * Compatibility: - * - the same type is already running - * - when trying to add a device and balance has been paused - * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller - * must check the condition first that would allow none -> @type - */ -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type) -{ - spin_lock(&fs_info->super_lock); - if (fs_info->exclusive_operation == type || - (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED && - type == BTRFS_EXCLOP_DEV_ADD)) - return true; - - spin_unlock(&fs_info->super_lock); - return false; -} - -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) -{ - spin_unlock(&fs_info->super_lock); -} - -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) -{ - spin_lock(&fs_info->super_lock); - WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); - spin_unlock(&fs_info->super_lock); - sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); -} - -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op) -{ - switch (op) { - case BTRFS_EXCLOP_BALANCE_PAUSED: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || - fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || - fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; - spin_unlock(&fs_info->super_lock); - break; - case BTRFS_EXCLOP_BALANCE: - spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); - fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; - spin_unlock(&fs_info->super_lock); - break; - default: - btrfs_warn(fs_info, - "invalid exclop balance operation %d requested", op); - } -} - static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); @@ -550,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return ret; } -int __pure btrfs_is_empty_uuid(const u8 *uuid) -{ - int i; - - for (i = 0; i < BTRFS_UUID_SIZE; i++) { - if (uuid[i]) - return 0; - } - return 1; -} - /* * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. @@ -1048,7 +958,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, struct btrfs_qgroup_inherit *inherit) { int ret; - bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is @@ -1067,15 +976,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, * creation. 
*/ atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } @@ -1308,9 +1215,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { - struct fd src = fdget(fd); + CLASS(fd, src)(fd); struct inode *src_inode; - if (!fd_file(src)) { + if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } @@ -1341,7 +1248,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, BTRFS_I(src_inode)->root, readonly, inherit); } - fdput(src); } out_drop_write: mnt_drop_write_file(file); @@ -3010,7 +2916,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -4058,8 +3963,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, return 0; } -static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, - void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4514,12 +4418,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; + u64 disk_bytenr, disk_io_size; + struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4572,7 +4481,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; - ret = btrfs_encoded_read(&kiocb, &iter, &args); + ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, + &disk_bytenr, &disk_io_size); + + if (ret == -EIOCBQUEUED) { + bool unlocked = false; + u64 start, lockend, count; + + start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + if (args.compression) + count = disk_io_size; + else + count = args.len; + + ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, + &cached_state, disk_bytenr, + disk_io_size, count, + args.compression, &unlocked); + + if (!unlocked) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + } + if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, @@ -4690,6 +4624,585 @@ out_acct: return ret; } +/* + * Context that's attached to an encoded read io_uring command, in cmd->pdu. It + * contains the fields in btrfs_uring_read_extent that are necessary to finish + * off and cleanup the I/O in btrfs_uring_read_finished. 
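When btrfs_encoded_read() now hands back -EIOCBQUEUED, the ioctl caller above recomputes the locked range and byte count before doing the regular read itself: start is the file position rounded down to the sector size, the locked range spans BTRFS_MAX_UNCOMPRESSED bytes, and count is disk_io_size for compressed extents or args.len otherwise. A worked sketch of that arithmetic, assuming a 4 KiB sector size and the 128 KiB value of BTRFS_MAX_UNCOMPRESSED; the sample position and lengths are invented:

/* Worked example of the range/count computation performed after
 * btrfs_encoded_read() returns -EIOCBQUEUED in the ioctl path. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))  /* power-of-two alignment */

int main(void)
{
    const uint64_t sectorsize = 4096;                  /* assumed for the demo */
    const uint64_t max_uncompressed = 128 * 1024;      /* BTRFS_MAX_UNCOMPRESSED */

    uint64_t ki_pos = 69632 + 100;      /* example file position, not aligned */
    uint64_t disk_io_size = 16384;      /* from the extent item, example value */
    uint64_t args_len = 5000;           /* caller-supplied length, example */
    bool compressed = false;

    uint64_t start = ALIGN_DOWN(ki_pos, sectorsize);
    uint64_t lockend = start + max_uncompressed - 1;
    uint64_t count = compressed ? disk_io_size : args_len;

    printf("start=%llu lockend=%llu count=%llu\n",
           (unsigned long long)start, (unsigned long long)lockend,
           (unsigned long long)count);
    /* prints: start=69632 lockend=200703 count=5000 */
    return 0;
}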
+ */ +struct btrfs_uring_priv { + struct io_uring_cmd *cmd; + struct page **pages; + unsigned long nr_pages; + struct kiocb iocb; + struct iovec *iov; + struct iov_iter iter; + struct extent_state *cached_state; + u64 count; + u64 start; + u64 lockend; + int err; + bool compressed; +}; + +struct io_btrfs_cmd { + struct btrfs_uring_priv *priv; +}; + +static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_priv *priv = bc->priv; + struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + unsigned long index; + u64 cur; + size_t page_offset; + ssize_t ret; + + /* The inode lock has already been acquired in btrfs_uring_read_extent. */ + btrfs_lockdep_inode_acquire(inode, i_rwsem); + + if (priv->err) { + ret = priv->err; + goto out; + } + + if (priv->compressed) { + index = 0; + page_offset = 0; + } else { + index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; + page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); + } + cur = 0; + while (cur < priv->count) { + size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); + + if (copy_page_to_iter(priv->pages[index], page_offset, bytes, + &priv->iter) != bytes) { + ret = -EFAULT; + goto out; + } + + index++; + cur += bytes; + page_offset = 0; + } + ret = priv->count; + +out: + unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + add_rchar(current, ret); + + for (index = 0; index < priv->nr_pages; index++) + __free_page(priv->pages[index]); + + kfree(priv->pages); + kfree(priv->iov); + kfree(priv); +} + +void btrfs_uring_read_extent_endio(void *ctx, int err) +{ + struct btrfs_uring_priv *priv = ctx; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); + + priv->err = err; + bc->priv = priv; + + io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); +} + +static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, + u64 start, u64 lockend, + struct extent_state *cached_state, + u64 disk_bytenr, u64 disk_io_size, + size_t count, bool compressed, + struct iovec *iov, struct io_uring_cmd *cmd) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct extent_io_tree *io_tree = &inode->io_tree; + struct page **pages; + struct btrfs_uring_priv *priv = NULL; + unsigned long nr_pages; + int ret; + + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + ret = btrfs_alloc_page_array(nr_pages, pages, 0); + if (ret) { + ret = -ENOMEM; + goto out_fail; + } + + priv = kmalloc(sizeof(*priv), GFP_NOFS); + if (!priv) { + ret = -ENOMEM; + goto out_fail; + } + + priv->iocb = *iocb; + priv->iov = iov; + priv->iter = *iter; + priv->count = count; + priv->cmd = cmd; + priv->cached_state = cached_state; + priv->compressed = compressed; + priv->nr_pages = nr_pages; + priv->pages = pages; + priv->start = start; + priv->lockend = lockend; + priv->err = 0; + + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, + disk_io_size, pages, priv); + if (ret && ret != -EIOCBQUEUED) + goto out_fail; + + /* + * If we return -EIOCBQUEUED, we're deferring the cleanup to + * btrfs_uring_read_finished(), which will handle unlocking the extent + * and inode and freeing 
the allocations. + */ + + /* + * We're returning to userspace with the inode lock held, and that's + * okay - it'll get unlocked in a worker thread. Call + * btrfs_lockdep_inode_release() to avoid confusing lockdep. + */ + btrfs_lockdep_inode_release(inode, i_rwsem); + + return -EIOCBQUEUED; + +out_fail: + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + kfree(priv); + return ret; +} + +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + +static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); + size_t copy_end; + int ret; + u64 disk_bytenr, disk_io_size; + struct file *file; + struct btrfs_inode *inode; + struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree; + loff_t pos; + struct kiocb kiocb; + struct extent_state *cached_state = NULL; + u64 start, lockend; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + file = cmd->file; + inode = BTRFS_I(file->f_inode); + fs_info = inode->root->fs_info; + io_tree = &inode->io_tree; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); +#else + return -ENOTTY; +#endif + } else { + copy_end = copy_end_kernel; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, copy_end)) { + ret = -EFAULT; + goto out_acct; + } + } + + if (data->args.flags != 0) { + ret = -EINVAL; + goto out_acct; + } + + data->iov = data->iovstack; + ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_free; + } + } + + pos = data->args.offset; + ret = rw_verify_area(READ, file, &pos, data->args.len); + if (ret < 0) + goto out_free; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + + if (issue_flags & IO_URING_F_NONBLOCK) + kiocb.ki_flags |= IOCB_NOWAIT; + + start = ALIGN_DOWN(pos, fs_info->sectorsize); + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; + + ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, + &disk_bytenr, &disk_io_size); + if (ret < 0 && ret != -EIOCBQUEUED) + goto out_free; + + file_accessed(file); + + if (copy_to_user(sqe_addr + copy_end, + (const char *)&data->args + copy_end_kernel, + sizeof(data->args) - copy_end_kernel)) { + if (ret == -EIOCBQUEUED) { + unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + } + ret = -EFAULT; + goto out_free; + } + + if (ret == 
-EIOCBQUEUED) { + u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); + + /* Match ioctl by not returning past EOF if uncompressed. */ + if (!data->args.compression) + count = min_t(u64, count, data->args.len); + + ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, + cached_state, disk_bytenr, disk_io_size, + count, data->args.compression, + data->iov, cmd); + + goto out_acct; + } + +out_free: + kfree(data->iov); + +out_acct: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + + return ret; +} + +static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + loff_t pos; + struct kiocb kiocb; + struct file *file; + ssize_t ret; + void __user *sqe_addr; + struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_acct; + } + + file = cmd->file; + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); + + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EBADF; + goto out_acct; + } + + if (!data) { + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + ret = -ENOMEM; + goto out_acct; + } + + io_uring_cmd_get_async_data(cmd)->op_data = data; + + if (issue_flags & IO_URING_F_COMPAT) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_encoded_io_args_32 args32; + + if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { + ret = -EFAULT; + goto out_acct; + } + data->args.iov = compat_ptr(args32.iov); + data->args.iovcnt = args32.iovcnt; + data->args.offset = args32.offset; + data->args.flags = args32.flags; + data->args.len = args32.len; + data->args.unencoded_len = args32.unencoded_len; + data->args.unencoded_offset = args32.unencoded_offset; + data->args.compression = args32.compression; + data->args.encryption = args32.encryption; + memcpy(data->args.reserved, args32.reserved, + sizeof(data->args.reserved)); +#else + ret = -ENOTTY; + goto out_acct; +#endif + } else { + if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { + ret = -EFAULT; + goto out_acct; + } + } + + ret = -EINVAL; + if (data->args.flags != 0) + goto out_acct; + if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) + goto out_acct; + if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && + data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) + goto out_acct; + if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || + data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) + goto out_acct; + if (data->args.unencoded_offset > data->args.unencoded_len) + goto out_acct; + if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) + goto out_acct; + + data->iov = data->iovstack; + ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, + ARRAY_SIZE(data->iovstack), &data->iov, + &data->iter); + if (ret < 0) + goto out_acct; + + if (iov_iter_count(&data->iter) == 0) { + ret = 0; + goto out_iov; + } + } + + if (issue_flags & IO_URING_F_NONBLOCK) { + ret = -EAGAIN; + goto out_acct; + } + + pos = data->args.offset; + ret = rw_verify_area(WRITE, file, &pos, data->args.len); + if (ret < 0) + goto out_iov; + + init_sync_kiocb(&kiocb, file); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); + if (ret) + goto out_iov; + kiocb.ki_pos = pos; + + file_start_write(file); + + ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); + if (ret > 0) + fsnotify_modify(file); + + file_end_write(file); +out_iov: + kfree(data->iov); +out_acct: + if (ret > 0) + add_wchar(current, 
ret); + inc_syscw(current); + return ret; +} + +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case BTRFS_IOC_ENCODED_READ: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_READ_32: +#endif + return btrfs_uring_encoded_read(cmd, issue_flags); + + case BTRFS_IOC_ENCODED_WRITE: +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_ENCODED_WRITE_32: +#endif + return btrfs_uring_encoded_write(cmd, issue_flags); + } + + return -EINVAL; +} + +static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) +{ + struct btrfs_root *root; + struct btrfs_ioctl_subvol_wait args = { 0 }; + signed long sched_ret; + int refs; + u64 root_flags; + bool wait_for_deletion = false; + bool found = false; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + switch (args.mode) { + case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: + /* + * Wait for the first one deleted that waits until all previous + * are cleaned. + */ + spin_lock(&fs_info->trans_lock); + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + found = true; + } + spin_unlock(&fs_info->trans_lock); + if (!found) + return -ENOENT; + + fallthrough; + case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: + if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || + BTRFS_LAST_FREE_OBJECTID < args.subvolid) + return -EINVAL; + break; + case BTRFS_SUBVOL_SYNC_COUNT: + spin_lock(&fs_info->trans_lock); + args.count = list_count_nodes(&fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_FIRST: + spin_lock(&fs_info->trans_lock); + /* Last in the list was deleted first. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_last_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + case BTRFS_SUBVOL_SYNC_PEEK_LAST: + spin_lock(&fs_info->trans_lock); + /* First in the list was deleted last. */ + if (!list_empty(&fs_info->dead_roots)) { + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + args.subvolid = btrfs_root_id(root); + } else { + args.subvolid = 0; + } + spin_unlock(&fs_info->trans_lock); + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } + + /* 32bit limitation: fs_roots_radix key is not wide enough. */ + if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) + return -EOVERFLOW; + + while (1) { + /* Wait for the specific one. */ + if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) + return -EINTR; + refs = -1; + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)args.subvolid); + if (root) { + spin_lock(&root->root_item_lock); + refs = btrfs_root_refs(&root->root_item); + root_flags = btrfs_root_flags(&root->root_item); + spin_unlock(&root->root_item_lock); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + up_read(&fs_info->subvol_sem); + + /* Subvolume does not exist. */ + if (!root) + return -ENOENT; + + /* Subvolume not deleted at all. */ + if (refs > 0) + return -EEXIST; + /* We've waited and now the subvolume is gone. 
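btrfs_ioctl_subvol_sync() waits for a subvolume to go away by repeatedly taking subvol_sem, sampling the root's reference count, and sleeping for a second with schedule_timeout_interruptible(), returning -EINTR on an early wakeup. The snippet below is only a loose userspace analogue of that poll-and-sleep loop; the condition check, the fake progress counter, and the signal handling stand in for the kernel primitives.

/* Userspace analogue of a poll-until-gone loop with an interruptible
 * one-second sleep, loosely modelled on btrfs_ioctl_subvol_sync(). */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

static int deletions_left = 3;          /* pretend cleaner-thread progress */

static int subvol_still_exists(void)    /* stand-in for the root refcount check */
{
    return --deletions_left > 0;
}

static void on_signal(int sig) { (void)sig; }

int main(void)
{
    struct sigaction sa = { .sa_handler = on_signal };

    sigemptyset(&sa.sa_mask);
    sigaction(SIGINT, &sa, NULL);       /* make the sleep interruptible, not fatal */

    while (subvol_still_exists()) {
        struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

        if (nanosleep(&ts, NULL) == -1 && errno == EINTR) {
            fprintf(stderr, "interrupted, would return -EINTR\n");
            return 1;
        }
    }
    puts("subvolume fully cleaned, would return 0");
    return 0;
}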
*/ + if (wait_for_deletion && refs == -1) { + /* Return the one we waited for as the last one. */ + if (copy_to_user(argp, &args, sizeof(args))) + return -EFAULT; + return 0; + } + + /* Subvolume not found on the first try (deleted or never existed). */ + if (refs == -1) + return -ENOENT; + + wait_for_deletion = true; + ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); + sched_ret = schedule_timeout_interruptible(HZ); + /* Early wake up or error. */ + if (sched_ret != 0) + return -EINTR; + } + + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4812,7 +5325,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(fs_info, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: @@ -4831,6 +5344,8 @@ long btrfs_ioctl(struct file *file, unsigned int return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); + case FS_IOC_READ_VERITY_METADATA: + return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: @@ -4841,6 +5356,8 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif + case BTRFS_IOC_SUBVOL_SYNC_WAIT: + return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 19cd26b0244a..ce915fcda43b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -19,8 +19,9 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(const u8 *uuid); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void btrfs_uring_read_extent_endio(void *ctx, int err); #endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 6a0b7abb5bd9..9a7a7b723305 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) } /* - * Try-lock for write. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (down_write_trylock(&eb->lock)) { - btrfs_set_eb_lock_owner(eb, current->pid); - trace_btrfs_try_tree_write_lock(eb); - return 1; - } - return 0; -} - -/* * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3c15c75e0582..c69e57ff804b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -129,6 +129,16 @@ enum btrfs_lockdep_trans_states { rwsem_release(&owner->lock##_map, _THIS_IP_) /* + * Used to account for the fact that when doing io_uring encoded I/O, we can + * return to userspace with the inode lock still held. 
+ */ +#define btrfs_lockdep_inode_acquire(owner, lock) \ + rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_) + +#define btrfs_lockdep_inode_release(owner, lock) \ + rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_) + +/* * Macros for the transaction states wait events, similar to the generic wait * event macros. */ @@ -180,7 +190,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); @@ -190,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_read(&eb->lock); +} #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 72856f6775f7..a45bc11f8665 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(void) { struct workspace *workspace; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2104d60c2161..30eceaf829a7 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -111,8 +111,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } -static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, - u64 len) +static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) { if (file_offset + len <= entry->file_offset || entry->file_offset + entry->num_bytes <= file_offset) @@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* - * Ordered (Private2) bit indicates whether we still have + * Ordered flag indicates whether we still have * pending io unfinished for the ordered extent. * - * If there's no such bit, we need to skip to next range. + * If it's not set, we need to skip to next range. 
*/ if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; @@ -985,7 +985,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( while (1) { entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) break; if (entry->file_offset >= file_offset + len) { @@ -1114,12 +1114,12 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( } if (prev) { entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) goto out; } if (next) { entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) + if (btrfs_range_overlaps(entry, file_offset, len)) goto out; } /* No ordered extent in the range */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c297909f1506..b90fabe302e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; @@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); return 0; } @@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) /* * If a qgroup exists for a subvolume ID, it is possible * that subvolume has been deleted, in which case - * re-using that ID would lead to incorrect accounting. + * reusing that ID would lead to incorrect accounting. * * Ensure that we skip any such subvol ids. 
* @@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(fs_info, qgroup); + __del_qgroup_rb(qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); } @@ -674,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - - btrfs_mark_buffer_dirty(trans, path->nodes[0]); - btrfs_free_path(path); return ret; } @@ -753,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(trans, leaf); - btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; @@ -772,8 +766,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(trans, leaf); - ret = 0; out: btrfs_free_path(path); @@ -860,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -906,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -948,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - - btrfs_mark_buffer_dirty(trans, l); - out: btrfs_free_path(path); return ret; @@ -1122,6 +1105,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; if (simple) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); } else { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -1130,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(trans, leaf); - key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; key.offset = 0; @@ -1255,8 +1237,6 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (simple) - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); /* Skip rescan for simple qgroups. 
*/ @@ -1407,7 +1387,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; - fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; + fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -1840,9 +1820,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) * Thus its reserved space should all be zero, no matter if qgroup * is consistent or the mode. */ - WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || - qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_warn_rl(fs_info, +"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC], + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]); + + } /* * The same for rfer/excl numbers, but that's only if our qgroup is * consistent and if it's in regular qgroup mode. @@ -1851,8 +1841,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL && !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { - if (WARN_ON(qgroup->rfer || qgroup->excl || - qgroup->rfer_cmpr || qgroup->excl_cmpr)) { + if (qgroup->rfer || qgroup->excl || + qgroup->rfer_cmpr || qgroup->excl_cmpr) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -2001,20 +1992,30 @@ out: * Return <0 for insertion failure, caller can free @record safely. 
*/ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record, + u64 bytenr) { struct btrfs_qgroup_extent_record *existing, *ret; - unsigned long bytenr = record->bytenr; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); if (!btrfs_qgroup_full_accounting(fs_info)) return 1; - lockdep_assert_held(&delayed_refs->lock); - trace_btrfs_qgroup_trace_extent(fs_info, record); +#if BITS_PER_LONG == 32 + if (bytenr >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, +"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", + bytenr); + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; + } +#endif + + trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); xa_lock(&delayed_refs->dirty_extents); - existing = xa_load(&delayed_refs->dirty_extents, bytenr); + existing = xa_load(&delayed_refs->dirty_extents, index); if (existing) { if (record->data_rsv && !existing->data_rsv) { existing->data_rsv = record->data_rsv; @@ -2024,7 +2025,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 1; } - ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { qgroup_mark_inconsistent(fs_info); @@ -2056,12 +2057,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord) + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr) { - struct btrfs_backref_walk_ctx ctx = { 0 }; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_backref_walk_ctx ctx = { + .bytenr = bytenr, + .fs_info = fs_info, + }; int ret; - if (!btrfs_qgroup_full_accounting(trans->fs_info)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * We are always called in a context where we are already holding a @@ -2084,16 +2090,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, */ ASSERT(trans != NULL); - if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ctx.bytenr = qrecord->bytenr; - ctx.fs_info = trans->fs_info; - ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(trans->fs_info); - btrfs_warn(trans->fs_info, + qgroup_mark_inconsistent(fs_info); + btrfs_warn(fs_info, "error accounting new delayed refs extent (err code: %d), quota inconsistent", ret); return 0; @@ -2128,7 +2131,8 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; + const unsigned long index = (bytenr >> fs_info->sectorsize_bits); int ret; if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) @@ -2137,26 +2141,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; - if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { 
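Qgroup extent records are now keyed in the dirty_extents xarray by bytenr >> sectorsize_bits instead of the raw bytenr, and btrfs_qgroup_trace_extent_nolock() treats an existing entry at that index as already traced, only migrating data_rsv into it so the caller can free the new record. Below is a minimal userspace model of that insert-or-merge behaviour; a fixed-size table stands in for the xarray and all names and sizes are illustrative, not the kernel API.

/* Minimal model of insert-or-merge keyed by bytenr >> sectorsize_bits,
 * mirroring btrfs_qgroup_trace_extent_nolock(); not the kernel xarray API. */
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE_BITS 12              /* 4 KiB sectors, assumed for the demo */
#define NSLOTS          1024            /* toy table instead of an xarray */

struct qrecord {
    uint64_t num_bytes;
    uint64_t data_rsv;
    int used;
};

static struct qrecord table[NSLOTS];

/* Returns 1 if an entry already existed (caller frees its record), 0 if stored. */
static int trace_extent(uint64_t bytenr, uint64_t num_bytes, uint64_t data_rsv)
{
    uint64_t index = bytenr >> SECTORSIZE_BITS;
    struct qrecord *slot = &table[index % NSLOTS];

    if (slot->used) {
        if (data_rsv && !slot->data_rsv)
            slot->data_rsv = data_rsv;  /* migrate reservation to existing entry */
        return 1;
    }
    slot->used = 1;
    slot->num_bytes = num_bytes;
    slot->data_rsv = data_rsv;
    return 0;
}

int main(void)
{
    printf("first insert:  %d\n", trace_extent(1 << 20, 4096, 0));    /* 0: stored */
    printf("second insert: %d\n", trace_extent(1 << 20, 4096, 512));  /* 1: merged */
    printf("slot data_rsv: %llu\n",
           (unsigned long long)table[(1 << 20) >> SECTORSIZE_BITS].data_rsv);
    return 0;
}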
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { kfree(record); return -ENOMEM; } - delayed_refs = &trans->transaction->delayed_refs; - record->bytenr = bytenr; record->num_bytes = num_bytes; - record->old_roots = NULL; - spin_lock(&delayed_refs->lock); - ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); - spin_unlock(&delayed_refs->lock); + ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ - xa_release(&delayed_refs->dirty_extents, record->bytenr); + xa_release(&delayed_refs->dirty_extents, index); kfree(record); return 0; } - return btrfs_qgroup_trace_extent_post(trans, record); + return btrfs_qgroup_trace_extent_post(trans, record, bytenr); } /* @@ -2641,7 +2640,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, if (!extent_buffer_uptodate(root_eb)) { struct btrfs_tree_parent_check check = { - .has_first_key = false, .transid = root_gen, .level = root_level }; @@ -3032,14 +3030,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; xa_for_each(&delayed_refs->dirty_extents, index, record) { + const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); + num_dirty_extents++; - trace_btrfs_qgroup_account_extents(fs_info, record); + trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { struct btrfs_backref_walk_ctx ctx = { 0 }; - ctx.bytenr = record->bytenr; + ctx.bytenr = bytenr; ctx.fs_info = fs_info; /* @@ -3081,7 +3081,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) ulist_del(record->old_roots, qgroup_to_skip, 0); } - ret = btrfs_qgroup_account_extent(trans, record->bytenr, + ret = btrfs_qgroup_account_extent(trans, bytenr, record->num_bytes, record->old_roots, new_roots); @@ -4185,13 +4185,20 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - btrfs_run_delayed_iputs(root->fs_info); - btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, NULL); + /* + * After waiting for ordered extents run delayed iputs in order to free + * space from unlinked files before committing the current transaction, + * as ordered extents may have been holding the last reference of an + * inode and they add a delayed iput when they complete. 
+ */ + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); + ret = btrfs_commit_current_transaction(root); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); @@ -4676,8 +4683,7 @@ out: * BOTH POINTERS ARE BEFORE TREE SWAP * @last_snapshot: last snapshot generation of the subvolume tree */ -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -4883,17 +4889,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) xa_destroy(&trans->delayed_refs.dirty_extents); } -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) -{ - if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) - return; - - if (!is_fstree(root)) - return; - - btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); -} - int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 98adf4ec7b01..e233cc79af18 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -121,11 +121,18 @@ struct btrfs_inode; #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) +#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3) + /* * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - u64 bytenr; + /* + * The bytenr of the extent is given by its index in the dirty_extents + * xarray of struct btrfs_delayed_ref_root left shifted by + * fs_info->sectorsize_bits. 
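The struct comment above describes the new encoding: a record's bytenr is no longer stored in the record but derived from its slot in the dirty_extents xarray. A minimal standalone sketch of that mapping (assuming 4 KiB sectors and demo-only names; the real code uses fs_info->sectorsize_bits and returns -EOVERFLOW when the index would not fit on a 32-bit host) could look like:

/*
 * Illustrative only: a userspace sketch of the index <-> bytenr mapping used
 * for the dirty_extents xarray. Sector size and the index limit below are
 * demo assumptions, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>
#include <limits.h>

#define DEMO_SECTORSIZE_BITS	12		/* assume 4 KiB sectors */
#define DEMO_MAX_INDEX		ULONG_MAX	/* an xarray index is an unsigned long */

static int bytenr_to_index(uint64_t bytenr, unsigned long *index)
{
	uint64_t idx = bytenr >> DEMO_SECTORSIZE_BITS;

	/* On 32-bit hosts the index would overflow for very large bytenrs. */
	if (idx > DEMO_MAX_INDEX)
		return -1;			/* the kernel returns -EOVERFLOW here */
	*index = (unsigned long)idx;
	return 0;
}

static uint64_t index_to_bytenr(unsigned long index)
{
	return (uint64_t)index << DEMO_SECTORSIZE_BITS;
}

int main(void)
{
	uint64_t bytenr = 1048576;		/* 1 MiB, sector aligned */
	unsigned long index;

	if (bytenr_to_index(bytenr, &index) == 0)
		printf("bytenr %llu -> index %lu -> bytenr %llu\n",
		       (unsigned long long)bytenr, index,
		       (unsigned long long)index_to_bytenr(index));
	return 0;
}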
+ */ + u64 num_bytes; /* @@ -343,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); + struct btrfs_qgroup_extent_record *record, + u64 bytenr); int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, - struct btrfs_qgroup_extent_record *qrecord); + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr); int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, @@ -430,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( struct btrfs_qgroup_swapped_blocks *swapped_blocks); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); -int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *subvol_root, +int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, @@ -440,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); -void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, const struct btrfs_squota_delta *delta); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 4c859b550f6c..1834011ccc49 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,6 +13,56 @@ #include "volumes.h" #include "print-tree.h" +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + const struct btrfs_key *oldkey, + u64 newlen, u64 frontpad) +{ + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; + struct extent_buffer *leaf; + int slot; + size_t item_size; + struct btrfs_key newkey = { + .objectid = oldkey->objectid + frontpad, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = newlen, + }; + int ret; + + ASSERT(newlen > 0); + ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); + + leaf = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); + } + + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; +} + int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -26,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le int slot; int ret; - if (!stripe_root) + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root) return 0; + if 
(!btrfs_is_testing(fs_info)) { + struct btrfs_chunk_map *map; + bool use_rst; + + map = btrfs_find_chunk_map(fs_info, start, length); + if (!map) + return -EINVAL; + use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + btrfs_free_chunk_map(map); + if (!use_rst) + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -36,23 +99,55 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le while (1) { key.objectid = start; key.type = BTRFS_RAID_STRIPE_KEY; - key.offset = length; + key.offset = 0; ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); if (ret < 0) break; - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; + + if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) path->slots[0]--; - } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); found_start = key.objectid; found_end = found_start + key.offset; + ret = 0; + + /* + * The stripe extent starts before the range we want to delete, + * but the range spans more than one stripe extent: + * + * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to get the previous item, truncate its + * length and then restart the search. + */ + if (found_start > start) { + if (slot == 0) { + ret = btrfs_previous_item(stripe_root, path, start, + BTRFS_RAID_STRIPE_KEY); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } + } else { + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + ASSERT(found_start <= start); + } + + if (key.type != BTRFS_RAID_STRIPE_KEY) + break; /* That stripe ends before we start, we're done. */ if (found_end <= start) @@ -61,7 +156,98 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le trace_btrfs_raid_extent_delete(fs_info, start, end, found_start, found_end); - ASSERT(found_start >= start && found_end <= end); + /* + * The stripe extent starts before the range we want to delete + * and ends after the range we want to delete, i.e. we're + * punching a hole in the stripe extent: + * + * |--- RAID Stripe Extent ---| + * | keep |--- drop ---| keep | + * + * This means we need to a) truncate the existing item and b) + * create a second item for the remaining range. + */ + if (found_start < start && found_end > end) { + size_t item_size; + u64 diff_start = start - found_start; + u64 diff_end = found_end - end; + struct btrfs_stripe_extent *extent; + struct btrfs_key newkey = { + .objectid = end, + .type = BTRFS_RAID_STRIPE_KEY, + .offset = diff_end, + }; + + /* The "right" item. */ + ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey); + if (ret) + break; + + item_size = btrfs_item_size(leaf, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_stripe_extent); + + for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { + struct btrfs_raid_stride *stride = &extent->strides[i]; + u64 phys; + + phys = btrfs_raid_stride_physical(leaf, stride); + phys += diff_start + length; + btrfs_set_raid_stride_physical(leaf, stride, phys); + } + + /* The "left" item. 
*/ + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + break; + } + + /* + * The stripe extent starts before the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- keep ---|--- drop ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_start < start) { + u64 diff_start = start - found_start; + + btrfs_partially_delete_raid_extent(trans, path, &key, + diff_start, 0); + + start += (key.offset - diff_start); + length -= (key.offset - diff_start); + if (length == 0) + break; + + btrfs_release_path(path); + continue; + } + + /* + * The stripe extent ends after the range we want to delete: + * + * |--- RAID Stripe Extent ---| + * |--- drop ---|--- keep ---| + * + * This means we have to duplicate the tree item, truncate the + * length to the new size and then re-insert the item. + */ + if (found_end > end) { + u64 diff_end = found_end - end; + + btrfs_partially_delete_raid_extent(trans, path, &key, + key.offset - length, + length); + ASSERT(key.offset - diff_end == length); + break; + } + + /* Finally we can delete the whole item, no more special cases. */ ret = btrfs_del_item(trans, stripe_root, path); if (ret) break; @@ -102,14 +288,14 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot), item_size); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return ret; } -static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, - struct btrfs_io_context *bioc) +EXPORT_FOR_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key stripe_key; @@ -131,12 +317,8 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, for (int i = 0; i < num_stripes; i++) { u64 devid = bioc->stripes[i].dev->devid; u64 physical = bioc->stripes[i].physical; - u64 length = bioc->stripes[i].length; struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; - if (length == 0) - length = bioc->size; - btrfs_set_stack_raid_stride_devid(raid_stride, devid); btrfs_set_stack_raid_stride_physical(raid_stride, physical); } @@ -233,7 +415,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, found_end = found_logical + found_length; if (found_logical > end) { - ret = -ENOENT; + ret = -ENODATA; goto out; } @@ -279,10 +461,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, } /* If we're here, we haven't found the requested devid in the stripe. 
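The comments above enumerate how a deletion range can overlap a stripe extent: keep the front, keep the tail, or punch a hole and keep both sides. A self-contained sketch of just that interval arithmetic (demo-only names, no btree item manipulation) might be:

/*
 * Illustrative only: which pieces of a stripe extent [found_start, found_end)
 * survive a deletion of [start, end). The on-disk work done by
 * btrfs_del_item()/btrfs_duplicate_item() is omitted.
 */
#include <stdint.h>
#include <stdio.h>

static void split_stripe_extent(uint64_t found_start, uint64_t found_end,
				uint64_t start, uint64_t end)
{
	if (found_end <= start || found_start >= end) {
		printf("no overlap: keep [%llu, %llu)\n",
		       (unsigned long long)found_start,
		       (unsigned long long)found_end);
		return;
	}
	if (found_start < start)		/* front piece survives */
		printf("keep front [%llu, %llu)\n",
		       (unsigned long long)found_start,
		       (unsigned long long)start);
	if (found_end > end)			/* tail piece survives */
		printf("keep tail  [%llu, %llu)\n",
		       (unsigned long long)end,
		       (unsigned long long)found_end);
	if (found_start >= start && found_end <= end)
		printf("drop whole extent [%llu, %llu)\n",
		       (unsigned long long)found_start,
		       (unsigned long long)found_end);
}

int main(void)
{
	split_stripe_extent(0, 128, 32, 96);	/* hole punch: both sides kept */
	split_stripe_extent(0, 128, 64, 256);	/* only the front is kept */
	split_stripe_extent(64, 128, 0, 256);	/* whole extent dropped */
	return 0;
}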
*/ - ret = -ENOENT; + ret = -ENODATA; out: if (ret > 0) - ret = -ENOENT; + ret = -ENODATA; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 1ac1c21aac2f..541836421778 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered_extent); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc); +#endif + static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, u64 map_type) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 39bec672df0c..cdd373c27784 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) static void assert_rbio(struct btrfs_raid_bio *rbio) { - if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || - !IS_ENABLED(CONFIG_BTRFS_ASSERT)) + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9522a8b79d22..2928abf7eb82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -857,6 +857,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + rb_erase(&ref->node, &be->refs); kfree(ref); kfree(ra); goto out_unlock; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f3834f8d26b4..af0969b70b53 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, if (cur == node) ret = true; - /* The node is the lowest node */ - if (cur->lowest) { - list_del_init(&cur->lower); - cur->lowest = 0; - } - /* Cleanup the lower edges */ while (!list_empty(&cur->lower)) { struct btrfs_backref_edge *edge; @@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cache to avoid unnecessary backref lookup. */ if (cur->level > 0) { - list_add(&cur->list, &cache->detached); cur->detached = 1; } else { rb_erase(&cur->rb_node, &cache->rb_root); @@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( goto out; } - node->lowest = 1; cur = node; /* Breadth-first search to build backref cache */ @@ -470,92 +462,6 @@ out: } /* - * helper to add backref node for the newly created snapshot. 
- * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - const struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct btrfs_backref_cache *cache = &rc->backref_cache; - struct btrfs_backref_node *node = NULL; - struct btrfs_backref_node *new_node; - struct btrfs_backref_edge *edge; - struct btrfs_backref_edge *new_edge; - struct rb_node *rb_node; - - rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = rb_simple_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct btrfs_backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = btrfs_backref_alloc_node(cache, dest->node->start, - node->level); - if (!new_node) - return -ENOMEM; - - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = btrfs_grab_root(dest); - ASSERT(new_node->root); - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = btrfs_backref_alloc_edge(cache); - if (!new_edge) - goto fail; - - btrfs_backref_link_edge(new_edge, edge->node[LOWER], - new_node, LINK_UPPER); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - &new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct btrfs_backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - btrfs_backref_free_edge(cache, new_edge); - } - btrfs_backref_free_node(cache, new_node); - return -ENOMEM; -} - -/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -950,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, u32 i; int ret = 0; int first = 1; - int dirty = 0; if (rc->stage != UPDATE_DATA_PTRS) return 0; @@ -1030,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); ref.action = BTRFS_ADD_DELAYED_REF; @@ -1061,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } } - if (dirty) - btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(inode); return ret; @@ -1244,7 +1146,7 @@ again: * The real subtree rescan is delayed until we have new * CoW on the subtree root node before transaction commit. 
*/ - ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + ret = btrfs_qgroup_add_swapped_blocks(dest, rc->block_group, parent, slot, path->nodes[level], path->slots[level], last_snapshot); @@ -1255,13 +1157,11 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(trans, path->nodes[level]); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = old_bytenr; @@ -2058,100 +1958,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, int index = 0; int ret; - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; + next = walk_up_backref(node, edges, &index); + root = next->root; - /* - * If there is no root, then our references for this block are - * incomplete, as we should be able to walk all the way up to a - * block that is owned by a root. - * - * This path is only for SHAREABLE roots, so if we come upon a - * non-SHAREABLE root then we have backrefs that resolve - * improperly. - * - * Both of these cases indicate file system corruption, or a bug - * in the backref walking code. - */ - if (!root) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu doesn't have a backref path ending in a root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - ASSERT(0); - btrfs_err(trans->fs_info, - "bytenr %llu has multiple refs with one ending in a non-shareable root", - node->bytenr); - return ERR_PTR(-EUCLEAN); - } - - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { - ret = record_reloc_root_in_trans(trans, root); - if (ret) - return ERR_PTR(ret); - break; - } + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a block + * that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve improperly. + * + * Both of these cases indicate file system corruption, or a bug in the + * backref walking code. + */ + if (unlikely(!root)) { + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } - ret = btrfs_record_root_in_trans(trans, root); + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { + ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); - root = root->reloc_root; - - /* - * We could have raced with another thread which failed, so - * root->reloc_root may not be set, return ENOENT in this case. - */ - if (!root) - return ERR_PTR(-ENOENT); + goto found; + } - if (next->new_bytenr != root->node->start) { - /* - * We just created the reloc root, so we shouldn't have - * ->new_bytenr set and this shouldn't be in the changed - * list. If it is then we have multiple roots pointing - * at the same bytenr which indicates corruption, or - * we've made a mistake in the backref walking code. 
- */ - ASSERT(next->new_bytenr == 0); - ASSERT(list_empty(&next->list)); - if (next->new_bytenr || !list_empty(&next->list)) { - btrfs_err(trans->fs_info, - "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", - node->bytenr, next->bytenr); - return ERR_PTR(-EUCLEAN); - } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); + root = root->reloc_root; - next->new_bytenr = root->node->start; - btrfs_put_root(next->root); - next->root = btrfs_grab_root(root); - ASSERT(next->root); - list_add_tail(&next->list, - &rc->backref_cache.changed); - mark_block_processed(rc, next); - break; - } + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) { + if (next->new_bytenr) { /* - * This can happen if there's fs corruption or if there's a bug - * in the backref lookup code. + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set yet. If it is then we have multiple roots + * pointing at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. */ - ASSERT(0); - return ERR_PTR(-ENOENT); + ASSERT(next->new_bytenr == 0); + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); } + next->new_bytenr = root->node->start; + btrfs_put_root(next->root); + next->root = btrfs_grab_root(root); + ASSERT(next->root); + mark_block_processed(rc, next); +found: next = node; /* setup backref node path for btrfs_reloc_cow_block */ while (1) { @@ -2247,17 +2119,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, return num_bytes; } -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_backref_node *node) +static int refill_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, u64 num_bytes) { - struct btrfs_root *root = rc->extent_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 num_bytes; + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - u64 tmp; - - num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2270,7 +2136,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2288,6 +2155,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, return 0; } +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_backref_node *node) +{ + u64 num_bytes; + + num_bytes = calcu_metadata_size(rc, node) * 2; + return refill_metadata_space(trans, rc, num_bytes); +} + /* * relocate a block tree, and then update pointers in upper level * blocks that reference the block to point to the new location. 
@@ -2442,7 +2319,7 @@ next: if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); + list_del_init(&node->list); node->pending = 0; } @@ -2605,8 +2482,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, /* * This block was the root block of a root, and this is * the first time we're processing the block and thus it - * should not have had the ->new_bytenr modified and - * should have not been included on the changed list. + * should not have had the ->new_bytenr modified. * * However in the case of corruption we could have * multiple refs pointing to the same block improperly, @@ -2616,8 +2492,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, * normal user in the case of corruption. */ ASSERT(node->new_bytenr == 0); - ASSERT(list_empty(&node->list)); - if (node->new_bytenr || !list_empty(&node->list)) { + if (node->new_bytenr) { btrfs_err(root->fs_info, "bytenr %llu has improper references to it", node->bytenr); @@ -2640,17 +2515,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, btrfs_put_root(node->root); node->root = btrfs_grab_root(root); ASSERT(node->root); - list_add_tail(&node->list, &rc->backref_cache.changed); } else { - path->lowest_level = node->level; - if (root == root->fs_info->chunk_root) - btrfs_reserve_chunk_metadata(trans, false); - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (root == root->fs_info->chunk_root) - btrfs_trans_release_chunk_metadata(trans); - if (ret > 0) - ret = 0; + btrfs_err(root->fs_info, + "bytenr %llu resolved to a non-shareable root", + node->bytenr); + ret = -EUCLEAN; + goto out; } if (!ret) update_processed_blocks(rc, node); @@ -2658,11 +2528,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = do_relocation(trans, rc, node, key, path, 1); } out: - if (ret || node->level == 0 || node->cowonly) + if (ret || node->level == 0) btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } +static int relocate_cowonly_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct tree_block *block, + struct btrfs_path *path) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + u64 num_bytes; + int nr_levels; + int ret; + + root = btrfs_get_fs_root(fs_info, block->owner, true); + if (IS_ERR(root)) + return PTR_ERR(root); + + nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; + + num_bytes = fs_info->nodesize * nr_levels; + ret = refill_metadata_space(trans, rc, num_bytes); + if (ret) { + btrfs_put_root(root); + return ret; + } + path->lowest_level = block->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); + + ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); + path->lowest_level = 0; + btrfs_release_path(path); + + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); + if (ret > 0) + ret = 0; + btrfs_put_root(root); + + return ret; +} + /* * relocate a list of blocks */ @@ -2702,6 +2611,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { + /* + * For COWonly blocks, or the data reloc tree, we only need to + * COW down to the block, there's no need to generate a backref + * tree. 
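A rough standalone illustration of the two decisions made for such blocks, whether the backref tree can be skipped and how much metadata space to reserve for the COW walk, under simplified assumptions (the is_fstree() test and the constants here are stand-ins, not the kernel definitions):

/*
 * Illustrative only: COW-only classification and reservation sizing, modeled
 * after the relocate_cowonly_block() logic above with demo constants.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DATA_RELOC_TREE_OBJECTID	(-9ULL)	/* stand-in value */
#define DEMO_FIRST_FREE_OBJECTID	256ULL
#define DEMO_NODESIZE			16384ULL

static bool demo_is_fstree(uint64_t owner)
{
	/* Simplified: subvolume trees have large objectids in the real code. */
	return owner >= DEMO_FIRST_FREE_OBJECTID &&
	       owner != DEMO_DATA_RELOC_TREE_OBJECTID;
}

static bool is_cowonly_candidate(uint64_t owner)
{
	return owner && (!demo_is_fstree(owner) ||
			 owner == DEMO_DATA_RELOC_TREE_OBJECTID);
}

static uint64_t cowonly_reserve_bytes(int root_level, int block_level)
{
	int nr_levels = (root_level - block_level > 0 ?
			 root_level - block_level : 0) + 1;

	/* One node per level that may need to be COWed on the way down. */
	return DEMO_NODESIZE * (uint64_t)nr_levels;
}

int main(void)
{
	printf("extent tree block (owner 2): cowonly=%d\n", is_cowonly_candidate(2));
	printf("subvolume block (owner 260): cowonly=%d\n", is_cowonly_candidate(260));
	printf("reserve for root level 3, block level 1: %llu bytes\n",
	       (unsigned long long)cowonly_reserve_bytes(3, 1));
	return 0;
}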
+ */ + if (block->owner && + (!is_fstree(block->owner) || + block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { + ret = relocate_cowonly_block(trans, rc, block, path); + if (ret) + break; + continue; + } + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { @@ -2902,6 +2825,7 @@ static int relocate_one_folio(struct reloc_control *rc, const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); +again: folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { @@ -2937,11 +2861,16 @@ static int relocate_one_folio(struct reloc_control *rc, ret = -EIO; goto release_folio; } + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } /* * We could have lost folio private when we dropped the lock to read the - * folio above, make sure we set_page_extent_mapped here so we have any + * folio above, make sure we set_folio_extent_mapped() here so we have any * of the subpage blocksize stuff we need in place. */ ret = set_folio_extent_mapped(folio); @@ -3793,7 +3722,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -4399,8 +4327,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. 
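The hunk above replaces a BUG_ON() with a logged error return. A small userspace sketch of the same consistency check, with demo-only structures and -EUCLEAN reused purely to mirror the kernel's corruption-reporting convention:

/*
 * Illustrative only: validate a cached backref node against the buffer it is
 * supposed to describe, and report instead of crashing on a mismatch.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct demo_backref_node {
	uint64_t bytenr;	/* location the cache thinks the block is at */
	uint64_t new_bytenr;	/* location after relocation, if any */
};

static int demo_check_cached_node(const struct demo_backref_node *node,
				  uint64_t buf_start)
{
	if (node->bytenr != buf_start && node->new_bytenr != buf_start) {
		fprintf(stderr,
			"bytenr %llu found but cache expected %llu or %llu\n",
			(unsigned long long)buf_start,
			(unsigned long long)node->bytenr,
			(unsigned long long)node->new_bytenr);
		return -EUCLEAN;	/* report inconsistency instead of crashing */
	}
	return 0;
}

int main(void)
{
	struct demo_backref_node node = { .bytenr = 4096, .new_bytenr = 8192 };

	printf("match: %d\n", demo_check_cached_node(&node, 8192));
	printf("mismatch: %d\n", demo_check_cached_node(&node, 12288));
	return 0;
}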
+ */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + } btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); @@ -4500,10 +4438,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return ret; } new_root->reloc_root = btrfs_grab_root(reloc_root); - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; + return 0; } /* diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 33962671a96c..e22e6b06927a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -447,7 +446,6 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3a3427428074..531312efee8d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; + if (unlikely(!extent_root)) { + btrfs_err(fs_info, "no valid extent root for scrub"); + return -EUCLEAN; + } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * stripe->nr_sectors); scrub_stripe_reset_bitmaps(stripe); @@ -1656,8 +1660,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) stripe->bg->start + stripe->bg->length - stripe->logical); } -static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) +static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; @@ -1704,8 +1707,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); if (err < 0) { - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + if (err != -ENODATA) { + /* + * Earlier btrfs_get_raid_extent_offset() + * returned -ENODATA, which means there's + * no entry for the corresponding range + * in the stripe tree. But if it's in + * the extent tree, then it's a preallocated + * extent and not an error. + */ + set_bit(i, &stripe->io_error_bitmap); + set_bit(i, &stripe->error_bitmap); + } continue; } @@ -1743,7 +1756,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { - scrub_submit_extent_sector_read(sctx, stripe); + scrub_submit_extent_sector_read(stripe); return; } @@ -1954,7 +1967,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); /* - * For data stripe search, we cannot re-use the same extent/csum paths, + * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. 
*/ @@ -2103,7 +2116,6 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2222,7 +2234,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + ret = scrub_simple_mirror(sctx, bg, cur_logical, BTRFS_STRIPE_LEN, device, cur_physical, mirror_num); if (ret) @@ -2256,7 +2268,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - int stop_loop = 0; /* Extent_path should be released by now. */ ASSERT(sctx->extent_path.nodes[0] == NULL); @@ -2307,7 +2318,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; @@ -2362,7 +2373,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; @@ -2370,14 +2381,8 @@ next: logical += increment; physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); - if (stop_loop) - sctx->stat.last_physical = - map->stripes[stripe_index].physical + dev_stripe_len; - else - sctx->stat.last_physical = physical; + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); - if (stop_loop) - break; } out: ret2 = flush_scrub_stripes(sctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 27306d98ec43..f437138fefbc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) return ret; } -typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, - struct fs_path *p, - void *ctx); +typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); /* * Helper function to iterate the entries in ONE btrfs_inode_ref or @@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, u32 name_len; char *start; int ret = 0; - int num = 0; - int index; u64 dir; unsigned long name_off; unsigned long elem_size; @@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, iref = (struct btrfs_inode_ref *)(ptr + cur); name_len = btrfs_inode_ref_name_len(eb, iref); name_off = (unsigned long)(iref + 1); - index = btrfs_inode_ref_index(eb, iref); dir = found_key->offset; } else { extref = (struct btrfs_inode_extref *)(ptr + cur); name_len = btrfs_inode_extref_name_len(eb, extref); name_off = (unsigned long)&extref->name; - index = btrfs_inode_extref_index(eb, extref); dir = btrfs_inode_extref_parent(eb, extref); } @@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } cur += elem_size + name_len; - ret = iterate(num, dir, index, p, ctx); + ret = iterate(dir, p, ctx); if (ret) goto out; - num++; } out: @@ -1227,8 +1220,7 @@ out: return ret; } -static int __copy_first_ref(int num, u64 
dir, int index, - struct fs_path *p, void *ctx) +static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) { int ret; struct fs_path *pt = ctx; @@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { - struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + di = btrfs_match_dir_item_name(path, parent_ref->name, parent_ref->name_len); if (!di) { ret = 0; @@ -4708,8 +4699,7 @@ out: return ret; } -static int record_new_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4738,8 +4728,7 @@ out: return ret; } -static int record_deleted_ref_if_needed(int num, u64 dir, int index, - struct fs_path *name, void *ctx) +static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { int ret = 0; struct send_ctx *sctx = ctx; @@ -5291,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); +again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5323,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) ret = -EIO; break; } + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } } memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, @@ -5677,10 +5672,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), disk_bytenr, disk_num_bytes, sctx->send_buf_pages + - (data_offset >> PAGE_SHIFT)); + (data_offset >> PAGE_SHIFT), + NULL); if (ret) goto out; @@ -7190,13 +7186,11 @@ static int changed_extent(struct send_ctx *sctx, static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { - int ret = 0; - if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) sctx->cur_inode_needs_verity = true; } - return ret; + return 0; } static int dir_changed(struct send_ctx *sctx, u64 dir) @@ -7265,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path, enum btrfs_compare_tree_result result, struct send_ctx *sctx) { - int ret = 0; + int ret; /* * We can not hold the commit root semaphore here. This is because in @@ -7325,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path, return 0; } result = BTRFS_COMPARE_TREE_CHANGED; - ret = 0; } sctx->left_path = left_path; @@ -8137,7 +8130,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); - if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started. 
+ */ + if (btrfs_root_dead(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + /* Userspace tools do the checks and warn the user if it's not RO. */ + if (!btrfs_root_readonly(send_root)) { + spin_unlock(&send_root->root_item_lock); + return -EPERM; + } + if (send_root->dedupe_in_progress) { dedupe_in_progress_warn(send_root); spin_unlock(&send_root->root_item_lock); return -EAGAIN; @@ -8146,15 +8152,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a spin_unlock(&send_root->root_item_lock); /* - * Userspace tools do the checks and warn the user if it's - * not RO. - */ - if (!btrfs_root_readonly(send_root)) { - ret = -EPERM; - goto out; - } - - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside * access_ok. Also set an upper limit for allocation size so this can't @@ -8219,15 +8216,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a } sctx->send_root = send_root; - /* - * Unlikely but possible, if the subvolume is marked for deletion but - * is slow to remove the directory entry, send can still be started - */ - if (btrfs_root_dead(sctx->send_root)) { - ret = -EPERM; - goto out; - } - sctx->clone_roots_cnt = arg->clone_sources_count; if (sctx->proto >= 2) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b07f4aa66878..9309886c5ea1 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" /* Conditional support for the upcoming protocol version. */ -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL #define BTRFS_SEND_STREAM_VERSION 3 #else #define BTRFS_SEND_STREAM_VERSION 2 diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d5a9cd8a4fd8..a341d087567a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -14,6 +14,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -127,6 +128,14 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RESET_ZONES + * This state works only for the zoned mode. On the zoned mode, we cannot + * reuse once allocated then freed region until we reset the zone, due to + * the sequential write zone requirement. The RESET_ZONES state resets the + * zones of an unused block group and let us reuse the space. The reusing + * is faster than removing the block group and allocating another block + * group on the zones. 
+ * * ALLOC_CHUNK * We will skip this the first time through space reservation, because of * overcommit and we don't want to have a lot of useless metadata space when @@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, found->bytes_used += block_group->used; found->disk_used += block_group->used * factor; found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); + btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable); if (block_group->length > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -489,9 +498,7 @@ again: if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; @@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, */ ret = btrfs_commit_current_transaction(root); break; + case RESET_ZONES: + ret = btrfs_reset_unused_block_groups(space_info, num_bytes); + break; default: ret = -ENOSPC; break; @@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; + enum btrfs_flush_state final_state; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + if (btrfs_is_zoned(fs_info)) + final_state = RESET_ZONES; + else + final_state = COMMIT_TRANS; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); @@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) flush_state++; - if (flush_state > COMMIT_TRANS) { + if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { if (maybe_fail_all_tickets(fs_info, space_info)) { @@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } } spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); + } while (flush_state <= final_state); } /* @@ -1279,13 +1294,17 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * If we are freeing inodes, we want to make sure all delayed iputs have * completed, because they could have been on an inode with i_nlink == 0, and * thus have been truncated and freed up space. But again this space is not - * immediately re-usable, it comes in the form of a delayed ref, which must be + * immediately reusable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * * COMMIT_TRANS * This is where we reclaim all of the pinned space generated by running the * iputs * + * RESET_ZONES + * This state works only for the zoned mode. We scan the unused block group + * list and reset the zones and reuse the block group. 
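One way to picture the ordering requirement: the reclaim loop simply walks the enum upward, and on zoned filesystems the final state becomes RESET_ZONES instead of COMMIT_TRANS. A simplified, self-contained model of that walk (demo enum values and names, not the kernel's flush machinery):

/*
 * Illustrative only: the flush-state walk with a final state that depends on
 * whether the filesystem is zoned, mirroring the final_state logic above.
 */
#include <stdbool.h>
#include <stdio.h>

enum demo_flush_state {
	DEMO_FLUSH_DELAYED_ITEMS = 1,
	DEMO_FLUSH_DELALLOC,
	DEMO_ALLOC_CHUNK,
	DEMO_RUN_DELAYED_IPUTS,
	DEMO_COMMIT_TRANS,
	DEMO_RESET_ZONES,	/* only useful on zoned devices */
};

static const char *demo_state_name(enum demo_flush_state s)
{
	static const char *names[] = {
		[DEMO_FLUSH_DELAYED_ITEMS] = "flush delayed items",
		[DEMO_FLUSH_DELALLOC] = "flush delalloc",
		[DEMO_ALLOC_CHUNK] = "allocate chunk",
		[DEMO_RUN_DELAYED_IPUTS] = "run delayed iputs",
		[DEMO_COMMIT_TRANS] = "commit transaction",
		[DEMO_RESET_ZONES] = "reset unused zones",
	};
	return names[s];
}

static void demo_reclaim(bool zoned)
{
	enum demo_flush_state final_state = zoned ? DEMO_RESET_ZONES : DEMO_COMMIT_TRANS;

	for (enum demo_flush_state s = DEMO_FLUSH_DELAYED_ITEMS; s <= final_state; s++)
		printf("  step %d: %s\n", s, demo_state_name(s));
}

int main(void)
{
	printf("regular filesystem:\n");
	demo_reclaim(false);
	printf("zoned filesystem:\n");
	demo_reclaim(true);
	return 0;
}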
+ * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full * before, and then the transaction commit could have freed new block groups, @@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RESET_ZONES, ALLOC_CHUNK_FORCE, }; @@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) static const enum btrfs_flush_state priority_flush_states[] = { FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS, + RESET_ZONES, ALLOC_CHUNK, }; @@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, + RESET_ZONES, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, @@ -1488,8 +1510,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void wait_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { @@ -1547,7 +1568,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: - wait_reserve_ticket(fs_info, space_info, ticket); + wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: priority_reclaim_metadata_space(fs_info, space_info, ticket, @@ -1691,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1704,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { used = btrfs_space_info_used(space_info, false); if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); + btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } } @@ -1984,8 +2003,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2081,6 +2099,35 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) if (!btrfs_should_periodic_reclaim(space_info)) continue; for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) - do_reclaim_sweep(fs_info, space_info, raid); + do_reclaim_sweep(space_info, raid); + } +} + +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + lockdep_assert_held(&space_info->lock); + + /* Prioritize the global reservation to receive the freed space. 
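A standalone sketch of the policy this helper implements, top up a fixed-size reserve first and hand whatever is left to waiting tickets, using demo structures in place of btrfs_block_rsv and the space_info ticket list:

/*
 * Illustrative only: fill the "global" reserve before granting freed space to
 * ordinary waiters. The kernel version runs under space_info->lock and calls
 * btrfs_try_granting_tickets() with the remainder.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_reserve {
	uint64_t size;		/* target size of the reserve */
	uint64_t reserved;	/* currently reserved bytes */
};

/* Returns how many bytes are left over for ordinary waiters. */
static uint64_t demo_return_free_space(struct demo_reserve *global, uint64_t len)
{
	if (global->reserved < global->size) {
		uint64_t to_add = global->size - global->reserved;

		if (to_add > len)
			to_add = len;
		global->reserved += to_add;
		len -= to_add;
	}
	return len;	/* hand the rest to pending tickets */
}

int main(void)
{
	struct demo_reserve global = { .size = 1024, .reserved = 900 };
	uint64_t leftover = demo_return_free_space(&global, 512);

	printf("global reserve now %llu/%llu, %llu bytes left for tickets\n",
	       (unsigned long long)global.reserved,
	       (unsigned long long)global.size,
	       (unsigned long long)leftover);
	return 0;
}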
*/ + if (global_rsv->space_info != space_info) + goto grant; + + spin_lock(&global_rsv->lock); + if (!global_rsv->full) { + u64 to_add = min(len, global_rsv->size - global_rsv->reserved); + + global_rsv->reserved += to_add; + btrfs_space_info_update_bytes_may_use(space_info, to_add); + if (global_rsv->reserved >= global_rsv->size) + global_rsv->full = 1; + len -= to_add; } + spin_unlock(&global_rsv->lock); + +grant: + /* Add to any tickets we may have. */ + if (len) + btrfs_try_granting_tickets(fs_info, space_info); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index efbecc0c5258..a96efdb5e681 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_EMERGENCY, }; +/* + * Please be aware that the order of enum values will be the order of the reclaim + * process in btrfs_async_reclaim_metadata_space(). + */ enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, @@ -91,6 +95,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, + RESET_ZONES = 12, }; struct btrfs_space_info { @@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i */ #define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \ static inline void \ -btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ +btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + struct btrfs_fs_info *fs_info = sinfo->fs_info; \ const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \ lockdep_assert_held(&sinfo->lock); \ trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ @@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( - struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes) { spin_lock(&space_info->lock); - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); + btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, @@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); +void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index fe4d719d506b..722acf768396 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); spin_lock_init(&ret->lock); - if (type == BTRFS_SUBPAGE_METADATA) { + if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&ret->eb_refs, 0); - } else { - atomic_set(&ret->readers, 0); - atomic_set(&ret->writers, 0); - } + else + atomic_set(&ret->nr_locked, 0); return ret; } @@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, __start_bit; \ }) -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct 
btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - /* - * Even though it's just for reading the page, no one should have - * locked the subpage range. - */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - atomic_add(nbits, &subpage->readers); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = len >> fs_info->sectorsize_bits; - unsigned long flags; - bool is_data; - bool last; - - btrfs_subpage_assert(fs_info, folio, start, len); - is_data = is_data_inode(BTRFS_I(folio->mapping->host)); - - spin_lock_irqsave(&subpage->lock, flags); - - /* The range should have already been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - ASSERT(atomic_read(&subpage->readers) >= nbits); - - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->readers); - - /* - * For data we need to unlock the page if the last read has finished. - * - * And please don't replace @last with atomic_sub_and_test() call - * inside if () condition. - * As we want the atomic_sub_and_test() to be always executed. - */ - if (is_data && last) - folio_unlock(folio); - spin_unlock_irqrestore(&subpage->lock, flags); -} - static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; @@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) orig_start + orig_len) - *start; } -static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); - const int nbits = (len >> fs_info->sectorsize_bits); - unsigned long flags; - int ret; - - btrfs_subpage_assert(fs_info, folio, start, len); - - spin_lock_irqsave(&subpage->lock, flags); - ASSERT(atomic_read(&subpage->readers) == 0); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret == nbits); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); @@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf * extent_clear_unlock_delalloc() for compression path. * * This @locked_page is locked by plain lock_page(), thus its - * subpage::writers is 0. Handle them in a special way. + * subpage::locked is 0. Handle them in a special way. 
*/ - if (atomic_read(&subpage->writers) == 0) { + if (atomic_read(&subpage->nr_locked) == 0) { spin_unlock_irqrestore(&subpage->lock, flags); return true; } @@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf clear_bit(bit, subpage->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); return last; } /* - * Lock a folio for delalloc page writeback. - * - * Return -EAGAIN if the page is not properly initialized. - * Return 0 with the page locked, and writer counter updated. - * - * Even with 0 returned, the page still need extra check to make sure - * it's really the correct page, as the caller is using - * filemap_get_folios_contig(), which can race with page invalidating. - */ -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { - folio_lock(folio); - return 0; - } - folio_lock(folio); - if (!folio_test_private(folio) || !folio_get_private(folio)) { - folio_unlock(folio); - return -EAGAIN; - } - btrfs_subpage_clamp_range(folio, &start, &len); - btrfs_subpage_start_writer(fs_info, folio, start, len); - return 0; -} - -/* * Handle different locked folios: * * - Non-subpage folio @@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, * bitmap, reduce the writer lock number, and unlock the page if that's * the last locked range. */ -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage = folio_get_private(folio); @@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, /* * For subpage case, there are two types of locked page. With or - * without writers number. + * without locked number. * - * Since we own the page lock, no one else could touch subpage::writers + * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). 
*/ folio_unlock(folio); return; } btrfs_subpage_clamp_range(folio, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) folio_unlock(folio); } -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap) +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; @@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, int cleared = 0; int bit; - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page(). */ + if (atomic_read(&subpage->nr_locked) == 0) { + /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } @@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->writers) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->writers); + ASSERT(atomic_read(&subpage->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->nr_locked); spin_unlock_irqrestore(&subpage->lock, flags); if (last) folio_unlock(folio); @@ -740,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +{ \ + const int sectors_per_page = fs_info->sectors_per_page; \ + \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ +} + +#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ +{ \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned long bitmap; \ + \ + GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + btrfs_warn(fs_info, \ + "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ + start, len, folio_pos(folio), \ + fs_info->sectors_per_page, &bitmap); \ +} + /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. @@ -765,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, subpage = folio_get_private(folio); ASSERT(subpage); spin_lock_irqsave(&subpage->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -776,8 +697,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * This populates the involved subpage ranges so that subpage helpers can * properly unlock them. 
*/ -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; unsigned long flags; @@ -794,70 +715,16 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, nbits = len >> fs_info->sectorsize_bits; spin_lock_irqsave(&subpage->lock, flags); /* Target range should not yet be locked. */ - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); + ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + } bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->writers); + ret = atomic_add_return(nbits, &subpage->nr_locked); ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Find any subpage writer locked range inside @folio, starting at file offset - * @search_start. The caller should ensure the folio is locked. - * - * Return true and update @found_start_ret and @found_len_ret to the first - * writer locked range. - * Return false if there is no writer locked range. - */ -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - const u32 sectors_per_page = fs_info->sectors_per_page; - const unsigned int len = PAGE_SIZE - offset_in_page(search_start); - const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, - locked, search_start, len); - const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; - const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; - unsigned long flags; - int first_zero; - int first_set; - bool found = false; - - ASSERT(folio_test_locked(folio)); - spin_lock_irqsave(&subpage->lock, flags); - first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); - if (first_set >= locked_bitmap_end) - goto out; - - found = true; - - *found_start_ret = folio_pos(folio) + - ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); - /* - * Since @first_set is ensured to be smaller than locked_bitmap_end - * here, @found_start_ret should be inside the folio. 
- */ - ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); - - first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); - *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; -out: - spin_unlock_irqrestore(&subpage->lock, flags); - return found; -} - -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ -{ \ - const int sectors_per_page = fs_info->sectors_per_page; \ - \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ -} - void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { @@ -868,6 +735,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long writeback_bitmap; unsigned long ordered_bitmap; unsigned long checked_bitmap; + unsigned long locked_bitmap; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -880,15 +748,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, -"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), sectors_per_page, &uptodate_bitmap, sectors_per_page, &dirty_bitmap, + sectors_per_page, &locked_bitmap, sectors_per_page, &writeback_bitmap, sectors_per_page, &ordered_bitmap, sectors_per_page, &checked_bitmap); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4b85d91d0e18..44fff1f4eac4 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -45,14 +45,6 @@ enum { struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - /* - * Both data and metadata needs to track how many readers are for the - * page. - * Data relies on @readers to unlock the page when last reader finished. - * While metadata doesn't need page unlock, it needs to prevent - * page::private get cleared before the last end_page_read(). - */ - atomic_t readers; union { /* * Structures only used by metadata @@ -62,8 +54,12 @@ struct btrfs_subpage { */ atomic_t eb_refs; - /* Structures only used by data */ - atomic_t writers; + /* + * Structures only used by data, + * + * How many sectors inside the page is locked. 
+ */ + atomic_t nr_locked; }; unsigned long bitmaps[]; }; @@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); -void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); - -int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); -void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, - struct folio *folio, unsigned long bitmap); -bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 search_start, - u64 *found_start_ret, u32 *found_len_ret); - +void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); /* * Template for subpage related operations. * @@ -152,6 +137,19 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); +/* + * Helper for error cleanup, where a folio will have its dirty flag cleared, + * with writeback started and finished. + */ +static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info, + struct folio *locked_folio, + u64 start, u32 len) +{ + btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len); +} + bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 98fa0f382480..f809c3200c21 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -28,7 +28,6 @@ #include <linux/btrfs.h> #include <linux/security.h> #include <linux/fs_parser.h> -#include <linux/swap.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -340,6 +339,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears + * "force-compress" without the need to pass + * "compress-force=[no|none]" before specifying "compress". 
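+	 *
+	 * For example (hypothetical mount options): "compress-force=zstd,compress=zlib"
+	 * ends up as plain (non-forced) zlib compression, because the later
+	 * "compress" clears the force flag set by the earlier "compress-force".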
+ */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + if (opt == Opt_compress || opt == Opt_compress_force) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; @@ -937,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec } static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data) + struct btrfs_fs_devices *fs_devices) { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -962,9 +969,9 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - err = open_ctree(sb, fs_devices, (char *)data); + err = open_ctree(sb, fs_devices); if (err) { - btrfs_err(fs_info, "open_ctree failed"); + btrfs_err(fs_info, "open_ctree failed: %d", err); return err; } @@ -1498,8 +1505,7 @@ static int btrfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - if (!mount_reconfigure && - !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) return -EINVAL; ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); @@ -1879,18 +1885,21 @@ static int btrfs_get_tree_super(struct fs_context *fc) if (sb->s_root) { btrfs_close_devices(fs_devices); - if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) - ret = -EBUSY; + /* + * At this stage we may have RO flag mismatch between + * fc->sb_flags and sb->s_flags. Caller should detect such + * mismatch and reconfigure with sb->s_umount rwsem held if + * needed. + */ } else { snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; - ret = btrfs_fill_super(sb, fs_devices, NULL); - } - - if (ret) { - deactivate_locked_super(sb); - return ret; + ret = btrfs_fill_super(sb, fs_devices); + if (ret) { + deactivate_locked_super(sb); + return ret; + } } btrfs_clear_oneshot_options(fs_info); @@ -1971,59 +1980,23 @@ error: * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem * in fc->sb_flags. * - * This disambiguation has rather positive consequences. Mounting a subvolume - * ro will not also turn the superblock ro. Only the mount for the subvolume - * will become ro. - * - * So, if the superblock creation request comes from the new mount API the - * caller must have explicitly done: - * - * fsconfig(FSCONFIG_SET_FLAG, "ro") - * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) - * - * IOW, at some point the caller must have explicitly turned the whole - * superblock ro and we shouldn't just undo it like we did for the old mount - * API. In any case, it lets us avoid the hack in the new mount API. - * - * Consequently, the remounting hack must only be used for requests originating - * from the old mount API and should be marked for full deprecation so it can be - * turned off in a couple of years. - * - * The new mount API has no reason to support this hack. + * But, currently the util-linux mount command already utilizes the new mount + * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's + * btrfs or not, setting the whole super block RO. To make per-subvolume mounting + * work with different options work we need to keep backward compatibility. 
*/ -static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) { - struct vfsmount *mnt; - int ret; - const bool ro2rw = !(fc->sb_flags & SB_RDONLY); - - /* - * We got an EBUSY because our SB_RDONLY flag didn't match the existing - * super block, so invert our setting here and retry the mount so we - * can get our vfsmount. - */ - if (ro2rw) - fc->sb_flags |= SB_RDONLY; - else - fc->sb_flags &= ~SB_RDONLY; - - mnt = fc_mount(fc); - if (IS_ERR(mnt)) - return mnt; + int ret = 0; - if (!fc->oldapi || !ro2rw) - return mnt; + if (fc->sb_flags & SB_RDONLY) + return ret; - /* We need to convert to rw, call reconfigure. */ - fc->sb_flags &= ~SB_RDONLY; down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_reconfigure(fc); + if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + ret = btrfs_reconfigure(fc); up_write(&mnt->mnt_sb->s_umount); - if (ret) { - mntput(mnt); - return ERR_PTR(ret); - } - return mnt; + return ret; } static int btrfs_get_tree_subvol(struct fs_context *fc) @@ -2033,6 +2006,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) struct fs_context *dup_fc; struct dentry *dentry; struct vfsmount *mnt; + int ret = 0; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -2075,11 +2049,16 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->security = NULL; mnt = fc_mount(dup_fc); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) - mnt = btrfs_reconfigure_for_mount(dup_fc); - put_fs_context(dup_fc); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + put_fs_context(dup_fc); return PTR_ERR(mnt); + } + ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + put_fs_context(dup_fc); + if (ret) { + mntput(mnt); + return ret; + } /* * This free's ->subvol_name, because if it isn't set we have to @@ -2198,7 +2177,8 @@ static struct file_system_type btrfs_fs_type = { .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); @@ -2263,7 +2243,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); - ret = PTR_ERR(device); + if (IS_ERR(device)) + ret = PTR_ERR(device); + else + ret = 0; break; } ret = !(device->fs_devices->num_devices == @@ -2402,13 +2385,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - /* - * Only report the real number for DEBUG builds, as there are reports of - * serious performance degradation caused by too frequent shrinks. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - return nr; - return 0; + return nr; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) @@ -2416,16 +2393,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); struct btrfs_fs_info *fs_info = btrfs_sb(sb); - /* - * We may be called from any task trying to allocate memory and we don't - * want to slow it down with scanning and dropping extent maps. It would - * also cause heavy lock contention if many tasks concurrently enter - * here. 
Therefore only allow kswapd tasks to scan and drop extent maps. - */ - if (!current_is_kswapd()) - return 0; + btrfs_free_extent_maps(fs_info, nr_to_scan); - return btrfs_free_extent_maps(fs_info, nr_to_scan); + /* The extent map shrinker runs asynchronously, so always return 0. */ + return 0; } static const struct super_operations btrfs_super_ops = { @@ -2475,6 +2446,9 @@ static __cold void btrfs_interface_exit(void) static int __init btrfs_print_mod_info(void) { static const char options[] = "" +#ifdef CONFIG_BTRFS_EXPERIMENTAL + ", experimental=on" +#endif #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2495,7 +2469,17 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (btrfs_get_mod_read_policy() == NULL) + pr_info("Btrfs loaded%s\n", options); + else + pr_info("Btrfs loaded%s, read_policy=%s\n", + options, btrfs_get_mod_read_policy()); +#else pr_info("Btrfs loaded%s\n", options); +#endif + return 0; } @@ -2553,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = { }, { .init_func = extent_map_init, .exit_func = extent_map_exit, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + }, { + .init_func = btrfs_read_policy_init, + .exit_func = NULL, +#endif }, { .init_func = ordered_data_init, .exit_func = ordered_data_exit, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 03926ad467c9..53b846d99ece 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -295,7 +295,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); /* Remove once support for raid stripe tree is feature complete. */ @@ -329,7 +329,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_FEAT_ATTR_PTR(extent_tree_v2), BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif @@ -1118,7 +1118,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -1128,7 +1128,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -1180,7 +1180,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -1305,7 +1305,73 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, } BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); -static const char * const btrfs_read_policy_name[] = { "pid" }; +static const char *btrfs_read_policy_name[] = { + "pid", +#ifdef CONFIG_BTRFS_EXPERIMENTAL + "round-robin", + "devid", +#endif +}; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + +/* Global module configuration parameters. 
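+ *
+ * Illustrative usage (hypothetical values): with perms set to 0 the policy can
+ * only be given at load time, e.g. "modprobe btrfs read_policy=round-robin:65536"
+ * or "btrfs.read_policy=devid:1" on the kernel command line; the same
+ * "policy[:value]" strings are accepted by the per-filesystem read_policy
+ * sysfs attribute.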
*/ +static char *read_policy; +char *btrfs_get_mod_read_policy(void) +{ + return read_policy; +} + +/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */ +module_param(read_policy, charp, 0); +MODULE_PARM_DESC(read_policy, +"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]"); +#endif + +int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) +{ + char param[32] = { 0 }; + char __maybe_unused *value_str; + + if (!str || strlen(str) == 0) + return 0; + + strncpy(param, str, sizeof(param) - 1); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Separate value from input in policy:value format. */ + value_str = strchr(param, ':'); + if (value_str) { + int ret; + + *value_str = 0; + value_str++; + if (!value_ret) + return -EINVAL; + ret = kstrtos64(value_str, 10, value_ret); + if (ret) + return -EINVAL; + if (*value_ret < 0) + return -ERANGE; + } +#endif + + return sysfs_match_string(btrfs_read_policy_name, param); +} + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void) +{ + s64 value; + + if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) { + btrfs_err(NULL, "invalid read policy or value %s", read_policy); + return -EINVAL; + } + + return 0; +} +#endif static ssize_t btrfs_read_policy_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1316,14 +1382,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, int i; for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (policy == i) - ret += sysfs_emit_at(buf, ret, "%s[%s]", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); - else - ret += sysfs_emit_at(buf, ret, "%s%s", - (ret == 0 ? "" : " "), - btrfs_read_policy_name[i]); + if (ret != 0) + ret += sysfs_emit_at(buf, ret, " "); + + if (i == policy) + ret += sysfs_emit_at(buf, ret, "["); + + ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + if (i == BTRFS_READ_POLICY_RR) + ret += sysfs_emit_at(buf, ret, ":%u", + READ_ONCE(fs_devices->rr_min_contig_read)); + + if (i == BTRFS_READ_POLICY_DEVID) + ret += sysfs_emit_at(buf, ret, ":%llu", + READ_ONCE(fs_devices->read_devid)); +#endif + if (i == policy) + ret += sysfs_emit_at(buf, ret, "]"); } ret += sysfs_emit_at(buf, ret, "\n"); @@ -1336,21 +1413,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); - int i; + int index; + s64 value = -1; - for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { - if (sysfs_streq(buf, btrfs_read_policy_name[i])) { - if (i != READ_ONCE(fs_devices->read_policy)) { - WRITE_ONCE(fs_devices->read_policy, i); - btrfs_info(fs_devices->fs_info, - "read policy set to '%s'", - btrfs_read_policy_name[i]); + index = btrfs_read_policy_to_enum(buf, &value); + if (index < 0) + return -EINVAL; + +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* If moving from RR then disable collecting fs stats. 
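+	 * Collection is turned back on further below whenever the policy is
+	 * set (back) to round-robin.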
*/ + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = false; + + if (index == BTRFS_READ_POLICY_RR) { + if (value != -1) { + const u32 sectorsize = fs_devices->fs_info->sectorsize; + + if (!IS_ALIGNED(value, sectorsize)) { + u64 temp_value = round_up(value, sectorsize); + + btrfs_debug(fs_devices->fs_info, +"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu", + value, sectorsize, temp_value); + value = temp_value; } - return len; + } else { + value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->rr_min_contig_read)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->rr_min_contig_read, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'", + btrfs_read_policy_name[index], value); + } + + fs_devices->collect_fs_stats = true; + + return len; } - return -EINVAL; + if (index == BTRFS_READ_POLICY_DEVID) { + if (value != -1) { + BTRFS_DEV_LOOKUP_ARGS(args); + + /* Validate input devid. */ + args.devid = value; + if (btrfs_find_device(fs_devices, &args) == NULL) + return -EINVAL; + } else { + /* Set default devid to the devid of the latest device. */ + value = fs_devices->latest_dev->devid; + } + + if (index != READ_ONCE(fs_devices->read_policy) || + value != READ_ONCE(fs_devices->read_devid)) { + WRITE_ONCE(fs_devices->read_policy, index); + WRITE_ONCE(fs_devices->read_devid, value); + + btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'", + btrfs_read_policy_name[index], value); + } + + return len; + } +#endif + if (index != READ_ONCE(fs_devices->read_policy)) { + WRITE_ONCE(fs_devices->read_policy, index); + btrfs_info(fs_devices->fs_info, "read policy set to '%s'", + btrfs_read_policy_name[index]); + } + + return len; } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); @@ -1390,7 +1526,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL static ssize_t btrfs_offload_csum_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1450,7 +1586,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif NULL, diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e6a284c59809..3fc5c6f90dc4 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -47,5 +47,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup); +int btrfs_read_policy_to_enum(const char *str, s64 *value); + +#ifdef CONFIG_BTRFS_EXPERIMENTAL +int __init btrfs_read_policy_init(void); +char *btrfs_get_mod_read_policy(void); +#endif #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ce50847e1e01..5eff8d7d2360 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -29,6 +29,8 @@ const char *test_error[] = { [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk 
map", + [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context", + [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction", }; static const struct super_operations btrfs_test_super_ops = { @@ -141,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + + /* CRC32C csum size. */ + fs_info->csum_size = 4; + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / + fs_info->csum_size; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -246,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) kfree(cache); } +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) +{ + memset(trans, 0, sizeof(*trans)); + trans->fs_info = fs_info; + xa_init(&trans->delayed_refs.head_refs); + xa_init(&trans->delayed_refs.dirty_extents); + spin_lock_init(&trans->delayed_refs.lock); +} + void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -291,6 +307,12 @@ int btrfs_run_sanity_tests(void) ret = btrfs_test_free_space_tree(sectorsize, nodesize); if (ret) goto out; + ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_delayed_refs(sectorsize, nodesize); + if (ret) + goto out; } } ret = btrfs_test_extent_map(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index dc2f2ab15fa5..4307bdaa6749 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -6,6 +6,8 @@ #ifndef BTRFS_TESTS_H #define BTRFS_TESTS_H +#include <linux/types.h> + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); @@ -24,12 +26,15 @@ enum { TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_CHUNK_MAP, + TEST_ALLOC_IO_CONTEXT, + TEST_ALLOC_TRANSACTION, }; extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; +struct btrfs_transaction; int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); @@ -37,7 +42,9 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); int btrfs_test_extent_map(void); +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); @@ -47,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c new file mode 100644 index 000000000000..6558508c2ddf --- /dev/null +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -0,0 +1,1015 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sizes.h> 
+#include "btrfs-tests.h" +#include "../transaction.h" +#include "../delayed-ref.h" +#include "../extent-tree.h" + +#define FAKE_ROOT_OBJECTID 256 +#define FAKE_BYTENR 0 +#define FAKE_LEVEL 1 +#define FAKE_INO 256 +#define FAKE_FILE_OFFSET 0 +#define FAKE_PARENT SZ_1M + +struct ref_head_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + int total_ref_mod; + int must_insert; +}; + +struct ref_node_check { + u64 bytenr; + u64 num_bytes; + int ref_mod; + enum btrfs_delayed_ref_action action; + u8 type; + u64 parent; + u64 root; + u64 owner; + u64 offset; +}; + +static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type) +{ + if ((type == BTRFS_TREE_BLOCK_REF_KEY) || + (type == BTRFS_SHARED_BLOCK_REF_KEY)) + return BTRFS_REF_METADATA; + return BTRFS_REF_DATA; +} + +static void delete_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + btrfs_delete_ref_head(fs_info, delayed_refs, head); + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); +} + +static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *node) +{ + rb_erase_cached(&node->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&node->ref_node); + if (!list_empty(&node->add_list)) + list_del_init(&node->add_list); + btrfs_put_delayed_ref(node); +} + +static int validate_ref_head(struct btrfs_delayed_ref_head *head, + struct ref_head_check *check) +{ + if (head->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", head->bytenr, + check->bytenr); + return -EINVAL; + } + + if (head->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + head->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (head->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", head->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (head->total_ref_mod != check->total_ref_mod) { + test_err("invalid total_ref_mod have: %d want: %d", + head->total_ref_mod, check->total_ref_mod); + return -EINVAL; + } + + if (head->must_insert_reserved != check->must_insert) { + test_err("invalid must_insert have: %d want: %d", + head->must_insert_reserved, check->must_insert); + return -EINVAL; + } + + return 0; +} + +static int validate_ref_node(struct btrfs_delayed_ref_node *node, + struct ref_node_check *check) +{ + if (node->bytenr != check->bytenr) { + test_err("invalid bytenr have: %llu want: %llu", node->bytenr, + check->bytenr); + return -EINVAL; + } + + if (node->num_bytes != check->num_bytes) { + test_err("invalid num_bytes have: %llu want: %llu", + node->num_bytes, check->num_bytes); + return -EINVAL; + } + + if (node->ref_mod != check->ref_mod) { + test_err("invalid ref_mod have: %d want: %d", node->ref_mod, + check->ref_mod); + return -EINVAL; + } + + if (node->action != check->action) { + test_err("invalid action have: %d want: %d", node->action, + check->action); + return -EINVAL; + } + + if (node->parent != check->parent) { + test_err("invalid parent have: %llu want: %llu", node->parent, + check->parent); + return -EINVAL; + } + + if (node->ref_root != check->root) { + test_err("invalid root have: %llu want: %llu", node->ref_root, + check->root); + return -EINVAL; + } + + if (node->type != check->type) { + 
test_err("invalid type have: %d want: %d", node->type, + check->type); + return -EINVAL; + } + + if (btrfs_delayed_ref_owner(node) != check->owner) { + test_err("invalid owner have: %llu want: %llu", + btrfs_delayed_ref_owner(node), check->owner); + return -EINVAL; + } + + if (btrfs_delayed_ref_offset(node) != check->offset) { + test_err("invalid offset have: %llu want: %llu", + btrfs_delayed_ref_offset(node), check->offset); + return -EINVAL; + } + + return 0; +} + +static int simple_test(struct btrfs_trans_handle *trans, + struct ref_head_check *head_check, + struct ref_node_check *node_check) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = ref_type_from_disk_ref_type(node_check->type), + .action = node_check->action, + .parent = node_check->parent, + .ref_root = node_check->root, + .bytenr = node_check->bytenr, + .num_bytes = fs_info->nodesize, + }; + int ret; + + if (ref.type == BTRFS_REF_METADATA) + btrfs_init_tree_ref(&ref, node_check->owner, node_check->root, + false); + else + btrfs_init_data_ref(&ref, node_check->owner, node_check->offset, + node_check->root, true); + + if (ref.type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + return -EINVAL; + } + + ret = -EINVAL; + if (validate_ref_head(head, head_check)) + goto out; + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, node_check)) + goto out; + ret = 0; +out: + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * These are simple tests, make sure that our btrfs_ref's get turned into the + * appropriate btrfs_delayed_ref_node based on their settings and action. 
+ */ +static int simple_tests(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .total_ref_mod = 1, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + .owner = FAKE_LEVEL, + .offset = 0, + }; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single add shared data failed"); + return -EINVAL; + } + + head_check.ref_mod = -1; + head_check.total_ref_mod = -1; + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + node_check.parent = 0; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop tree block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop extent data failed"); + return -EINVAL; + } + + node_check.parent = FAKE_PARENT; + node_check.type = BTRFS_SHARED_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + node_check.offset = 0; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared block failed"); + return -EINVAL; + } + + node_check.type = BTRFS_SHARED_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + if (simple_test(trans, &head_check, &node_check)) { + test_err("single drop shared data failed"); + return -EINVAL; + } + + return 0; +} + +/* + * Merge tests, validate that we do delayed ref merging properly, the ref counts + * all end up properly, and delayed refs are deleted once they're no longer + * needed. 
+ */ +static int merge_tests(struct btrfs_trans_handle *trans, + enum btrfs_ref_type type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = type, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 2, + .action = BTRFS_ADD_DELAYED_REF, + .parent = 0, + .root = FAKE_ROOT_OBJECTID, + }; + int ret; + + /* + * First add a ref and then drop it, make sure we get a head ref with a + * 0 total ref mod and no nodes. + */ + if (type == BTRFS_REF_METADATA) { + node_check.type = BTRFS_TREE_BLOCK_REF_KEY; + node_check.owner = FAKE_LEVEL; + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + } else { + node_check.type = BTRFS_EXTENT_DATA_REF_KEY; + node_check.owner = FAKE_INO; + node_check.offset = FAKE_FILE_OFFSET; + btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET, + FAKE_ROOT_OBJECTID, true); + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("single add and drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a ref, then add another ref, make sure we get a head ref with a + * 2 total ref mod and 1 node. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double add failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add two drop refs, make sure they are merged properly. */ + ref.action = BTRFS_DROP_DELAYED_REF; + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Add multiple refs, then drop until we go negative again. 
*/ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = -2; + head_check.total_ref_mod = -2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("double drop failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + + delete_delayed_ref_head(trans, head); + head = NULL; + + /* Drop multiple refs, then add until we go positive again. */ + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 10; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 12; i++) { + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 2; + head_check.total_ref_mod = 2; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop to positive failed"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Add a bunch of refs with different roots and parents, then drop them + * all, make sure everything is properly merged. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + ref.action = BTRFS_DROP_DELAYED_REF; + for (int i = 0; i < 50; i++) { + if (!(i % 2)) { + ref.parent = 0; + ref.ref_root = FAKE_ROOT_OBJECTID + i; + } else { + ref.parent = FAKE_PARENT + (i * fs_info->nodesize); + } + if (type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + else + ret = btrfs_add_delayed_data_ref(trans, &ref, 0); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + } + + head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + goto out; + } + + head_check.ref_mod = 0; + head_check.total_ref_mod = 0; + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("add and drop multiple failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (node) { + test_err("found node when none should exist"); + goto out; + } + ret = 0; +out: + if (!IS_ERR_OR_NULL(head)) + btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +/* + * Basic test to validate we always get the add operations first followed by any + * delete operations. + */ +static int select_delayed_refs_test(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_node *node; + struct btrfs_ref ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_DROP_DELAYED_REF, + .parent = 0, + .ref_root = FAKE_ROOT_OBJECTID, + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + }; + struct ref_head_check head_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 0, + .total_ref_mod = 0, + }; + struct ref_node_check node_check = { + .bytenr = FAKE_BYTENR, + .num_bytes = fs_info->nodesize, + .ref_mod = 1, + .action = BTRFS_ADD_DELAYED_REF, + .type = BTRFS_TREE_BLOCK_REF_KEY, + .parent = 0, + .owner = FAKE_LEVEL, + .offset = 0, + }; + int ret; + + /* Add the drop first. */ + btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false); + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + return ret; + } + + /* + * Now add the add, and make it a different root so it's logically later + * in the rb tree. 
+ */ + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.root = FAKE_ROOT_OBJECTID + 1; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + delete_delayed_ref_head(trans, head); + head = NULL; + + /* + * Now we're going to do the same thing, but we're going to have an add + * that gets deleted because of a merge, and make sure we still have + * another add in place. + */ + ref.action = BTRFS_DROP_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 1; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_DROP_DELAYED_REF; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.ref_root = FAKE_ROOT_OBJECTID + 2; + ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL); + if (ret) { + test_err("failed ref action %d", ret); + goto out; + } + + head = btrfs_select_ref_head(fs_info, delayed_refs); + if (IS_ERR_OR_NULL(head)) { + if (IS_ERR(head)) + test_err("failed to select delayed ref head: %ld", + PTR_ERR(head)); + else + test_err("failed to find delayed ref head"); + ret = -EINVAL; + head = NULL; + goto out; + } + + ret = -EINVAL; + if (validate_ref_head(head, &head_check)) { + test_err("head check failed"); + goto out; + } + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_ADD_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID + 2; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + + spin_lock(&head->lock); + node = btrfs_select_delayed_ref(head); + spin_unlock(&head->lock); + if (!node) { + test_err("failed to select delayed ref"); + goto out; + } + + node_check.action = BTRFS_DROP_DELAYED_REF; + node_check.root = FAKE_ROOT_OBJECTID; + if (validate_ref_node(node, &node_check)) { + test_err("node check failed"); + goto out; + } + delete_delayed_ref_node(head, node); + ret 
= 0; +out: + if (head) + btrfs_unselect_ref_head(delayed_refs, head); + btrfs_destroy_delayed_refs(trans->transaction); + return ret; +} + +int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) +{ + struct btrfs_transaction *transaction; + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + int ret; + + test_msg("running delayed refs tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + transaction = kmalloc(sizeof(*transaction), GFP_KERNEL); + if (!transaction) { + test_std_err(TEST_ALLOC_TRANSACTION); + ret = -ENOMEM; + goto out_free_fs_info; + } + btrfs_init_dummy_trans(&trans, fs_info); + btrfs_init_dummy_transaction(transaction, fs_info); + trans.transaction = transaction; + + ret = simple_tests(&trans); + if (!ret) { + test_msg("running delayed refs merge tests on metadata refs"); + ret = merge_tests(&trans, BTRFS_REF_METADATA); + } + + if (!ret) { + test_msg("running delayed refs merge tests on data refs"); + ret = merge_tests(&trans, BTRFS_REF_DATA); + } + + if (!ret) + ret = select_delayed_refs_test(&trans); + +out_free_fs_info: + btrfs_free_dummy_fs_info(fs_info); + return ret; +} diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c new file mode 100644 index 000000000000..a7bc58a5c1e2 --- /dev/null +++ b/fs/btrfs/tests/raid-stripe-tree-tests.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Western Digital Corporation or its affiliates. + */ + +#include <linux/sizes.h> +#include "../fs.h" +#include "../disk-io.h" +#include "../transaction.h" +#include "../volumes.h" +#include "../raid-stripe-tree.h" +#include "btrfs-tests.h" + +#define RST_TEST_NUM_DEVICES (2) +#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) + +#define SZ_48K (SZ_32K + SZ_16K) + +typedef int (*test_func_t)(struct btrfs_trans_handle *trans); + +static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, + u64 devid) +{ + struct btrfs_device *dev; + + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (dev->devid == devid) + return dev; + } + + return NULL; +} + +/* + * Test creating a range of three extents and then punch a hole in the middle, + * deleting all of the middle extents and partially deleting the "book ends". + */ +static int test_punch_hole_3extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + u64 hole_start = logical1 + SZ_256K; + u64 hole_len = SZ_2M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents.
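+ * With the values above the layout is: extent 1 [1M, 2M), extent 2 [2M, 3M),
+ * extent 3 [3M, 4M); the hole punched further down covers [1M+256K, 3M+256K).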
*/ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 + 256K and 2M in length. Extent + * 1 is truncated to 256k length, extent 2 is completely dropped and + * extent 3 is moved 256K to the right. + */ + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + /* Get the first extent and check its size. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len1 != SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_256K, len1); + ret = -EINVAL; + goto out; + } + + /* Get the second extent and check it's absent. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical2, logical2 + len2); + ret = -EINVAL; + goto out; + } + + /* Get the third extent and check its size. 
*/ + logical3 += SZ_256K; + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, logical3 + len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3 + SZ_256K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M - SZ_256K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M - SZ_256K, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static int test_delete_two_extents(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 len1 = SZ_1M; + u64 logical2 = logical1 + len1; + u64 len2 = SZ_1M; + u64 logical3 = logical2 + len2; + u64 len3 = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + + /* Prepare for the test, 1st create 3 x 1M extents. */ + bioc->map_type = map_type; + bioc->size = len1; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + bioc->size = len2; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical3; + bioc->size = len3; + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical3 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + /* + * Delete a range starting at logical1 and 2M in length. Extents 1 + * and 2 are dropped and extent 3 is kept as is. 
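+ * In other words the deleted range is [1M, 3M), so only extent 3 at [3M, 4M)
+ * should survive the lookups below.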
+ */ + ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1, logical1 + len1 + len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical1, len1); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical2, len2); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical3, len3); + goto out; + } + + if (io_stripe.physical != logical3) { + test_err("invalid physical address, expected %llu, got %llu", + logical3, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len3 != SZ_1M) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_1M, len3); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical3, len3); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* Test punching a hole into a single RAID stripe-extent. */ +static int test_punch_hole(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 hole_start = logical1 + SZ_32K; + u64 hole_len = SZ_64K; + u64 logical2 = hole_start + hole_len; + u64 len = SZ_1M; + u64 len1 = SZ_32K; + u64 len2 = len - len1 - hole_len; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_1M) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_1M, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, hole_start, hole_len); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + hole_start, hole_start + hole_len); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical1, logical1 + len1); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu, got %llu", + logical1, io_stripe.physical); + 
ret = -EINVAL; + goto out; + } + + if (len1 != SZ_32K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_32K, len1); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type, + 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical2, + logical2 + len2); + goto out; + } + + if (io_stripe.physical != logical2) { + test_err("invalid physical address, expected %llu, got %llu", + logical2, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len2 != len - len1 - hole_len) { + test_err("invalid length, expected %llu, got %llu", + len - len1 - hole_len, len2); + ret = -EINVAL; + goto out; + } + + /* Check for the absence of the hole. */ + ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + hole_start, hole_start + SZ_64K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1, len1); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2, len2); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 1M RST write that spans two adjacent RST items on disk and then + * delete a portion starting in the first item and spanning into the second + * item. This is similar to test_front_delete(), but spanning multiple items. + */ +static int test_front_delete_prev_item(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical1 = SZ_1M; + u64 logical2 = SZ_2M; + u64 len = SZ_1M; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + /* Insert RAID extent 1. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical1 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + bioc->logical = logical2; + /* Insert RAID extent 2, directly adjacent to it. */ + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical2 + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical1 + SZ_512K, (u64)SZ_1M); + goto out; + } + + /* Verify item 1 is truncated to 512K. 
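+ * After deleting [1M+512K, 2M+512K) it should now cover [1M, 1M+512K).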
*/ + ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0, + &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical1, + logical1 + len); + goto out; + } + + if (io_stripe.physical != logical1) { + test_err("invalid physical address, expected %llu got %llu", + logical1, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify item 2's start is moved by 512K. */ + ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical2 + SZ_512K, logical2 + len); + goto out; + } + + if (io_stripe.physical != logical2 + SZ_512K) { + test_err("invalid physical address, expected %llu got %llu", + logical2 + SZ_512K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_512K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_512K, len); + ret = -EINVAL; + goto out; + } + + /* Verify there's a hole at [1M+512K, 2M+512K]. */ + len = SZ_1M; + ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID [%llu, %llu] succeeded, should fail", + logical1 + SZ_512K, logical1 + SZ_512K + len); + goto out; + } + + /* Clean up after us. */ + ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K); + if (ret) + goto out; + + ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * delete the 1st 16K, making the new start address 1M+16K.
+ */ +static int test_front_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, SZ_16K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + SZ_16K); + goto out; + } + + len -= SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len, + map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", + logical + SZ_16K, logical + SZ_64K); + goto out; + } + + if (io_stripe.physical != logical + SZ_16K) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_16K, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_48K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret != -ENODATA) { + ret = -EINVAL; + test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", + logical, logical + SZ_16K); + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K); +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * truncate the stripe extent down to 48K.
+ */ +static int test_tail_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K); + if (ret) { + test_err("deleting RAID extent [%llu, %llu] failed", + logical + SZ_48K, logical + SZ_64K); + goto out; + } + + len = SZ_48K; + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu, got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_48K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_48K, len); + ret = -EINVAL; + goto out; + } + + len = SZ_16K; + ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len, + map_type, 0, &io_stripe); + if (ret != -ENODATA) { + test_err("lookup of RAID extent [%llu, %llu] succeeded should fail", + logical + SZ_48K, logical + SZ_64K); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then + * overwrite the whole range giving it a new physical address at an offset of 1G. + * The intent of this test is to exercise the 'update_raid_extent_item()' + * function called by btrfs_insert_one_raid_extent().
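+ * The second insert below is expected to find the existing stripe extent
+ * item and rewrite it in place (device 0's physical offset moves from 1M to
+ * 1G+1M) rather than adding a second item.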
+ */ +static int test_create_update_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + bioc->map_type = map_type; + bioc->size = len; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = SZ_1G + logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("updating RAID extent failed: %d", ret); + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical + SZ_1G) { + test_err("invalid physical address, expected %llu, got %llu", + logical + SZ_1G, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu, got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +/* + * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. + * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. 
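+ * (That is, stripe i is placed at physical address logical + i * 1G, which
+ * is what the setup loop below does.)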
+ */ +static int test_simple_create_delete(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe io_stripe = { 0 }; + u64 map_type = RST_TEST_RAID1_TYPE; + u64 logical = SZ_1M; + u64 len = SZ_64K; + int ret; + + bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); + if (!bioc) { + test_std_err(TEST_ALLOC_IO_CONTEXT); + ret = -ENOMEM; + goto out; + } + + bioc->map_type = map_type; + bioc->size = SZ_64K; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_io_stripe *stripe = &bioc->stripes[i]; + + stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); + if (!stripe->dev) { + test_err("cannot find device with devid %d", i); + ret = -EINVAL; + goto out; + } + + stripe->physical = logical + i * SZ_1G; + } + + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) { + test_err("inserting RAID extent failed: %d", ret); + goto out; + } + + io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); + if (!io_stripe.dev) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); + if (ret) { + test_err("lookup of RAID extent [%llu, %llu] failed", logical, + logical + len); + goto out; + } + + if (io_stripe.physical != logical) { + test_err("invalid physical address, expected %llu got %llu", + logical, io_stripe.physical); + ret = -EINVAL; + goto out; + } + + if (len != SZ_64K) { + test_err("invalid stripe length, expected %llu got %llu", + (u64)SZ_64K, len); + ret = -EINVAL; + goto out; + } + + ret = btrfs_delete_raid_extent(trans, logical, len); + if (ret) + test_err("deleting RAID extent [%llu, %llu] failed", logical, + logical + len); + +out: + btrfs_put_bioc(bioc); + return ret; +} + +static const test_func_t tests[] = { + test_simple_create_delete, + test_create_update_delete, + test_tail_delete, + test_front_delete, + test_front_delete_prev_item, + test_punch_hole, + test_punch_hole_3extents, + test_delete_two_extents, +}; + +static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root = NULL; + int ret; + + fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + ret = -ENOMEM; + goto out; + } + + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + goto out; + } + btrfs_set_super_incompat_flags(root->fs_info->super_copy, + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); + root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + fs_info->stripe_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); + if (IS_ERR(root->node)) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = PTR_ERR(root->node); + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 2 * nodesize; + + for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { + struct btrfs_device *dev; + + dev = btrfs_alloc_dummy_device(fs_info); + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + dev->devid = i; + } + + btrfs_init_dummy_trans(&trans, root->fs_info); + ret = test(&trans); + if (ret) + goto out; + +out: + btrfs_free_dummy_root(root); + 
btrfs_free_dummy_fs_info(fs_info); + + return ret; +} + +int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) +{ + int ret = 0; + + test_msg("running raid-stripe-tree tests"); + for (int i = 0; i < ARRAY_SIZE(tests); i++) { + ret = run_test(tests[i], sectorsize, nodesize); + if (ret) { + test_err("test-case %ps failed with %d\n", tests[i], ret); + goto out; + } + } + +out: + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0fc873af891f..15312013f2a3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, @@ -349,9 +348,8 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); - cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; + xa_init(&cur_trans->delayed_refs.head_refs); xa_init(&cur_trans->delayed_refs.dirty_extents); - atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* * although the tree mod log is per file system and not per transaction, @@ -797,8 +795,7 @@ alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) - btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, - delayed_refs_bytes); + btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); @@ -2052,7 +2049,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, fs_info); + btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index dd9ce9b9f69e..9f7c777af635 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -33,7 +33,7 @@ struct btrfs_path; */ #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) -/* Radix-tree tag for roots that are part of the trasaction. */ +/* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { @@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int error); +/* + * We want the transaction abort to print stack trace only for errors where the + * cause could be a bug, eg. due to ENOSPC, and not for common errors that are + * caused by external factors. 
+ */ +static inline bool btrfs_abort_should_print_stack(int error) +{ + switch (error) { + case -EIO: + case -EROFS: + case -ENOMEM: + return false; + } + return true; +} /* * Call btrfs_abort_transaction as early as possible when an error condition is @@ -240,7 +254,7 @@ do { \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ __first = true; \ - if (WARN(abort_should_print_stack(error), \ + if (WARN(btrfs_abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b50263723bc..43979891f7c8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf, return 0; } -__printf(4, 5) +__printf(5, 6) __cold -static void chunk_err(const struct extent_buffer *leaf, +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { - const struct btrfs_fs_info *fs_info = leaf->fs_info; - bool is_sb; + bool is_sb = !leaf; struct va_format vaf; va_list args; int i; int slot = -1; - /* Only superblock eb is able to have such small offset */ - is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); - if (!is_sb) { /* * Get the slot number by iterating through all slots, this @@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf, /* * The common chunk check which could also work on super block sys chunk array. * + * If @leaf is NULL, then @chunk must be an on-stack chunk item. + * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable) + * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. 
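+ * For example, a caller validating the superblock's sys_chunk_array is
+ * expected to pass leaf == NULL, a chunk item copied onto the stack and the
+ * sectorsize taken from the super block, since fs_info->sectorsize is
+ * unreliable there.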
*/ -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize) { - struct btrfs_fs_info *fs_info = leaf->fs_info; u64 length; u64 chunk_end; u64 stripe_len; @@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u16 sub_stripes; u64 type; u64 features; + u32 chunk_sector_size; bool mixed = false; int raid_index; int nparity; int ncopies; - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); + if (leaf) { + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk); + } else { + length = btrfs_stack_chunk_length(chunk); + stripe_len = btrfs_stack_chunk_stripe_len(chunk); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + sub_stripes = btrfs_stack_chunk_sub_stripes(chunk); + type = btrfs_stack_chunk_type(chunk); + chunk_sector_size = btrfs_stack_chunk_sector_size(chunk); + } raid_index = btrfs_bg_flags_to_raid_index(type); ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; if (unlikely(!num_stripes)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (unlikely(num_stripes < ncopies)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } if (unlikely(nparity && num_stripes == nparity)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes == nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!IS_ALIGNED(logical, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", - logical, fs_info->sectorsize); + logical, sectorsize); return -EUCLEAN; } - if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { - chunk_err(leaf, chunk, logical, + if (unlikely(chunk_sector_size != sectorsize)) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", - btrfs_chunk_sector_size(leaf, chunk), - fs_info->sectorsize); + chunk_sector_size, sectorsize); return -EUCLEAN; } - if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { - chunk_err(leaf, chunk, logical, + if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) { + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (unlikely(check_add_overflow(logical, length, &chunk_end))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical start and length, have logical start %llu length %llu", logical, length); return -EUCLEAN; } if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { - chunk_err(leaf, chunk, logical, + 
chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; @@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * Thus it should be a good way to catch obvious bitflips. */ if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "chunk length too large: have %llu limit %llu", length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); + BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); return -EUCLEAN; } if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; @@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; @@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, if (!mixed) { if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } @@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { - chunk_err(leaf, chunk, logical, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); @@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, struct btrfs_chunk *chunk, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int num_stripes; if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size(leaf, slot), sizeof(struct btrfs_chunk), - BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } @@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (unlikely(btrfs_chunk_item_size(num_stripes) != btrfs_item_size(leaf, slot))) { - chunk_err(leaf, chunk, key->offset, + chunk_err(fs_info, leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size(leaf, slot), btrfs_chunk_item_size(num_stripes)); return -EUCLEAN; } out: - return 
btrfs_check_chunk_valid(leaf, chunk, key->offset); + return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset, + fs_info->sectorsize); } __printf(3, 4) @@ -1527,6 +1539,11 @@ static int check_extent_item(struct extent_buffer *leaf, dref_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_extent_data_ref_count(leaf, dref); break; /* Contains parent bytenr and ref count */ @@ -1539,6 +1556,11 @@ static int check_extent_item(struct extent_buffer *leaf, inline_offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data ref count, should have non-zero value"); + return -EUCLEAN; + } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; case BTRFS_EXTENT_OWNER_REF_KEY: @@ -1611,8 +1633,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, { u32 expect_item_size = 0; - if (key->type == BTRFS_SHARED_DATA_REF_KEY) + if (key->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + + sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); + if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { + extent_err(leaf, slot, + "invalid shared data backref count, should have non-zero value"); + return -EUCLEAN; + } + expect_item_size = sizeof(struct btrfs_shared_data_ref); + } if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, @@ -1689,6 +1721,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf, offset, leaf->fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { + extent_err(leaf, slot, + "invalid extent data backref count, should have non-zero value"); + return -EUCLEAN; + } } return 0; } @@ -2183,8 +2220,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; @@ -2192,16 +2229,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, int ret; found_level = btrfs_header_level(eb); - if (found_level != level) { + if (found_level != check->level) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); + eb->start, check->level, found_level); return -EIO; } - if (!first_key) + if (!check->has_first_key) return 0; /* @@ -2226,15 +2263,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); if (ret) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, + eb->start, check->transid, check->first_key.objectid, + 
check->first_key.type, check->first_key.offset, found_key.objectid, found_key.type, found_key.offset); } diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 01669cfa6578..eb201f4ec3c7 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -10,6 +10,7 @@ #include <uapi/linux/btrfs_tree.h> struct extent_buffer; +struct btrfs_fs_info; struct btrfs_chunk; struct btrfs_key; @@ -66,10 +67,12 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); -int btrfs_check_chunk_valid(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + u32 sectorsize); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); +int btrfs_verify_level_key(struct extent_buffer *eb, + const struct btrfs_tree_parent_check *check); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2ed2a791f8f..955d1677e865 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -590,7 +590,6 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -1374,7 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1845,7 +1844,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2124,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; @@ -3588,7 +3587,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4566,7 +4564,6 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4776,7 +4773,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -6204,7 +6200,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, const struct list_head *delayed_del_list, const struct btrfs_delayed_item *first, const struct btrfs_delayed_item **last_ret) @@ -6265,7 +6260,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, if (ret < 0) { return ret; } else if (ret == 0) { - ret = batch_delete_dir_index_items(trans, 
inode, path, ctx, + ret = batch_delete_dir_index_items(trans, inode, path, delayed_del_list, curr, &last); if (ret) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index b382a4c443d4..1ac2678fc4ca 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, * is freed (its refcount is decremented). */ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq) { diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 6308c577a4a4..1c12566040db 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, struct extent_buffer *eb, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index aca2861f2187..17b5e81123a1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d938..0a0776489055 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -13,8 +13,8 @@ #include <linux/list_sort.h> #include <linux/namei.h> #include "misc.h" -#include "ctree.h" #include "disk-io.h" +#include "extent-tree.h" #include "transaction.h" #include "volumes.h" #include "raid56.h" @@ -48,6 +48,7 @@ struct btrfs_io_geometry { u64 raid56_full_stripe_start; int max_errors; enum btrfs_map_op op; + bool use_rst; }; const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { @@ -733,6 +734,118 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) } /* + * We can have very weird soft links passed in. + * One example is "/proc/self/fd/<fd>", which can be a soft link to + * a block device. + * + * But it's never a good idea to use those weird names. + * Here we check if the path (not following symlinks) is a good one inside + * "/dev/". + */ +static bool is_good_dev_path(const char *dev_path) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + bool is_good = false; + int ret; + + if (!dev_path) + goto out; + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) + goto out; + + /* + * Do not follow soft link, just check if the original path is inside + * "/dev/". 
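+ * (kern_path() is called with lookup flags 0, so the final path component is
+ * not resolved through its symlink target; a name like "/proc/self/fd/<fd>"
+ * therefore still resolves under /proc and fails the "/dev/" prefix check.)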
+ */ + ret = kern_path(dev_path, 0, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) + goto out; + if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) + goto out; + is_good = true; +out: + kfree(path_buf); + path_put(&path); + return is_good; +} + +static int get_canonical_dev_path(const char *dev_path, char *canonical) +{ + struct path path = { .mnt = NULL, .dentry = NULL }; + char *path_buf = NULL; + char *resolved_path; + int ret; + + if (!dev_path) { + ret = -EINVAL; + goto out; + } + + path_buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!path_buf) { + ret = -ENOMEM; + goto out; + } + + ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (ret) + goto out; + resolved_path = d_path(&path, path_buf, PATH_MAX); + if (IS_ERR(resolved_path)) { + ret = PTR_ERR(resolved_path); + goto out; + } + ret = strscpy(canonical, resolved_path, PATH_MAX); +out: + kfree(path_buf); + path_put(&path); + return ret; +} + +static bool is_same_device(struct btrfs_device *device, const char *new_path) +{ + struct path old = { .mnt = NULL, .dentry = NULL }; + struct path new = { .mnt = NULL, .dentry = NULL }; + char *old_path = NULL; + bool is_same = false; + int ret; + + if (!device->name) + goto out; + + old_path = kzalloc(PATH_MAX, GFP_NOFS); + if (!old_path) + goto out; + + rcu_read_lock(); + ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + rcu_read_unlock(); + if (ret < 0) + goto out; + + ret = kern_path(old_path, LOOKUP_FOLLOW, &old); + if (ret) + goto out; + ret = kern_path(new_path, LOOKUP_FOLLOW, &new); + if (ret) + goto out; + if (path_equal(&old, &new)) + is_same = true; +out: + kfree(old_path); + path_put(&old); + path_put(&new); + return is_same; +} + +/* * Add new device to list of registered devices * * Returns: @@ -852,7 +965,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, MAJOR(path_devt), MINOR(path_devt), current->comm, task_pid_nr(current)); - } else if (!device->name || strcmp(device->name->str, path)) { + } else if (!device->name || !is_same_device(device, path)) { /* * When FS is already mounted. * 1. 
If you are here and if the device->name is NULL that @@ -1105,6 +1218,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; + device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); @@ -1189,6 +1303,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + s64 __maybe_unused value = 0; int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, @@ -1218,7 +1333,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ; + fs_devices->read_devid = latest_dev->devid; + fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(), + &value); + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->collect_fs_stats = true; + + if (value) { + if (fs_devices->read_policy == BTRFS_READ_POLICY_RR) + fs_devices->rr_min_contig_read = value; + if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID) + fs_devices->read_devid = value; + } +#else fs_devices->read_policy = BTRFS_READ_POLICY_PID; +#endif return 0; } @@ -1382,12 +1513,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; + char *canonical_path = NULL; u64 bytenr; dev_t devt; int ret; lockdep_assert_held(&uuid_mutex); + if (!is_good_dev_path(path)) { + canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (canonical_path) { + ret = get_canonical_dev_path(path, canonical_path); + if (ret < 0) { + kfree(canonical_path); + canonical_path = NULL; + } + } + } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1432,7 +1574,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(path, disk_super, &new_device_added); + device = device_list_add(canonical_path ? 
: path, disk_super, + &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1441,6 +1584,7 @@ free_disk_super: error_bdev_put: fput(bdev_file); + kfree(canonical_path); return device; } @@ -1923,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2619,11 +2762,9 @@ next_slot: device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (device->fs_devices->seeding) btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(trans, leaf); - } path->slots[0]++; goto next_slot; @@ -2720,8 +2861,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - btrfs_clear_sb_rdonly(sb); - /* GFP_KERNEL allocation must not be under device_list_mutex */ seed_devices = btrfs_init_sprout(fs_info); if (IS_ERR(seed_devices)) { @@ -2864,8 +3003,6 @@ error_sysfs: mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: - if (seeding_dev) - btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: @@ -2920,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(trans, leaf); - out: btrfs_free_path(path); return ret; @@ -3630,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_meta(leaf, item, &disk_bargs); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); btrfs_set_balance_sys(leaf, item, &disk_bargs); - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -5309,7 +5441,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. 
*/ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->stripe_size) + ctl->nparity, @@ -5395,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma btrfs_free_chunk_map(map); } +static int btrfs_chunk_map_cmp(const struct rb_node *new, + const struct rb_node *exist) +{ + const struct btrfs_chunk_map *new_map = + rb_entry(new, struct btrfs_chunk_map, rb_node); + const struct btrfs_chunk_map *exist_map = + rb_entry(exist, struct btrfs_chunk_map, rb_node); + + if (new_map->start == exist_map->start) + return 0; + if (new_map->start < exist_map->start) + return -1; + return 1; +} + EXPORT_FOR_TESTS int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) { - struct rb_node **p; - struct rb_node *parent = NULL; - bool leftmost = true; + struct rb_node *exist; write_lock(&fs_info->mapping_tree_lock); - p = &fs_info->mapping_tree.rb_root.rb_node; - while (*p) { - struct btrfs_chunk_map *entry; - - parent = *p; - entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); - - if (map->start < entry->start) { - p = &(*p)->rb_left; - } else if (map->start > entry->start) { - p = &(*p)->rb_right; - leftmost = false; - } else { - write_unlock(&fs_info->mapping_tree_lock); - return -EEXIST; - } + exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree, + btrfs_chunk_map_cmp); + + if (exist) { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; } - rb_link_node(&map->rb_node, parent, p); - rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); chunk_map_device_set_bits(map, CHUNK_ALLOCATED); chunk_map_device_clear_bits(map, CHUNK_TRIMMED); write_unlock(&fs_info->mapping_tree_lock); @@ -5841,23 +5974,75 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes) { - struct btrfs_chunk_map *map; - int ret = 0; + for (int index = first; index < first + num_stripes; index++) { + const struct btrfs_device *device = map->stripes[index].dev; - if (!btrfs_fs_incompat(fs_info, RAID56)) - return 0; + if (device->devid == READ_ONCE(device->fs_devices->read_devid)) + return index; + } - map = btrfs_get_chunk_map(fs_info, logical, len); + /* If no read-preferred device is set use the first stripe. */ + return first; +} - if (!WARN_ON(IS_ERR(map))) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - btrfs_free_chunk_map(map); +struct stripe_mirror { + u64 devid; + int num; +}; + +static int btrfs_cmp_devid(const void *a, const void *b) +{ + const struct stripe_mirror *s1 = (const struct stripe_mirror *)a; + const struct stripe_mirror *s2 = (const struct stripe_mirror *)b; + + if (s1->devid < s2->devid) + return -1; + if (s1->devid > s2->devid) + return 1; + return 0; +} + +/* + * Select a stripe for reading using the round-robin algorithm. + * + * 1. Compute the read cycle as the total sectors read divided by the minimum + * sectors per device. + * 2. Determine the stripe number for the current read by taking the modulus + * of the read cycle with the total number of stripes: + * + * stripe index = (total sectors / min sectors per dev) % num stripes + * + * The calculated stripe index is then used to select the corresponding device + * from the list of devices, which is ordered by devid. 
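+ *
+ * (Editorial illustration, not part of the patch: assuming the default
+ * 256K minimum contiguous read and a 4K sector size, min_reads_per_dev
+ * is 256K >> 12 = 64 blocks. After 200 blocks read in total, the read
+ * cycle is 200 / 64 = 3, so with two stripes the read is served by
+ * stripes[3 % 2], i.e. the second device in devid order.)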
+ */ +static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes) +{ + struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 }; + struct btrfs_device *device = map->stripes[first].dev; + struct btrfs_fs_info *fs_info = device->fs_devices->fs_info; + unsigned int read_cycle; + unsigned int total_reads; + unsigned int min_reads_per_dev; + + total_reads = percpu_counter_sum(&fs_info->stats_read_blocks); + min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >> + fs_info->sectorsize_bits; + + for (int index = 0, i = first; i < first + num_stripes; i++) { + stripes[index].devid = map->stripes[i].dev->devid; + stripes[index].num = i; + index++; } - return ret; + sort(stripes, num_stripes, sizeof(struct stripe_mirror), + btrfs_cmp_devid, NULL); + + read_cycle = total_reads / min_reads_per_dev; + return stripes[read_cycle % num_stripes].num; } +#endif static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, @@ -5888,6 +6073,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, case BTRFS_READ_POLICY_PID: preferred_mirror = first + (current->pid % num_stripes); break; +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_READ_POLICY_RR: + preferred_mirror = btrfs_read_rr(map, first, num_stripes); + break; + case BTRFS_READ_POLICY_DEVID: + preferred_mirror = btrfs_read_preferred(map, first, num_stripes); + break; +#endif } if (dev_replace_is_ongoing && @@ -5919,9 +6112,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - u64 logical, - u16 total_stripes) +EXPORT_FOR_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -6246,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, { dst->dev = map->stripes[io_geom->stripe_index].dev; - if (io_geom->op == BTRFS_MAP_READ && - btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst) return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type, io_geom->stripe_index, dst); @@ -6262,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, const struct btrfs_io_stripe *smap, const struct btrfs_chunk_map *map, int num_alloc_stripes, - enum btrfs_map_op op, int mirror_num) + struct btrfs_io_geometry *io_geom) { if (!smap) return false; @@ -6270,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info, if (num_alloc_stripes != 1) return false; - if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ) return false; - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1) return false; return true; @@ -6479,14 +6671,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); + io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type); + + if (dev_replace->replace_task != current) + down_read(&dev_replace->rwsem); - down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* * Hold the semaphore for read during the whole 
operation, write is * requested at commit time but must wait. */ - if (!dev_replace_is_ongoing) + if (!dev_replace_is_ongoing && dev_replace->replace_task != current) up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -6545,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, - io_geom.mirror_num)) { + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) { ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) *mirror_num_ret = io_geom.mirror_num; @@ -6560,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } bioc->map_type = map->type; + bioc->use_rst = io_geom.use_rst; /* * For RAID56 full map, we need to make sure the stripes[] follows the @@ -6626,7 +6821,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { + if (dev_replace_is_ongoing && dev_replace->replace_task != current) { lockdep_assert_held(&dev_replace->rwsem); /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); @@ -6900,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, warn_32bit_meta_chunk(fs_info, logical, length, type); #endif - /* - * Only need to verify chunk item if we're reading from sys chunk array, - * as chunk item in tree block is already verified by tree-checker. - */ - if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(leaf, chunk, logical); - if (ret) - return ret; - } - map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? */ @@ -7167,16 +7352,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; u8 *array_ptr; unsigned long sb_array_offset; int ret = 0; - u32 num_stripes; u32 array_size; - u32 len = 0; u32 cur_offset; - u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); @@ -7199,10 +7379,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) cur_offset = 0; while (cur_offset < array_size) { - disk_key = (struct btrfs_disk_key *)array_ptr; - len = sizeof(*disk_key); - if (cur_offset + len > array_size) - goto out_short_read; + struct btrfs_chunk *chunk; + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr; + u32 len = sizeof(*disk_key); + + /* + * The sys_chunk_array has been already verified at super block + * read time. Only do ASSERT()s for basic checks. 
+ */ + ASSERT(cur_offset + len <= array_size); btrfs_disk_key_to_cpu(&key, disk_key); @@ -7210,44 +7395,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type != BTRFS_CHUNK_ITEM_KEY) { - btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); - ret = -EIO; - break; - } + ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY); chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be present, - * exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM); - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk)); - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + ASSERT(cur_offset + len <= array_size); ret = read_one_chunk(&key, sb, chunk); if (ret) @@ -7260,13 +7415,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) clear_extent_buffer_uptodate(sb); free_extent_buffer_stale(sb); return ret; - -out_short_read: - btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", - len, cur_offset); - clear_extent_buffer_uptodate(sb); - free_extent_buffer_stale(sb); - return -EIO; } /* @@ -7466,8 +7614,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_device *device; int ret = 0; - fs_devices->fs_info = fs_info; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; @@ -7643,8 +7789,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(trans, eb); - out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 03d2d60afe0c..120f65e21eeb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -30,6 +30,12 @@ struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) +/* + * Arbitratry maximum size of one discard request to limit potentially long time + * spent in blkdev_issue_discard(). + */ +#define BTRFS_MAX_DISCARD_CHUNK_SIZE (SZ_1G) + extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K @@ -290,6 +296,9 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_ZONED, }; +#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K) +/* Keep in sync with raid_attr table, current maximum is RAID1C4. */ +#define BTRFS_RAID1_MAX_MIRRORS (4) /* * Read policies for mirrored block group profiles, read picks the stripe based * on these policies. @@ -297,10 +306,16 @@ enum btrfs_chunk_allocation_policy { enum btrfs_read_policy { /* Use process PID to choose the stripe */ BTRFS_READ_POLICY_PID, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* Balancing RAID1 reads across all striped devices (round-robin). */ + BTRFS_READ_POLICY_RR, + /* Read from a specific device. 
*/ + BTRFS_READ_POLICY_DEVID, +#endif BTRFS_NR_READ_POLICY, }; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Checksum mode - offload it to workqueues or do it synchronously in * btrfs_submit_chunk(). @@ -411,6 +426,8 @@ struct btrfs_fs_devices { bool seeding; /* The mount needs to use a randomly generated fsid. */ bool temp_fsid; + /* Enable/disable the filesystem stats tracking. */ + bool collect_fs_stats; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -424,7 +441,16 @@ struct btrfs_fs_devices { /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BTRFS_EXPERIMENTAL + /* + * Minimum contiguous reads before switching to next device, the unit + * is one block/sectorsize. + */ + u32 rr_min_contig_read; + + /* Device to be used for reading in case of RAID1. */ + u64 read_devid; + /* Checksum mode - offload it or do it synchronously. */ enum btrfs_offload_csum_mode offload_csum_mode; #endif @@ -479,6 +505,7 @@ struct btrfs_io_context { struct bio *orig_bio; atomic_t error; u16 max_errors; + bool use_rst; u64 logical; u64 size; @@ -735,8 +762,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); @@ -834,4 +859,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes); +#endif + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ce464cd8e0ac..3e0edbcf73e1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; @@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(fs_info, path, name, name_len); + di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; @@ -205,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 100abc00b794..c9e92c6941ec 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -174,10 +174,10 @@ int 
zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; - workspace->strm.avail_in = - (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = min(bytes_left, + in_buf_folios << PAGE_SHIFT); } else { unsigned int pg_off; unsigned int cur_len; @@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, pg_off = offset_in_page(start); cur_len = btrfs_calc_input_length(orig_end, start); data_in = kmap_local_folio(in_folio, pg_off); - start += PAGE_SIZE; + start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7fa2920632ba..73e0aa9fc08a 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -707,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * zoned mode. In this case, we don't have a valid max zone * append size. */ - if (bdev_is_zoned(device->bdev)) { - blk_stack_limits(lim, - &bdev_get_queue(device->bdev)->limits, - 0); - } + if (bdev_is_zoned(device->bdev)) + blk_stack_limits(lim, bdev_limits(device->bdev), 0); + } + + ret = blk_validate_limits(lim); + if (ret) { + btrfs_err(fs_info, "zoned: failed to validate queue limits"); + return ret; } /* @@ -745,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) (u64)lim->max_segments << PAGE_SHIFT), fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; - if (fs_info->max_zone_append_size < fs_info->max_extent_size) - fs_info->max_extent_size = fs_info->max_zone_append_size; + + fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size, + fs_info->max_zone_append_size); /* * Check mount options here, because we might change fs_info->zoned @@ -1340,7 +1344,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, + btrfs_err_in_rcu(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), rcu_str_deref(device->name), device->devid); @@ -1739,7 +1743,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) return false; /* - * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the * extent layout the relocation code has. * Furthermore we have set aside own block-group from which only the * relocation "process" can allocate and make sure only one process at a @@ -1973,7 +1977,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, if (block_group->meta_write_pointer > eb->start) return -EBUSY; - /* If for_sync, this hole will be filled with trasnsaction commit. */ + /* If for_sync, this hole will be filled with transaction commit. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) return -EAGAIN; return -EBUSY; @@ -2648,3 +2652,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->zone_active_bgs_lock); } + +/* + * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. + * + * @space_info: the space to work on + * @num_bytes: targeting reclaim bytes + * + * This one resets the zones of a block group, so we can reuse the region + * without removing the block group. 
On the other hand, btrfs_delete_unused_bgs() + * just removes a block group and frees up the underlying zones. So, we still + * need to allocate a new block group to reuse the zones. + * + * Resetting is faster than deleting/recreating a block group. It is similar + * to freeing the logical space on the regular mode. However, we cannot change + * the block group's profile with this operation. + */ +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + while (num_bytes > 0) { + struct btrfs_chunk_map *map; + struct btrfs_block_group *bg = NULL; + bool found = false; + u64 reclaimed = 0; + + /* + * Here, we choose a fully zone_unusable block group. It's + * technically possible to reset a partly zone_unusable block + * group, which still has some free space left. However, + * handling that needs to cope with the allocation side, which + * makes the logic more complex. So, let's handle the easy case + * for now. + */ + spin_lock(&fs_info->unused_bgs_lock); + list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { + if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) + continue; + + /* + * Use trylock to avoid locking order violation. In + * btrfs_reclaim_bgs_work(), the lock order is + * &bg->lock -> &fs_info->unused_bgs_lock. We skip a + * block group if we cannot take its lock. + */ + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + found = true; + break; + } + if (!found) { + spin_unlock(&fs_info->unused_bgs_lock); + return 0; + } + + list_del_init(&bg->bg_list); + btrfs_put_block_group(bg); + spin_unlock(&fs_info->unused_bgs_lock); + + /* + * Since the block group is fully zone_unusable and we cannot + * allocate from this block group anymore, we don't need to set + * this block group read-only. + */ + + down_read(&fs_info->dev_replace.rwsem); + map = bg->physical_map; + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + unsigned int nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, + stripe->physical >> SECTOR_SHIFT, + zone_size_sectors); + memalloc_nofs_restore(nofs_flags); + + if (ret) { + up_read(&fs_info->dev_replace.rwsem); + return ret; + } + } + up_read(&fs_info->dev_replace.rwsem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + ASSERT(!btrfs_is_block_group_used(bg)); + if (bg->ro) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + continue; + } + + reclaimed = bg->alloc_offset; + bg->zone_unusable = bg->length - bg->zone_capacity; + bg->alloc_offset = 0; + /* + * This holds because we currently reset fully used then freed + * block group. 
+ */ + ASSERT(reclaimed == bg->zone_capacity); + bg->free_space_ctl->free_space += reclaimed; + space_info->bytes_zone_unusable -= reclaimed; + spin_unlock(&bg->lock); + btrfs_return_free_space(space_info, reclaimed); + spin_unlock(&space_info->lock); + + if (num_bytes <= reclaimed) + break; + num_bytes -= reclaimed; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 7612e6572605..9672bf4c3335 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); +int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } +static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, + u64 num_bytes) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 866607fd3e58..5232b56d5892 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; + ASSERT(timer == &wsm.timer); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { @@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { - tot_in += PAGE_SIZE; + tot_in += workspace->in_buf.size; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..cc8452f60251 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -736,15 +736,12 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); - folio_memcg_unlock(folio); - if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -855,8 +852,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * done a sync(). Just drop the buffers from the inode list. * * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which - * assumes that all the buffers are against the blockdev. Not true - * for reiserfs. + * assumes that all the buffers are against the blockdev. 
*/ void invalidate_inode_buffers(struct inode *inode) { @@ -1194,13 +1190,11 @@ void mark_buffer_dirty(struct buffer_head *bh) struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; - folio_memcg_lock(folio); if (!folio_test_set_dirty(folio)) { mapping = folio->mapping; if (mapping) __folio_mark_dirty(folio, mapping, 0); } - folio_memcg_unlock(folio); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } @@ -1649,6 +1643,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (length == folio_size(folio)) filemap_release_folio(folio, 0); out: + folio_clear_mappedtodisk(folio); return; } EXPORT_SYMBOL(block_invalidate_folio); @@ -2803,7 +2798,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2808,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 89b11336a836..1806bff8e59b 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -15,6 +15,7 @@ #include <linux/namei.h> #include <linux/poll.h> #include <linux/mount.h> +#include <linux/security.h> #include <linux/statfs.h> #include <linux/ctype.h> #include <linux/string.h> @@ -576,7 +577,7 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) */ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) { - char *secctx; + int err; _enter(",%s", args); @@ -585,16 +586,16 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) return -EINVAL; } - if (cache->secctx) { + if (cache->have_secid) { pr_err("Second security context specified\n"); return -EINVAL; } - secctx = kstrdup(args, GFP_KERNEL); - if (!secctx) - return -ENOMEM; + err = security_secctx_to_secid(args, strlen(args), &cache->secid); + if (err) + return err; - cache->secctx = secctx; + cache->have_secid = true; return 0; } @@ -820,7 +821,6 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache) put_cred(cache->cache_cred); kfree(cache->rootdirname); - kfree(cache->secctx); kfree(cache->tag); _leave(""); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 35ba2117a6f6..3e63cfe15874 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object, static void cachefiles_clean_up_object(struct cachefiles_object *object, struct cachefiles_cache *cache) { + struct file *file; + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { cachefiles_see_object(object, cachefiles_obj_see_clean_delete); @@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object, } cachefiles_unmark_inode_in_use(object, object->file); - if (object->file) { - fput(object->file); - object->file = NULL; - } + + spin_lock(&object->lock); + file = object->file; + object->file = NULL; + spin_unlock(&object->lock); + + if (file) + fput(file); } /* diff --git 
a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 7b99bd98de75..38c236e38cef 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,7 +122,6 @@ struct cachefiles_cache { #define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */ #define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */ char *rootdirname; /* name of cache root directory */ - char *secctx; /* LSM security context */ char *tag; /* cache binding tag */ refcount_t unbind_pincount;/* refcount to do daemon unbind */ struct xarray reqs; /* xarray of pending on-demand requests */ @@ -130,6 +129,8 @@ struct cachefiles_cache { struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; u32 msg_id_next; + u32 secid; /* LSM security id */ + bool have_secid; /* whether "secid" was set */ }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 6a821a959b59..92058ae43488 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -13,6 +13,7 @@ #include <linux/falloc.h> #include <linux/sched/mm.h> #include <trace/events/fscache.h> +#include <trace/events/netfs.h> #include "internal.h" struct cachefiles_kiocb { @@ -366,6 +367,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (term_func) term_func(term_func_priv, -ENOBUFS, false); + trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); return -ENOBUFS; } @@ -695,6 +697,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) iov_iter_truncate(&subreq->io_iter, len); } + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare); cachefiles_begin_secure(cache, &saved_cred); ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), &start, &len, len, true); @@ -704,6 +707,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) return; } + trace_netfs_sreq(subreq, netfs_sreq_trace_cache_write); cachefiles_write(&subreq->rreq->cache_resources, subreq->start, &subreq->io_iter, netfs_write_subrequest_terminated, subreq); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2b3f9935dbb4..7cf59713f0f7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, } if (!d_is_negative(dentry)) { - if (d_backing_inode(dentry) == file_inode(object->file)) { - success = true; - goto out_dput; - } - ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 470c96658385..fe3de9ad57bf 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, { struct cachefiles_object *object = kiocb->ki_filp->private_data; struct cachefiles_cache *cache = object->volume->cache; - struct file *file = object->file; - size_t len = iter->count; + struct file *file; + size_t len = iter->count, aligned_len = len; loff_t pos = kiocb->ki_pos; const struct cred *saved_cred; int ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); + ret = __cachefiles_prepare_write(object, 
file, &pos, &aligned_len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) - return ret; + goto out; trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); - if (!ret) + if (!ret) { ret = len; + kiocb->ki_pos += ret; + } +out: + fput(file); return ret; } @@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, int whence) { struct cachefiles_object *object = filp->private_data; - struct file *file = object->file; + struct file *file; + loff_t ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); - return vfs_llseek(file, pos, whence); + ret = vfs_llseek(file, pos, whence); + fput(file); + + return ret; } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index fe777164f1d8..fc6611886b3b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -18,7 +18,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) struct cred *new; int ret; - _enter("{%s}", cache->secctx); + _enter("{%u}", cache->have_secid ? cache->secid : 0); new = prepare_kernel_cred(current); if (!new) { @@ -26,8 +26,8 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) goto error; } - if (cache->secctx) { - ret = set_security_override_from_ctx(new, cache->secctx); + if (cache->have_secid) { + ret = set_security_override(new, cache->secid); if (ret < 0) { put_cred(new); pr_err("Security denies permission to nominate security context: error %d\n", diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 7c6f260a3be5..52383b1d0ba6 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -77,6 +77,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) trace_cachefiles_vfs_error(object, file_inode(file), ret, cachefiles_trace_setxattr_error); trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, cachefiles_coherency_set_fail); if (ret != -ENOMEM) @@ -85,6 +86,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) "Failed to set xattr with error %d", ret); } else { trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, cachefiles_coherency_set_ok); } @@ -126,7 +128,10 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file object, "Failed to read aux with error %zd", xlen); why = cachefiles_coherency_check_xattr; - } else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { + goto out; + } + + if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { why = cachefiles_coherency_check_type; } else if (memcmp(buf->data, p, len) != 0) { why = cachefiles_coherency_check_aux; @@ -141,7 +146,9 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file ret = 0; } +out: trace_cachefiles_coherency(object, file_inode(file)->i_ino, + be64_to_cpup((__be64 *)buf->data), buf->content, why); kfree(buf); return ret; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c2a9e2cc03de..f5224a566b69 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -223,10 +223,13 @@ static void finish_netfs_read(struct ceph_osd_request *req) subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ - if (err == -ENOENT) + if (err == -ENOENT) { + 
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); err = 0; - else if (err == -EBLOCKLISTED) + } else if (err == -EBLOCKLISTED) { fsc->blocklisted = true; + } if (err >= 0) { if (sparse && err > 0) @@ -242,6 +245,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err > subreq->len) err = subreq->len; } + if (err > 0) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); } if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { @@ -253,8 +258,9 @@ static void finish_netfs_read(struct ceph_osd_request *req) subreq->transferred = err; err = 0; } + subreq->error = err; trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); - netfs_read_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -314,7 +320,9 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) ceph_mdsc_put_request(req); out: - netfs_read_subreq_terminated(subreq, err, false); + subreq->error = err; + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq); return true; } @@ -426,8 +434,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); - if (err) - netfs_read_subreq_terminated(subreq, err, false); + if (err) { + subreq->error = err; + netfs_read_subreq_terminated(subreq); + } doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } @@ -1054,7 +1064,9 @@ get_more_pages: if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + + page = &folio->page; doutc(cl, "? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -1081,8 +1093,6 @@ get_more_pages: continue; } if (page_offset(page) >= ceph_wbc.i_size) { - struct folio *folio = page_folio(page); - doutc(cl, "folio at %lu beyond eof %llu\n", folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || @@ -1098,16 +1108,16 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page) || - PagePrivate2(page) /* [DEPRECATED] */) { + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", page); - unlock_page(page); + doutc(cl, "%p under writeback\n", folio); + folio_unlock(folio); continue; } - doutc(cl, "waiting on writeback %p\n", page); - wait_on_page_writeback(page); - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -2195,7 +2205,7 @@ int ceph_pool_perm_check(struct inode *inode, int need) if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. - * But for snapshot, head of the first object may have alread + * But for snapshot, head of the first object may have already * been deleted. Skip check to avoid creating orphan object. 
*/ return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bed34fc11c91..a8d8b56cf9d2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -978,20 +978,6 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, return 0; } -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) -{ - struct inode *inode = &ci->netfs.inode; - struct ceph_client *cl = ceph_inode_to_client(inode); - int ret; - - spin_lock(&ci->i_ceph_lock); - ret = __ceph_caps_revoking_other(ci, NULL, mask); - spin_unlock(&ci->i_ceph_lock); - doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode), - ceph_cap_string(mask), ret); - return ret; -} - int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; @@ -2813,7 +2799,7 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, - * or a negative error code. There are 3 speical error codes: + * or a negative error code. There are 3 special error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. @@ -4085,23 +4071,22 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; - unsigned mseq = le32_to_cpu(ex->migrate_seq); - unsigned t_seq, t_mseq; + u32 t_issue_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); - t_seq = le32_to_cpu(ph->seq); + t_issue_seq = le32_to_cpu(ph->issue_seq); t_mseq = le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { - t_cap_id = t_seq = t_mseq = 0; + t_cap_id = t_issue_seq = t_mseq = 0; target = -1; } - doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n", - inode, ceph_vinop(inode), ci, mds, mseq, target); + doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), target, t_issue_seq, t_mseq); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); @@ -4134,12 +4119,12 @@ retry: if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && - ceph_seq_cmp(tcap->seq, t_seq) < 0) { + ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; - tcap->seq = t_seq - 1; - tcap->issue_seq = t_seq - 1; + tcap->seq = t_issue_seq - 1; + tcap->issue_seq = t_issue_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { @@ -4154,7 +4139,7 @@ retry: int flag = (cap == ci->i_auth_cap) ? 
CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, - t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { @@ -4228,18 +4213,22 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; + u32 piseq = 0; + u32 pmseq = 0; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); + piseq = le32_to_cpu(ph->issue_seq); + pmseq = le32_to_cpu(ph->mseq); } else { p_cap_id = 0; peer = -1; } - doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n", - inode, ceph_vinop(inode), ci, mds, mseq, peer); + doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), peer, piseq, pmseq); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { @@ -4268,15 +4257,13 @@ retry: doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && - (ocap->seq != le32_to_cpu(ph->seq) || - ocap->mseq != le32_to_cpu(ph->mseq))) { + (ocap->seq != piseq || + ocap->mseq != pmseq)) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, - ocap->seq, ocap->mseq, mds, - le32_to_cpu(ph->seq), - le32_to_cpu(ph->mseq)); + ocap->seq, ocap->mseq, mds, piseq, pmseq); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } @@ -4350,7 +4337,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); - u32 seq, mseq; + u32 seq, mseq, issue_seq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; @@ -4360,8 +4347,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, bool close_sessions = false; bool do_cap_release = false; - doutc(cl, "from mds%d\n", session->s_mds); - if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; @@ -4375,6 +4360,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); + issue_seq = le32_to_cpu(h->issue_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); @@ -4462,12 +4448,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), - vino.ino, vino.snap, inode); + doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", + session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, + seq, issue_seq, mseq); mutex_lock(&session->s_mutex); - doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds, - session->s_seq, (unsigned)seq); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 47e0c319fc68..d0768239a1c9 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -27,7 +27,7 @@ struct ceph_fname { }; /* - * Header for the crypted file when truncating the size, this + * Header for the encrypted file when truncating the size, this * will be sent to MDS, and the MDS will update the encrypted * last block and then truncate the size. 
*/ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 24c08078f5aa..fdf9dc15eafa 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -357,7 +357,7 @@ static int status_show(struct seq_file *s, void *p) seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); - seq_printf(s, "blocklisted: %s\n", fsc->blocklisted ? "true" : "false"); + seq_printf(s, "blocklisted: %s\n", str_true_false(fsc->blocklisted)); return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 952109292d69..0bf388e07a02 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -207,7 +207,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, dentry = __dcache_find_get_entry(parent, idx + step, &cache_ctl); if (!dentry) { - /* use linar search */ + /* use linear search */ idx = 0; break; } @@ -659,7 +659,7 @@ static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) return true; if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when - * dentries are sotred in hash order */ + * dentries are sorted in hash order */ } else if (dfi->frag != fpos_frag(new_pos)) { return true; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 44451749c544..150076ced937 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -393,9 +393,9 @@ static struct dentry *ceph_get_parent(struct dentry *child) } dir = snapdir; } - /* If directory has already been deleted, futher get_parent + /* If directory has already been deleted, further get_parent * will fail. Do not mark snapdir dentry as disconnected, - * this prevent exportfs from doing futher get_parent. */ + * this prevents exportfs from doing further get_parent. */ if (unlinked) dn = d_obtain_root(dir); else @@ -452,7 +452,13 @@ static int __get_snap_name(struct dentry *parent, char *name, goto out; if (ceph_snap(inode) == CEPH_SNAPDIR) { if (ceph_snap(dir) == CEPH_NOSNAP) { - strcpy(name, fsc->mount_options->snapdir_name); + /* + * .get_name() from struct export_operations + * assumes that its 'name' parameter is pointing + * to a NAME_MAX+1 sized buffer + */ + strscpy(name, fsc->mount_options->snapdir_name, + NAME_MAX + 1); err = 0; } goto out; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4b8d59ebda00..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1066,7 +1066,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, if (ceph_inode_is_shutdown(inode)) return -EIO; - if (!len) + if (!len || !i_size) return 0; /* * flush any page cache pages in this range. 
this @@ -1086,7 +1086,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, int num_pages; size_t page_off; bool more; - int idx; + int idx = 0; size_t left; struct ceph_osd_req_op *op; u64 read_off = off; @@ -1116,6 +1116,16 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, len = read_off + read_len - off; more = len < iov_iter_count(to); + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + num_pages = calc_pages_for(read_off, read_len); page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1127,17 +1137,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, offset_in_page(read_off), - false, false); - - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } + false, true); ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); @@ -1160,7 +1160,14 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, else if (ret == -ENOENT) ret = 0; - if (ret > 0 && IS_ENCRYPTED(inode)) { + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { int fret; fret = ceph_fscrypt_decrypt_extents(inode, pages, @@ -1186,10 +1193,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret = min_t(ssize_t, fret, len); } - ceph_osdc_put_request(req); - /* Short read but not EOF? Zero out the remainder. */ - if (ret >= 0 && ret < len && (off + ret < i_size)) { + if (ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; @@ -1199,13 +1204,11 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, ret += zlen; } - idx = 0; - if (ret <= 0) - left = 0; - else if (off + ret > i_size) - left = i_size - off; + if (off + ret > i_size) + left = (i_size > off) ? i_size - off : 0; else left = ret; + while (left > 0) { size_t plen, copied; @@ -1221,13 +1224,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, break; } } - ceph_release_page_vector(pages, num_pages); - if (ret < 0) { - if (ret == -EBLOCKLISTED) - fsc->blocklisted = true; - break; - } + ceph_osdc_put_request(req); if (off >= i_size || !more) break; @@ -1553,6 +1551,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + op = &req->r_ops[0]; + if (!write && sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); if (len < 0) { ceph_osdc_put_request(req); @@ -1562,6 +1570,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (len != size) osd_req_op_extent_update(req, 0, len); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + /* * To simplify error handling, allow AIO when IO within i_size * or IO can be satisfied by single OSD request. 
@@ -1593,17 +1603,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); - op = &req->r_ops[0]; - if (sparse) { - extent_cnt = __ceph_sparse_read_ext_count(inode, size); - ret = ceph_alloc_sparse_ext_map(op, extent_cnt); - if (ret) { - ceph_osdc_put_request(req); - break; - } - } - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 315ef02f9a3f..7dd6c2275085 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -160,7 +160,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, } /* - * get/constuct snapdir inode for a given directory + * get/construct snapdir inode for a given directory */ struct inode *ceph_get_snapdir(struct inode *parent) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c4a5fd94bbbb..785fe489ef4b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -827,7 +827,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) * And the worst case is that for the none async openc request it will * successfully open the file if the CDentry hasn't been unlinked yet, * but later the previous delayed async unlink request will remove the - * CDenty. That means the just created file is possiblly deleted later + * CDentry. That means the just created file is possibly deleted later * by accident. * * We need to wait for the inflight async unlink requests to finish @@ -1747,14 +1747,6 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, } } -void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - mutex_lock(&mdsc->mutex); - __open_export_target_sessions(mdsc, session); - mutex_unlock(&mdsc->mutex); -} - /* * session caps */ @@ -2362,7 +2354,7 @@ again: item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); item->migrate_seq = cpu_to_le32(cap->mseq); - item->seq = cpu_to_le32(cap->issue_seq); + item->issue_seq = cpu_to_le32(cap->issue_seq); msg->front.iov_len += sizeof(*item); ceph_put_cap(mdsc, cap); @@ -2808,12 +2800,11 @@ retry: if (pos < 0) { /* - * A rename didn't occur, but somehow we didn't end up where - * we thought we would. Throw a warning and try again. + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. */ - pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", - pos); - goto retry; + return ERR_PTR(-ENAMETOOLONG); } *pbase = base; @@ -3269,7 +3260,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, &session->s_features); /* - * Avoid inifinite retrying after overflow. The client will + * Avoid infinite retrying after overflow. The client will * increase the retry count and if the MDS is old version, * so we limit to retry at most 256 times. */ @@ -3522,7 +3513,7 @@ static void __do_request(struct ceph_mds_client *mdsc, /* * For async create we will choose the auth MDS of frag in parent - * directory to send the request and ususally this works fine, but + * directory to send the request and usually this works fine, but * if the migrated the dirtory to another MDS before it could handle * it the request will be forwarded. 
* @@ -4033,7 +4024,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* - * Avoid inifinite retrying after overflow. + * Avoid infinite retrying after overflow. * * The MDS will increase the fwd count and in client side * if the num_fwd is less than the one saved in request @@ -5609,9 +5600,9 @@ void send_flush_mdlog(struct ceph_mds_session *s) static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, struct ceph_mds_cap_auth *auth, + const struct cred *cred, char *tpath) { - const struct cred *cred = get_current_cred(); u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_client *cl = mdsc->fsc->client; @@ -5734,11 +5725,12 @@ int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) for (i = 0; i < mdsc->s_cap_auths_num; i++) { struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i]; - err = ceph_mds_auth_match(mdsc, s, tpath); + err = ceph_mds_auth_match(mdsc, s, cred, tpath); if (err < 0) { + put_cred(cred); return err; } else if (err > 0) { - /* always follow the last auth caps' permision */ + /* always follow the last auth caps' permission */ root_squash_perms = true; rw_perms_s = NULL; if ((mask & MAY_WRITE) && s->writeable && @@ -5751,6 +5743,8 @@ int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) } } + put_cred(cred); + doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms, rw_perms_s); if (root_squash_perms && rw_perms_s == NULL) { diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3dd54587944a..38bb7e0d2d79 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -634,8 +634,6 @@ extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); -extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 73f321b52895..4344e1f11806 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -285,8 +285,10 @@ static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, size_t len; struct ceph_fsid fsid; struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_options *opts = pctx->copts; struct ceph_mount_options *fsopt = pctx->opts; - char *fsid_start, *fs_name_start; + const char *name_start = dev_name; + const char *fsid_start, *fs_name_start; if (*dev_name_end != '=') { dout("separator '=' missing in source"); @@ -296,8 +298,14 @@ static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, fsid_start = strchr(dev_name, '@'); if (!fsid_start) return invalfc(fc, "missing cluster fsid"); - ++fsid_start; /* start of cluster fsid */ + len = fsid_start - name_start; + kfree(opts->name); + opts->name = kstrndup(name_start, len, GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + dout("using %s entity name", opts->name); + ++fsid_start; /* start of cluster fsid */ fs_name_start = strchr(fsid_start, '.'); if (!fs_name_start) return invalfc(fc, "missing file system name"); @@ -423,6 +431,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); 
kfree(fsopt->snapdir_name); fsopt->snapdir_name = param->string; param->string = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 037eac35a9e0..af14ec382246 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -60,7 +60,7 @@ /* max size of osd read request, limited by libceph */ #define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN -/* osd has a configurable limitaion of max write size. +/* osd has a configurable limitation of max write size. * CEPH_MSG_MAX_DATA_LEN should be small enough. */ #define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ @@ -796,7 +796,6 @@ extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); -extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index e066a556eccb..1a9f12204666 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -899,7 +899,7 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, } /* - * If there are dirty xattrs, reencode xattrs into the prealloc_blob + * If there are dirty xattrs, re-encode xattrs into the prealloc_blob * and swap into place. It returns the old i_xattrs.blob (or NULL) so * that it can be freed by the caller as the i_ceph_lock is likely to be * held. diff --git a/fs/char_dev.c b/fs/char_dev.c index 57cc096c498a..c2ddb998f3c9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) /** * cdev_device_del() - inverse of cdev_device_add - * @dev: the device structure * @cdev: the cdev structure + * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. 
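The ceph_parse_new_source() hunk above extends the new-style mount source parsing: the text before '@' is now copied out with kstrndup() and used as the client entity name, while the cluster fsid and the file system name remain delimited by '.' and the '=' separator. The following userspace sketch shows that split, assuming the "name@fsid.fsname=/path" layout implied by the patch; it is illustrative only and is not the kernel parser.

/*
 * Userspace sketch of the device-string split performed by
 * ceph_parse_new_source(), e.g. "admin@<cluster fsid>.cephfs=/".
 * Error cases mirror the messages in the hunk above.
 */
#include <stdio.h>
#include <string.h>

static int parse_new_source(const char *dev_name)
{
	const char *eq = strrchr(dev_name, '=');
	const char *at = strchr(dev_name, '@');
	const char *dot;

	if (!eq)
		return -1;	/* separator '=' missing in source */
	if (!at || at > eq)
		return -1;	/* missing cluster fsid */
	dot = strchr(at + 1, '.');
	if (!dot || dot > eq)
		return -1;	/* missing file system name */

	printf("entity name : %.*s\n", (int)(at - dev_name), dev_name);
	printf("cluster fsid: %.*s\n", (int)(dot - at - 1), at + 1);
	printf("fs name     : %.*s\n", (int)(eq - dot - 1), dot + 1);
	printf("path        : %s\n", eq + 1);
	return 0;
}

int main(void)
{
	/* The fsid below is a made-up example value. */
	const char *src =
		"admin@11111111-2222-3333-4444-555555555555.cephfs=/";

	return parse_new_source(src) ? 1 : 0;
}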
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 8f0af4f62631..d5ef5469e4e6 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -80,6 +80,16 @@ #define ELF_HWCAP2 COMPAT_ELF_HWCAP2 #endif +#ifdef COMPAT_ELF_HWCAP3 +#undef ELF_HWCAP3 +#define ELF_HWCAP3 COMPAT_ELF_HWCAP3 +#endif + +#ifdef COMPAT_ELF_HWCAP4 +#undef ELF_HWCAP4 +#define ELF_HWCAP4 COMPAT_ELF_HWCAP4 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define ARCH_DLINFO COMPAT_ARCH_DLINFO diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index e710a1782382..0b969d0eb8ff 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -55,6 +55,8 @@ struct configfs_dirent { #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) +#define CONFIGFS_PINNED \ + (CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; @@ -73,8 +75,6 @@ extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); -extern void configfs_hash_and_remove(struct dentry * dir, const char * name); - extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); extern int configfs_setattr(struct mnt_idmap *idmap, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 43d6bde1adcc..7d10278db30d 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -207,7 +207,17 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren return ERR_PTR(-ENOENT); } sd->s_frag = get_fragment(frag); - list_add(&sd->s_sibling, &parent_sd->s_children); + + /* + * configfs_lookup scans only for unpinned items. s_children is + * partitioned so that configfs_lookup can bail out early. + * CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are not symmetrical. readdir + * cursors still need to be inserted at the front of the list. + */ + if (sd->s_type & CONFIGFS_PINNED) + list_add_tail(&sd->s_sibling, &parent_sd->s_children); + else + list_add(&sd->s_sibling, &parent_sd->s_children); spin_unlock(&configfs_dirent_lock); return sd; @@ -220,10 +230,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren * * called with parent inode's i_mutex held */ -static int configfs_dirent_exists(struct configfs_dirent *parent_sd, - const unsigned char *new) +static int configfs_dirent_exists(struct dentry *dentry) { - struct configfs_dirent * sd; + struct configfs_dirent *parent_sd = dentry->d_parent->d_fsdata; + const unsigned char *new = dentry->d_name.name; + struct configfs_dirent *sd; list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (sd->s_element) { @@ -289,10 +300,6 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry, BUG_ON(!item); - error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name); - if (unlikely(error)) - return error; - error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, CONFIGFS_DIR | CONFIGFS_USET_CREATING, frag); @@ -451,6 +458,18 @@ static struct dentry * configfs_lookup(struct inode *dir, spin_lock(&configfs_dirent_lock); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + + /* + * s_children is partitioned, see configfs_new_dirent. 
The first + * pinned item indicates we can stop scanning. + */ + if (sd->s_type & CONFIGFS_PINNED) + break; + + /* + * Note: CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are asymmetric. + * there may be a readdir cursor in this list + */ if ((sd->s_type & CONFIGFS_NOT_PINNED) && !strcmp(configfs_get_name(sd), dentry->d_name.name)) { struct configfs_attribute *attr = sd->s_element; @@ -1885,8 +1904,11 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) if (dentry) { d_add(dentry, NULL); - err = configfs_attach_group(sd->s_element, &group->cg_item, - dentry, frag); + err = configfs_dirent_exists(dentry); + if (!err) + err = configfs_attach_group(sd->s_element, + &group->cg_item, + dentry, frag); if (err) { BUG_ON(d_inode(dentry)); d_drop(dentry); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index dcc22f593e43..1d2e3a5738d1 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -216,28 +216,3 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) spin_unlock(&dentry->d_lock); } } - -void configfs_hash_and_remove(struct dentry * dir, const char * name) -{ - struct configfs_dirent * sd; - struct configfs_dirent * parent_sd = dir->d_fsdata; - - if (d_really_is_negative(dir)) - /* no inode means this hasn't been made visible yet */ - return; - - inode_lock(d_inode(dir)); - list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (!sd->s_element) - continue; - if (!strcmp(configfs_get_name(sd), name)) { - spin_lock(&configfs_dirent_lock); - list_del_init(&sd->s_sibling); - spin_unlock(&configfs_dirent_lock); - configfs_drop_dentry(sd, dir); - configfs_put(sd); - break; - } - } - inode_unlock(d_inode(dir)); -} diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 206835e31efa..787e9c8938ba 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -22,6 +22,7 @@ #include <crypto/skcipher.h> #include <linux/key-type.h> #include <linux/random.h> +#include <linux/once.h> #include <linux/seq_file.h> #include "fscrypt_private.h" @@ -1262,35 +1262,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); + loff_t copy_pos = iter->pos; + u64 copy_len = iomap_length(iter); + u32 mod; int id = 0; s64 ret = 0; void *daddr = NULL, *saddr = NULL; - /* don't bother with blocks that are not shared to start with */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; + if (!iomap_want_unshare_iter(iter)) + return iomap_length(iter); + + /* + * Extend the file range to be aligned to fsblock/pagesize, because + * we need to copy entire blocks, not just the byte range specified. + * Invalidate the mapping because we're about to CoW. 
+ */ + mod = offset_in_page(copy_pos); + if (mod) { + copy_len += mod; + copy_pos -= mod; + } + + mod = offset_in_page(copy_pos + copy_len); + if (mod) + copy_len += PAGE_SIZE - mod; + + invalidate_inode_pages2_range(iter->inode->i_mapping, + copy_pos >> PAGE_SHIFT, + (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); - ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; - /* zero the distance if srcmap is HOLE or UNWRITTEN */ - if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { - memset(daddr, 0, length); - dax_flush(iomap->dax_dev, daddr, length); - ret = length; - goto out_unlock; - } - - ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, length) == 0) - ret = length; + if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) + ret = iomap_length(iter); else ret = -EIO; diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d0..1a01d7a6a7a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -135,6 +135,7 @@ struct dentry_stat_t { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ @@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_dcache_sysctls(void) @@ -1089,7 +1099,7 @@ void shrink_dentry_list(struct list_head *list) } static enum lru_status dentry_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -1170,7 +1180,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -1671,9 +1681,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ - dentry->d_lockref.count = 1; dentry->d_flags = 0; - spin_lock_init(&dentry->d_lock); + lockref_init(&dentry->d_lockref, 1); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; @@ -2039,8 +2048,8 @@ EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the @@ -2093,8 +2102,8 @@ 
EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name - * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false @@ -2401,6 +2410,8 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 67299e8b734e..16e198a26339 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -64,22 +64,13 @@ const struct file_operations *debugfs_real_fops(const struct file *filp) } EXPORT_SYMBOL_GPL(debugfs_real_fops); -/** - * debugfs_file_get - mark the beginning of file data access - * @dentry: the dentry object whose data is being accessed. - * - * Up to a matching call to debugfs_file_put(), any successive call - * into the file removing functions debugfs_remove() and - * debugfs_remove_recursive() will block. Since associated private - * file data may only get freed after a successful return of any of - * the removal functions, you may safely access it after a successful - * call to debugfs_file_get() without worrying about lifetime issues. - * - * If -%EIO is returned, the file has already been removed and thus, - * it is not safe to access any of its data. If, on the other hand, - * it is allowed to access the file data, zero is returned. - */ -int debugfs_file_get(struct dentry *dentry) +enum dbgfs_get_mode { + DBGFS_GET_ALREADY, + DBGFS_GET_REGULAR, + DBGFS_GET_SHORT, +}; + +static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) { struct debugfs_fsdata *fsd; void *d_fsd; @@ -96,12 +87,22 @@ int debugfs_file_get(struct dentry *dentry) if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { + if (WARN_ON(mode == DBGFS_GET_ALREADY)) + return -EINVAL; + fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; - fsd->real_fops = (void *)((unsigned long)d_fsd & - ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + if (mode == DBGFS_GET_SHORT) { + fsd->real_fops = NULL; + fsd->short_fops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + } else { + fsd->real_fops = (void *)((unsigned long)d_fsd & + ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); + fsd->short_fops = NULL; + } refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); @@ -130,6 +131,26 @@ int debugfs_file_get(struct dentry *dentry) return 0; } + +/** + * debugfs_file_get - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * + * Up to a matching call to debugfs_file_put(), any successive call + * into the file removing functions debugfs_remove() and + * debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_file_get() without worrying about lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. 
+ */ +int debugfs_file_get(struct dentry *dentry) +{ + return __debugfs_file_get(dentry, DBGFS_GET_ALREADY); +} EXPORT_SYMBOL_GPL(debugfs_file_get); /** @@ -241,9 +262,10 @@ static int debugfs_locked_down(struct inode *inode, { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && - !real_fops->unlocked_ioctl && - !real_fops->compat_ioctl && - !real_fops->mmap) + (!real_fops || + (!real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap))) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) @@ -258,7 +280,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) const struct file_operations *real_fops = NULL; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; @@ -316,19 +338,38 @@ static ret_type full_proxy_ ## name(proto) \ return r; \ } -FULL_PROXY_FUNC(llseek, loff_t, filp, - PROTO(struct file *filp, loff_t offset, int whence), - ARGS(filp, offset, whence)); +#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \ +static ret_type full_proxy_ ## name(proto) \ +{ \ + struct dentry *dentry = F_DENTRY(filp); \ + struct debugfs_fsdata *fsd; \ + ret_type r; \ + \ + r = debugfs_file_get(dentry); \ + if (unlikely(r)) \ + return r; \ + fsd = dentry->d_fsdata; \ + if (fsd->real_fops) \ + r = fsd->real_fops->name(args); \ + else \ + r = fsd->short_fops->name(args); \ + debugfs_file_put(dentry); \ + return r; \ +} + +FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence)); -FULL_PROXY_FUNC(read, ssize_t, filp, - PROTO(struct file *filp, char __user *buf, size_t size, - loff_t *ppos), - ARGS(filp, buf, size, ppos)); +FULL_PROXY_FUNC_BOTH(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); -FULL_PROXY_FUNC(write, ssize_t, filp, - PROTO(struct file *filp, const char __user *buf, size_t size, - loff_t *ppos), - ARGS(filp, buf, size, ppos)); +FULL_PROXY_FUNC_BOTH(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos), + ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), @@ -363,7 +404,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. 
*/ - if (real_fops->release) + if (real_fops && real_fops->release) r = real_fops->release(inode, filp); replace_fops(filp, d_inode(dentry)->i_fop); @@ -373,39 +414,49 @@ static int full_proxy_release(struct inode *inode, struct file *filp) } static void __full_proxy_fops_init(struct file_operations *proxy_fops, - const struct file_operations *real_fops) + struct debugfs_fsdata *fsd) { proxy_fops->release = full_proxy_release; - if (real_fops->llseek) + + if ((fsd->real_fops && fsd->real_fops->llseek) || + (fsd->short_fops && fsd->short_fops->llseek)) proxy_fops->llseek = full_proxy_llseek; - if (real_fops->read) + + if ((fsd->real_fops && fsd->real_fops->read) || + (fsd->short_fops && fsd->short_fops->read)) proxy_fops->read = full_proxy_read; - if (real_fops->write) + + if ((fsd->real_fops && fsd->real_fops->write) || + (fsd->short_fops && fsd->short_fops->write)) proxy_fops->write = full_proxy_write; - if (real_fops->poll) + + if (fsd->real_fops && fsd->real_fops->poll) proxy_fops->poll = full_proxy_poll; - if (real_fops->unlocked_ioctl) + + if (fsd->real_fops && fsd->real_fops->unlocked_ioctl) proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } -static int full_proxy_open(struct inode *inode, struct file *filp) +static int full_proxy_open(struct inode *inode, struct file *filp, + enum dbgfs_get_mode mode) { struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = NULL; + const struct file_operations *real_fops; struct file_operations *proxy_fops = NULL; + struct debugfs_fsdata *fsd; int r; - r = debugfs_file_get(dentry); + r = __debugfs_file_get(dentry, mode); if (r) return r == -EIO ? -ENOENT : r; - real_fops = debugfs_real_fops(filp); - + fsd = dentry->d_fsdata; + real_fops = fsd->real_fops; r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; - if (!fops_get(real_fops)) { + if (real_fops && !fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { @@ -426,11 +477,14 @@ static int full_proxy_open(struct inode *inode, struct file *filp) r = -ENOMEM; goto free_proxy; } - __full_proxy_fops_init(proxy_fops, real_fops); + __full_proxy_fops_init(proxy_fops, fsd); replace_fops(filp, proxy_fops); - if (real_fops->open) { - r = real_fops->open(inode, filp); + if (!real_fops || real_fops->open) { + if (real_fops) + r = real_fops->open(inode, filp); + else + r = simple_open(inode, filp); if (r) { replace_fops(filp, d_inode(dentry)->i_fop); goto free_proxy; @@ -451,8 +505,22 @@ out: return r; } +static int full_proxy_open_regular(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_REGULAR); +} + const struct file_operations debugfs_full_proxy_file_operations = { - .open = full_proxy_open, + .open = full_proxy_open_regular, +}; + +static int full_proxy_open_short(struct inode *inode, struct file *filp) +{ + return full_proxy_open(inode, filp, DBGFS_GET_SHORT); +} + +const struct file_operations debugfs_full_short_proxy_file_operations = { + .open = full_proxy_open_short, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 66d9b3b4c588..e752009de929 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -229,7 +229,7 @@ static void debugfs_release_dentry(struct dentry *dentry) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ - if (fsd && fsd->real_fops) { + if (fsd && (fsd->real_fops || fsd->short_fops)) { WARN_ON(!list_empty(&fsd->cancellations)); 
mutex_destroy(&fsd->cancellations_mtx); } @@ -412,7 +412,7 @@ static struct dentry *end_creating(struct dentry *dentry) static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *proxy_fops, - const struct file_operations *real_fops) + const void *real_fops) { struct dentry *dentry; struct inode *inode; @@ -450,49 +450,35 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, return end_creating(dentry); } -/** - * debugfs_create_file - create a file in the debugfs filesystem - * @name: a pointer to a string containing the name of the file to create. - * @mode: the permission that the file should have. - * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this parameter is NULL, then the - * file will be created in the root of the debugfs filesystem. - * @data: a pointer to something that the caller will want to get to later - * on. The inode.i_private pointer will point to this value on - * the open() call. - * @fops: a pointer to a struct file_operations that should be used for - * this file. - * - * This is the basic "create a file" function for debugfs. It allows for a - * wide range of flexibility in creating a file, or a directory (if you want - * to create a directory, the debugfs_create_dir() function is - * recommended to be used instead.) - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. - * - * NOTE: it's expected that most callers should _ignore_ the errors returned - * by this function. Other debugfs functions handle the fact that the "dentry" - * passed to them could be an error and they don't crash in that case. - * Drivers should generally work fine even if debugfs fails to init anyway. - */ -struct dentry *debugfs_create_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops) +struct dentry *debugfs_create_file_full(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) { + if (WARN_ON((unsigned long)fops & + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) + return ERR_PTR(-EINVAL); return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_full_proxy_file_operations : &debugfs_noop_file_operations, fops); } -EXPORT_SYMBOL_GPL(debugfs_create_file); +EXPORT_SYMBOL_GPL(debugfs_create_file_full); + +struct dentry *debugfs_create_file_short(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct debugfs_short_fops *fops) +{ + if (WARN_ON((unsigned long)fops & + DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) + return ERR_PTR(-EINVAL); + + return __debugfs_create_file(name, mode, parent, data, + fops ? 
&debugfs_full_short_proxy_file_operations : + &debugfs_noop_file_operations, + fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_file_short); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index dae80c2a469e..bbae4a228ef4 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -15,9 +15,11 @@ struct file_operations; extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; +extern const struct file_operations debugfs_full_short_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; + const struct debugfs_short_fops *short_fops; union { /* automount_fn is used when real_fops is NULL */ debugfs_automount_t automount; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 742b30b61c19..0fe8d80ce5e8 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -30,7 +30,7 @@ static void dlm_run_callback(uint32_t ls_id, uint32_t lkb_id, int8_t mode, trace_dlm_bast(ls_id, lkb_id, mode, res_name, res_length); bastfn(astparam, mode); } else if (flags & DLM_CB_CAST) { - trace_dlm_ast(ls_id, lkb_id, sb_status, sb_flags, res_name, + trace_dlm_ast(ls_id, lkb_id, sb_flags, sb_status, res_name, res_length); lksb->sb_status = sb_status; lksb->sb_flags = sb_flags; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index eac96f1c1d74..b2f21aa00719 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -24,9 +24,9 @@ #include "lowcomms.h" /* - * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>) * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight - * /config/dlm/<cluster>/comms/<comm>/nodeid + * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>) * /config/dlm/<cluster>/comms/<comm>/local * /config/dlm/<cluster>/comms/<comm>/addr (write only) * /config/dlm/<cluster>/comms/<comm>/addr_list (read only) @@ -73,20 +73,6 @@ const struct rhashtable_params dlm_rhash_rsb_params = { struct dlm_cluster { struct config_group group; - unsigned int cl_tcp_port; - unsigned int cl_buffer_size; - unsigned int cl_rsbtbl_size; - unsigned int cl_recover_timer; - unsigned int cl_toss_secs; - unsigned int cl_scan_secs; - unsigned int cl_log_debug; - unsigned int cl_log_info; - unsigned int cl_protocol; - unsigned int cl_mark; - unsigned int cl_new_rsb_count; - unsigned int cl_recover_callbacks; - char cl_cluster_name[DLM_LOCKSPACE_LEN]; - struct dlm_spaces *sps; struct dlm_comms *cms; }; @@ -115,25 +101,60 @@ enum { static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf) { - struct dlm_cluster *cl = config_item_to_cluster(item); - return sprintf(buf, "%s\n", cl->cl_cluster_name); + return sprintf(buf, "%s\n", dlm_config.ci_cluster_name); } static ssize_t cluster_cluster_name_store(struct config_item *item, const char *buf, size_t len) { - struct dlm_cluster *cl = config_item_to_cluster(item); - strscpy(dlm_config.ci_cluster_name, buf, - sizeof(dlm_config.ci_cluster_name)); - strscpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); + sizeof(dlm_config.ci_cluster_name)); return len; } CONFIGFS_ATTR(cluster_, cluster_name); -static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int (*check_cb)(unsigned int x), +static ssize_t cluster_tcp_port_show(struct config_item *item, char *buf) +{ + return sprintf(buf, "%u\n", 
be16_to_cpu(dlm_config.ci_tcp_port)); +} + +static int dlm_check_zero_and_dlm_running(unsigned int x) +{ + if (!x) + return -EINVAL; + + if (dlm_lowcomms_is_running()) + return -EBUSY; + + return 0; +} + +static ssize_t cluster_tcp_port_store(struct config_item *item, + const char *buf, size_t len) +{ + int rc; + u16 x; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rc = kstrtou16(buf, 0, &x); + if (rc) + return rc; + + rc = dlm_check_zero_and_dlm_running(x); + if (rc) + return rc; + + dlm_config.ci_tcp_port = cpu_to_be16(x); + return len; +} + +CONFIGFS_ATTR(cluster_, tcp_port); + +static ssize_t cluster_set(unsigned int *info_field, + int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -151,7 +172,6 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return rc; } - *cl_field = x; *info_field = x; return len; @@ -161,14 +181,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ - struct dlm_cluster *cl = config_item_to_cluster(item); \ - return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_cb, buf, len); \ + return cluster_set(&dlm_config.ci_##name, check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ - struct dlm_cluster *cl = config_item_to_cluster(item); \ - return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_config.ci_##name); \ } \ CONFIGFS_ATTR(cluster_, name); @@ -191,17 +208,6 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) return 0; } -static int dlm_check_zero_and_dlm_running(unsigned int x) -{ - if (!x) - return -EINVAL; - - if (dlm_lowcomms_is_running()) - return -EBUSY; - - return 0; -} - static int dlm_check_zero(unsigned int x) { if (!x) @@ -218,7 +224,6 @@ static int dlm_check_buffer_size(unsigned int x) return 0; } -CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running); CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); @@ -423,20 +428,6 @@ static struct config_group *make_cluster(struct config_group *g, configfs_add_default_group(&sps->ss_group, &cl->group); configfs_add_default_group(&cms->cs_group, &cl->group); - cl->cl_tcp_port = dlm_config.ci_tcp_port; - cl->cl_buffer_size = dlm_config.ci_buffer_size; - cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size; - cl->cl_recover_timer = dlm_config.ci_recover_timer; - cl->cl_toss_secs = dlm_config.ci_toss_secs; - cl->cl_scan_secs = dlm_config.ci_scan_secs; - cl->cl_log_debug = dlm_config.ci_log_debug; - cl->cl_log_info = dlm_config.ci_log_info; - cl->cl_protocol = dlm_config.ci_protocol; - cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; - cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; - memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, - DLM_LOCKSPACE_LEN); - space_list = &sps->ss_group; comm_list = &cms->cs_group; return &cl->group; @@ -517,6 +508,12 @@ static void release_space(struct config_item *i) static struct config_item *make_comm(struct config_group *g, const char *name) { struct dlm_comm *cm; + unsigned int nodeid; + int rv; + + rv = kstrtouint(name, 0, &nodeid); + if (rv) + return ERR_PTR(rv); cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); if (!cm) @@ -528,7 +525,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) if (!cm->seq) cm->seq = 
dlm_comm_count++; - cm->nodeid = -1; + cm->nodeid = nodeid; cm->local = 0; cm->addr_count = 0; cm->mark = 0; @@ -555,16 +552,25 @@ static void release_comm(struct config_item *i) static struct config_item *make_node(struct config_group *g, const char *name) { struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + unsigned int nodeid; struct dlm_node *nd; + uint32_t seq = 0; + int rv; + + rv = kstrtouint(name, 0, &nodeid); + if (rv) + return ERR_PTR(rv); nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); if (!nd) return ERR_PTR(-ENOMEM); config_item_init_type_name(&nd->item, name, &node_type); - nd->nodeid = -1; + nd->nodeid = nodeid; nd->weight = 1; /* default weight of 1 if none is set */ nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */ + dlm_comm_seq(nodeid, &seq, true); + nd->comm_seq = seq; mutex_lock(&sp->members_lock); list_add(&nd->list, &sp->members); @@ -622,16 +628,19 @@ void dlm_config_exit(void) static ssize_t comm_nodeid_show(struct config_item *item, char *buf) { - return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid); + unsigned int nodeid; + int rv; + + rv = kstrtouint(config_item_name(item), 0, &nodeid); + if (WARN_ON(rv)) + return rv; + + return sprintf(buf, "%u\n", nodeid); } static ssize_t comm_nodeid_store(struct config_item *item, const char *buf, size_t len) { - int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid); - - if (rc) - return rc; return len; } @@ -772,20 +781,19 @@ static struct configfs_attribute *comm_attrs[] = { static ssize_t node_nodeid_show(struct config_item *item, char *buf) { - return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid); + unsigned int nodeid; + int rv; + + rv = kstrtouint(config_item_name(item), 0, &nodeid); + if (WARN_ON(rv)) + return rv; + + return sprintf(buf, "%u\n", nodeid); } static ssize_t node_nodeid_store(struct config_item *item, const char *buf, size_t len) { - struct dlm_node *nd = config_item_to_node(item); - uint32_t seq = 0; - int rc = kstrtoint(buf, 0, &nd->nodeid); - - if (rc) - return rc; - dlm_comm_seq(nd->nodeid, &seq); - nd->comm_seq = seq; return len; } @@ -845,7 +853,7 @@ static struct dlm_comm *get_comm(int nodeid) if (!comm_list) return NULL; - mutex_lock(&clusters_root.subsys.su_mutex); + WARN_ON_ONCE(!mutex_is_locked(&clusters_root.subsys.su_mutex)); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { cm = config_item_to_comm(i); @@ -856,7 +864,6 @@ static struct dlm_comm *get_comm(int nodeid) config_item_get(i); break; } - mutex_unlock(&clusters_root.subsys.su_mutex); if (!found) cm = NULL; @@ -916,11 +923,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, return rv; } -int dlm_comm_seq(int nodeid, uint32_t *seq) +int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked) { - struct dlm_comm *cm = get_comm(nodeid); + struct dlm_comm *cm; + + if (locked) { + cm = get_comm(nodeid); + } else { + mutex_lock(&clusters_root.subsys.su_mutex); + cm = get_comm(nodeid); + mutex_unlock(&clusters_root.subsys.su_mutex); + } if (!cm) return -EEXIST; + *seq = cm->seq; put_comm(cm); return 0; @@ -957,7 +973,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_CLUSTER_NAME "" struct dlm_config_info dlm_config = { - .ci_tcp_port = DEFAULT_TCP_PORT, + .ci_tcp_port = cpu_to_be16(DEFAULT_TCP_PORT), .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index ed237d910208..e48c4f9686d3 100644 --- 
a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -29,18 +29,18 @@ extern const struct rhashtable_params dlm_rhash_rsb_params; #define DLM_PROTO_SCTP 1 struct dlm_config_info { - int ci_tcp_port; - int ci_buffer_size; - int ci_rsbtbl_size; - int ci_recover_timer; - int ci_toss_secs; - int ci_scan_secs; - int ci_log_debug; - int ci_log_info; - int ci_protocol; - int ci_mark; - int ci_new_rsb_count; - int ci_recover_callbacks; + __be16 ci_tcp_port; + unsigned int ci_buffer_size; + unsigned int ci_rsbtbl_size; + unsigned int ci_recover_timer; + unsigned int ci_toss_secs; + unsigned int ci_scan_secs; + unsigned int ci_log_debug; + unsigned int ci_log_info; + unsigned int ci_protocol; + unsigned int ci_mark; + unsigned int ci_new_rsb_count; + unsigned int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; }; @@ -50,7 +50,7 @@ int dlm_config_init(void); void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); -int dlm_comm_seq(int nodeid, uint32_t *seq); +int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 865dc70a9dfc..fc1d710166e9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1703,19 +1703,11 @@ static int msg_reply_type(int mstype) /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ -static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) +static void add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - int error = 0; spin_lock_bh(&ls->ls_waiters_lock); - - if (is_overlap_unlock(lkb) || - (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) { - error = -EINVAL; - goto out; - } - if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) { switch (mstype) { case DLM_MSG_UNLOCK: @@ -1725,7 +1717,11 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); break; default: - error = -EBUSY; + /* should never happen as validate_lock_args() checks + * on lkb_wait_type and validate_unlock_args() only + * creates UNLOCK or CANCEL messages. 
+ */ + WARN_ON_ONCE(1); goto out; } lkb->lkb_wait_count++; @@ -1747,12 +1743,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); out: - if (error) - log_error(ls, "addwait error %x %d flags %x %d %d %s", - lkb->lkb_id, error, dlm_iflags_val(lkb), mstype, - lkb->lkb_wait_type, lkb->lkb_resource->res_name); spin_unlock_bh(&ls->ls_waiters_lock); - return error; } /* We clear the RESEND flag because we might be taking an lkb off the waiters @@ -2861,16 +2852,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, case -EINVAL: /* annoy the user because dlm usage is wrong */ WARN_ON(1); - log_error(ls, "%s %d %x %x %x %d %d %s", __func__, + log_error(ls, "%s %d %x %x %x %d %d", __func__, rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, - lkb->lkb_status, lkb->lkb_wait_type, - lkb->lkb_resource->res_name); + lkb->lkb_status, lkb->lkb_wait_type); break; default: - log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, + log_debug(ls, "%s %d %x %x %x %d %d", __func__, rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, - lkb->lkb_status, lkb->lkb_wait_type, - lkb->lkb_resource->res_name); + lkb->lkb_status, lkb->lkb_wait_type); break; } @@ -2928,13 +2917,16 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) goto out; } + if (is_overlap_unlock(lkb)) + goto out; + /* cancel not allowed with another cancel/unlock in progress */ if (args->flags & DLM_LKF_CANCEL) { if (lkb->lkb_exflags & DLM_LKF_CANCEL) goto out; - if (is_overlap(lkb)) + if (is_overlap_cancel(lkb)) goto out; if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { @@ -2972,9 +2964,6 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK) goto out; - if (is_overlap_unlock(lkb)) - goto out; - if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); rv = -EBUSY; @@ -3610,10 +3599,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) to_nodeid = r->res_nodeid; - error = add_to_waiters(lkb, mstype, to_nodeid); - if (error) - return error; - + add_to_waiters(lkb, mstype, to_nodeid); error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); if (error) goto fail; @@ -3716,10 +3702,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) to_nodeid = dlm_dir_nodeid(r); - error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); - if (error) - return error; - + add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); if (error) goto fail; @@ -5016,16 +4999,19 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms_local) { if (middle_conversion(lkb)) { + log_rinfo(ls, "%s %x middle convert in progress", __func__, + lkb->lkb_id); + + /* We sent this lock to the new master. The new master will + * tell us when it's granted. We no longer need a reply, so + * use a fake reply to put the lkb into the right state. 
+ */ hold_lkb(lkb); memset(ms_local, 0, sizeof(struct dlm_message)); ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS)); ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); _receive_convert_reply(lkb, ms_local, true); - - /* Same special case as in receive_rcom_lock_args() */ - lkb->lkb_grmode = DLM_LOCK_IV; - rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT); unhold_lkb(lkb); } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { @@ -5572,10 +5558,11 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, The real granted mode of these converting locks cannot be determined until all locks have been rebuilt on the rsb (recover_conversion) */ - if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) && - middle_conversion(lkb)) { - rl->rl_status = DLM_LKSTS_CONVERT; - lkb->lkb_grmode = DLM_LOCK_IV; + if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) { + /* We may need to adjust grmode depending on other granted locks. */ + log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x", + __func__, lkb->lkb_id, lkb->lkb_grmode, + lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid); rsb_set_flag(r, RSB_RECOVER_CONVERT); } @@ -6344,8 +6331,8 @@ int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id, if (error) return error; - error = add_to_waiters(lkb, mstype, to_nodeid); + add_to_waiters(lkb, mstype, to_nodeid); dlm_put_lkb(lkb); - return error; + return 0; } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cb3a10b041c2..df40c3fd1070 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -660,18 +660,18 @@ static void add_sock(struct socket *sock, struct connection *con) /* Add the port number to an IPv6 or 4 sockaddr and return the address length */ -static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, +static void make_sockaddr(struct sockaddr_storage *saddr, __be16 port, int *addr_len) { saddr->ss_family = dlm_local_addr[0].ss_family; if (saddr->ss_family == AF_INET) { struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; - in4_addr->sin_port = cpu_to_be16(port); + in4_addr->sin_port = port; *addr_len = sizeof(struct sockaddr_in); memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); } else { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; - in6_addr->sin6_port = cpu_to_be16(port); + in6_addr->sin6_port = port; *addr_len = sizeof(struct sockaddr_in6); } memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); @@ -1121,7 +1121,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) /* * sctp_bind_addrs - bind a SCTP socket to all our addresses */ -static int sctp_bind_addrs(struct socket *sock, uint16_t port) +static int sctp_bind_addrs(struct socket *sock, __be16 port) { struct sockaddr_storage localaddr; struct sockaddr *addr = (struct sockaddr *)&localaddr; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index c9661906568a..b0864c93230f 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -493,7 +493,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) we consider the node to have failed (versus being removed due to dlm_release_lockspace) */ - error = dlm_comm_seq(memb->nodeid, &seq); + error = dlm_comm_seq(memb->nodeid, &seq, false); if (!error && seq == memb->comm_seq) return; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 2e1169c81c6e..be4240f09abd 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -811,33 +811,42 @@ 
static void recover_lvb(struct dlm_rsb *r) } /* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks - converting PR->CW or CW->PR need to have their lkb_grmode set. */ + * converting PR->CW or CW->PR may need to have their lkb_grmode changed. + */ static void recover_conversion(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; + uint32_t other_lkid = 0; + int other_grmode = -1; struct dlm_lkb *lkb; - int grmode = -1; list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { if (lkb->lkb_grmode == DLM_LOCK_PR || lkb->lkb_grmode == DLM_LOCK_CW) { - grmode = lkb->lkb_grmode; + other_grmode = lkb->lkb_grmode; + other_lkid = lkb->lkb_id; break; } } + if (other_grmode == -1) + return; + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { - if (lkb->lkb_grmode != DLM_LOCK_IV) - continue; - if (grmode == -1) { - log_debug(ls, "recover_conversion %x set gr to rq %d", - lkb->lkb_id, lkb->lkb_rqmode); - lkb->lkb_grmode = lkb->lkb_rqmode; - } else { - log_debug(ls, "recover_conversion %x set gr %d", - lkb->lkb_id, grmode); - lkb->lkb_grmode = grmode; + /* Lock recovery created incompatible granted modes, so + * change the granted mode of the converting lock to + * NL. The rqmode of the converting lock should be CW, + * which means the converting lock should be granted at + * the end of recovery. + */ + if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) || + ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) { + log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL", + __func__, lkb->lkb_id, lkb->lkb_grmode, + lkb->lkb_rqmode, lkb->lkb_nodeid, + lkb->lkb_remid, other_lkid, other_grmode); + lkb->lkb_grmode = DLM_LOCK_NL; } } } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 34f4f9f49a6c..12272a8f6d75 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -151,7 +151,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members(ls, rv, &neg); if (error) { log_rinfo(ls, "dlm_recover_members error %d", error); - goto fail; + goto fail_root_list; } dlm_recover_dir_nodeid(ls, &root_list); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 827278525fd9..69536cacdea8 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -328,10 +328,10 @@ out: * Convert an eCryptfs page index into a lower byte offset */ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, - struct page *page) + struct folio *folio) { return ecryptfs_lower_header_size(crypt_stat) + - ((loff_t)page->index << PAGE_SHIFT); + (loff_t)folio->index * PAGE_SIZE; } /** @@ -340,6 +340,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, * encryption operation * @dst_page: The page to write the result into * @src_page: The page to read from + * @page_index: The offset in the file (in units of PAGE_SIZE) * @extent_offset: Page extent offset for use in generating IV * @op: ENCRYPT or DECRYPT to indicate the desired operation * @@ -350,9 +351,9 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat, static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, struct page *dst_page, struct page *src_page, + pgoff_t page_index, unsigned long extent_offset, int op) { - pgoff_t page_index = op == ENCRYPT ? 
src_page->index : dst_page->index; loff_t extent_base; char extent_iv[ECRYPTFS_MAX_IV_BYTES]; struct scatterlist src_sg, dst_sg; @@ -392,7 +393,7 @@ out: /** * ecryptfs_encrypt_page - * @page: Page mapped from the eCryptfs inode for the file; contains + * @folio: Folio mapped from the eCryptfs inode for the file; contains * decrypted content that needs to be encrypted (to a temporary * page; not in place) and written out to the lower file * @@ -406,7 +407,7 @@ out: * * Returns zero on success; negative on error */ -int ecryptfs_encrypt_page(struct page *page) +int ecryptfs_encrypt_page(struct folio *folio) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -416,7 +417,7 @@ int ecryptfs_encrypt_page(struct page *page) loff_t lower_offset; int rc = 0; - ecryptfs_inode = page->mapping->host; + ecryptfs_inode = folio->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); @@ -431,8 +432,9 @@ int ecryptfs_encrypt_page(struct page *page) for (extent_offset = 0; extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { - rc = crypt_extent(crypt_stat, enc_extent_page, page, - extent_offset, ENCRYPT); + rc = crypt_extent(crypt_stat, enc_extent_page, + folio_page(folio, 0), folio->index, + extent_offset, ENCRYPT); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " "rc = [%d]\n", __func__, rc); @@ -440,7 +442,7 @@ int ecryptfs_encrypt_page(struct page *page) } } - lower_offset = lower_offset_for_page(crypt_stat, page); + lower_offset = lower_offset_for_page(crypt_stat, folio); enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); @@ -461,7 +463,7 @@ out: /** * ecryptfs_decrypt_page - * @page: Page mapped from the eCryptfs inode for the file; data read + * @folio: Folio mapped from the eCryptfs inode for the file; data read * and decrypted from the lower file will be written into this * page * @@ -475,7 +477,7 @@ out: * * Returns zero on success; negative on error */ -int ecryptfs_decrypt_page(struct page *page) +int ecryptfs_decrypt_page(struct folio *folio) { struct inode *ecryptfs_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -484,13 +486,13 @@ int ecryptfs_decrypt_page(struct page *page) loff_t lower_offset; int rc = 0; - ecryptfs_inode = page->mapping->host; + ecryptfs_inode = folio->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); - lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap_local_page(page); + lower_offset = lower_offset_for_page(crypt_stat, folio); + page_virt = kmap_local_folio(folio, 0); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); kunmap_local(page_virt); @@ -504,8 +506,9 @@ int ecryptfs_decrypt_page(struct page *page) for (extent_offset = 0; extent_offset < (PAGE_SIZE / crypt_stat->extent_size); extent_offset++) { - rc = crypt_extent(crypt_stat, page, page, - extent_offset, DECRYPT); + struct page *page = folio_page(folio, 0); + rc = crypt_extent(crypt_stat, page, page, folio->index, + extent_offset, DECRYPT); if (rc) { printk(KERN_ERR "%s: Error decrypting extent; " "rc = [%d]\n", __func__, rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c586c5db18b5..1f562e75d0e4 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -569,8 +569,8 @@ void 
ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); -int ecryptfs_encrypt_page(struct page *page); -int ecryptfs_decrypt_page(struct page *page); +int ecryptfs_encrypt_page(struct folio *folio); +int ecryptfs_decrypt_page(struct folio *folio); int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); @@ -653,16 +653,15 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size); int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, - struct page *page_for_lower, + struct folio *folio_for_lower, size_t offset_in_page, size_t size); int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); -int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, +int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); -struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cbdf82f0183f..a9819ddb1ab8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, return rc; } -static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - if (flags & AT_GETATTR_NOSEC) - return vfs_getattr_nosec(path, stat, request_mask, flags); - return vfs_getattr(path, stat, request_mask, flags); -} - static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) @@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, struct kstat lower_stat; int rc; - rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), + &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 577c56302314..8dd1d7189c3b 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -15,10 +15,10 @@ #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> -#include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/magic.h> @@ -153,32 +153,30 @@ void ecryptfs_put_lower_file(struct inode *inode) } } -enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, - ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, - ecryptfs_opt_ecryptfs_key_bytes, - ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, - ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, - ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, - 
ecryptfs_opt_check_dev_ruid, - ecryptfs_opt_err }; - -static const match_table_t tokens = { - {ecryptfs_opt_sig, "sig=%s"}, - {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, - {ecryptfs_opt_cipher, "cipher=%s"}, - {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, - {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, - {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, - {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, - {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, - {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, - {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, - {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, - {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, - {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, - {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, - {ecryptfs_opt_err, NULL} +enum { + Opt_sig, Opt_ecryptfs_sig, Opt_cipher, Opt_ecryptfs_cipher, + Opt_ecryptfs_key_bytes, Opt_passthrough, Opt_xattr_metadata, + Opt_encrypted_view, Opt_fnek_sig, Opt_fn_cipher, + Opt_fn_cipher_key_bytes, Opt_unlink_sigs, Opt_mount_auth_tok_only, + Opt_check_dev_ruid +}; + +static const struct fs_parameter_spec ecryptfs_fs_param_spec[] = { + fsparam_string ("sig", Opt_sig), + fsparam_string ("ecryptfs_sig", Opt_ecryptfs_sig), + fsparam_string ("cipher", Opt_cipher), + fsparam_string ("ecryptfs_cipher", Opt_ecryptfs_cipher), + fsparam_u32 ("ecryptfs_key_bytes", Opt_ecryptfs_key_bytes), + fsparam_flag ("ecryptfs_passthrough", Opt_passthrough), + fsparam_flag ("ecryptfs_xattr_metadata", Opt_xattr_metadata), + fsparam_flag ("ecryptfs_encrypted_view", Opt_encrypted_view), + fsparam_string ("ecryptfs_fnek_sig", Opt_fnek_sig), + fsparam_string ("ecryptfs_fn_cipher", Opt_fn_cipher), + fsparam_u32 ("ecryptfs_fn_key_bytes", Opt_fn_cipher_key_bytes), + fsparam_flag ("ecryptfs_unlink_sigs", Opt_unlink_sigs), + fsparam_flag ("ecryptfs_mount_auth_tok_only", Opt_mount_auth_tok_only), + fsparam_flag ("ecryptfs_check_dev_ruid", Opt_check_dev_ruid), + {} }; static int ecryptfs_init_global_auth_toks( @@ -219,19 +217,20 @@ static void ecryptfs_init_mount_crypt_stat( mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; } +struct ecryptfs_fs_context { + /* Mount option status trackers */ + bool check_ruid; + bool sig_set; + bool cipher_name_set; + bool cipher_key_bytes_set; + bool fn_cipher_name_set; + bool fn_cipher_key_bytes_set; +}; + /** - * ecryptfs_parse_options - * @sbi: The ecryptfs super block - * @options: The options passed to the kernel - * @check_ruid: set to 1 if device uid should be checked against the ruid - * - * Parse mount options: - * debug=N - ecryptfs_verbosity level for debug output - * sig=XXX - description(signature) of the key to use - * - * Returns the dentry object of the lower-level (lower/interposed) - * directory; We want to mount our stackable file system on top of - * that lower directory. + * ecryptfs_parse_param + * @fc: The ecryptfs filesystem context + * @param: The mount parameter to parse * * The signature of the key to use must be the description of a key * already in the keyring. 
Mounting will fail if the key can not be @@ -239,143 +238,118 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, - uid_t *check_ruid) +static int ecryptfs_parse_param( + struct fs_context *fc, + struct fs_parameter *param) { - char *p; - int rc = 0; - int sig_set = 0; - int cipher_name_set = 0; - int fn_cipher_name_set = 0; - int cipher_key_bytes; - int cipher_key_bytes_set = 0; - int fn_cipher_key_bytes; - int fn_cipher_key_bytes_set = 0; + int rc; + int opt; + struct fs_parse_result result; + struct ecryptfs_fs_context *ctx = fc->fs_private; + struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &sbi->mount_crypt_stat; - substring_t args[MAX_OPT_ARGS]; - int token; - char *sig_src; - char *cipher_name_src; - char *fn_cipher_name_src; - char *fnek_src; - char *cipher_key_bytes_src; - char *fn_cipher_key_bytes_src; - u8 cipher_code; - *check_ruid = 0; + opt = fs_parse(fc, ecryptfs_fs_param_spec, param, &result); + if (opt < 0) + return opt; - if (!options) { - rc = -EINVAL; - goto out; - } - ecryptfs_init_mount_crypt_stat(mount_crypt_stat); - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - token = match_token(p, tokens, args); - switch (token) { - case ecryptfs_opt_sig: - case ecryptfs_opt_ecryptfs_sig: - sig_src = args[0].from; - rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, - sig_src, 0); - if (rc) { - printk(KERN_ERR "Error attempting to register " - "global sig; rc = [%d]\n", rc); - goto out; - } - sig_set = 1; - break; - case ecryptfs_opt_cipher: - case ecryptfs_opt_ecryptfs_cipher: - cipher_name_src = args[0].from; - strscpy(mount_crypt_stat->global_default_cipher_name, - cipher_name_src); - cipher_name_set = 1; - break; - case ecryptfs_opt_ecryptfs_key_bytes: - cipher_key_bytes_src = args[0].from; - cipher_key_bytes = - (int)simple_strtol(cipher_key_bytes_src, - &cipher_key_bytes_src, 0); - mount_crypt_stat->global_default_cipher_key_size = - cipher_key_bytes; - cipher_key_bytes_set = 1; - break; - case ecryptfs_opt_passthrough: - mount_crypt_stat->flags |= - ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; - break; - case ecryptfs_opt_xattr_metadata: - mount_crypt_stat->flags |= - ECRYPTFS_XATTR_METADATA_ENABLED; - break; - case ecryptfs_opt_encrypted_view: - mount_crypt_stat->flags |= - ECRYPTFS_XATTR_METADATA_ENABLED; - mount_crypt_stat->flags |= - ECRYPTFS_ENCRYPTED_VIEW_ENABLED; - break; - case ecryptfs_opt_fnek_sig: - fnek_src = args[0].from; - strscpy(mount_crypt_stat->global_default_fnek_sig, - fnek_src); - rc = ecryptfs_add_global_auth_tok( - mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig, - ECRYPTFS_AUTH_TOK_FNEK); - if (rc) { - printk(KERN_ERR "Error attempting to register " - "global fnek sig [%s]; rc = [%d]\n", - mount_crypt_stat->global_default_fnek_sig, - rc); - goto out; - } - mount_crypt_stat->flags |= - (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES - | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); - break; - case ecryptfs_opt_fn_cipher: - fn_cipher_name_src = args[0].from; - strscpy(mount_crypt_stat->global_default_fn_cipher_name, - fn_cipher_name_src); - fn_cipher_name_set = 1; - break; - case ecryptfs_opt_fn_cipher_key_bytes: - fn_cipher_key_bytes_src = args[0].from; - fn_cipher_key_bytes = - (int)simple_strtol(fn_cipher_key_bytes_src, - &fn_cipher_key_bytes_src, 0); - mount_crypt_stat->global_default_fn_cipher_key_bytes = - fn_cipher_key_bytes; - fn_cipher_key_bytes_set = 1; - 
break; - case ecryptfs_opt_unlink_sigs: - mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; - break; - case ecryptfs_opt_mount_auth_tok_only: - mount_crypt_stat->flags |= - ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; - break; - case ecryptfs_opt_check_dev_ruid: - *check_ruid = 1; - break; - case ecryptfs_opt_err: - default: - printk(KERN_WARNING - "%s: eCryptfs: unrecognized option [%s]\n", - __func__, p); + switch (opt) { + case Opt_sig: + case Opt_ecryptfs_sig: + rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, + param->string, 0); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global sig; rc = [%d]\n", rc); + return rc; + } + ctx->sig_set = 1; + break; + case Opt_cipher: + case Opt_ecryptfs_cipher: + strscpy(mount_crypt_stat->global_default_cipher_name, + param->string); + ctx->cipher_name_set = 1; + break; + case Opt_ecryptfs_key_bytes: + mount_crypt_stat->global_default_cipher_key_size = + result.uint_32; + ctx->cipher_key_bytes_set = 1; + break; + case Opt_passthrough: + mount_crypt_stat->flags |= + ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; + break; + case Opt_xattr_metadata: + mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; + break; + case Opt_encrypted_view: + mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; + mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; + break; + case Opt_fnek_sig: + strscpy(mount_crypt_stat->global_default_fnek_sig, + param->string); + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, rc); + return rc; } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case Opt_fn_cipher: + strscpy(mount_crypt_stat->global_default_fn_cipher_name, + param->string); + ctx->fn_cipher_name_set = 1; + break; + case Opt_fn_cipher_key_bytes: + mount_crypt_stat->global_default_fn_cipher_key_bytes = + result.uint_32; + ctx->fn_cipher_key_bytes_set = 1; + break; + case Opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; + case Opt_mount_auth_tok_only: + mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; + case Opt_check_dev_ruid: + ctx->check_ruid = 1; + break; + default: + return -EINVAL; } - if (!sig_set) { + + return 0; +} + +static int ecryptfs_validate_options(struct fs_context *fc) +{ + int rc = 0; + u8 cipher_code; + struct ecryptfs_fs_context *ctx = fc->fs_private; + struct ecryptfs_sb_info *sbi = fc->s_fs_info; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + + + mount_crypt_stat = &sbi->mount_crypt_stat; + + if (!ctx->sig_set) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "You must supply at least one valid " "auth tok signature as a mount " "parameter; see the eCryptfs README\n"); goto out; } - if (!cipher_name_set) { + if (!ctx->cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); @@ -383,13 +357,13 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, ECRYPTFS_DEFAULT_CIPHER); } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - && !fn_cipher_name_set) + && !ctx->fn_cipher_name_set) strcpy(mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_cipher_name); - if (!cipher_key_bytes_set) + if (!ctx->cipher_key_bytes_set) 
mount_crypt_stat->global_default_cipher_key_size = 0; if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - && !fn_cipher_key_bytes_set) + && !ctx->fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; @@ -453,45 +427,35 @@ struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /* - * ecryptfs_mount - * @fs_type: The filesystem type that the superblock should belong to - * @flags: The flags associated with the mount - * @dev_name: The path to mount over - * @raw_data: The options passed into the kernel + * ecryptfs_get_tree + * @fc: The filesystem context */ -static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) +static int ecryptfs_get_tree(struct fs_context *fc) { struct super_block *s; - struct ecryptfs_sb_info *sbi; + struct ecryptfs_fs_context *ctx = fc->fs_private; + struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; - uid_t check_ruid; int rc; - sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); - if (!sbi) { - rc = -ENOMEM; - goto out; - } - - if (!dev_name) { + if (!fc->source) { rc = -EINVAL; err = "Device name cannot be null"; goto out; } - rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); + mount_crypt_stat = &sbi->mount_crypt_stat; + rc = ecryptfs_validate_options(fc); if (rc) { - err = "Error parsing options"; + err = "Error validating options"; goto out; } - mount_crypt_stat = &sbi->mount_crypt_stat; - s = sget(fs_type, NULL, set_anon_super, flags, NULL); + s = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; @@ -510,7 +474,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; - rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); + rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); goto out1; @@ -529,7 +493,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { + if (ctx->check_ruid && + !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", @@ -544,7 +509,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. 
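With ecryptfs_parse_param(), ecryptfs_get_tree() and ecryptfs_init_fs_context() wired up through fs_context in this patch, the filesystem also becomes reachable through the new mount API syscalls. A minimal userspace sketch follows; it is illustrative only: /lower, /upper and the signature value are placeholders, error handling is omitted, and raw syscall() is used where the C library does not provide wrappers.

/*
 * Sketch only: mounting the converted ecryptfs via fsopen(2)/fsconfig(2).
 * Constants come from <linux/mount.h>; the "sig" value is a placeholder
 * for an auth tok signature already present in the keyring.
 */
#include <fcntl.h>          /* AT_FDCWD */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>    /* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */

static int mount_ecryptfs_sketch(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "ecryptfs", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/lower", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "sig",
		"0123456789abcdef", 0);			/* placeholder sig */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG,
		"ecryptfs_unlink_sigs", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	return syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/upper",
		       MOVE_MOUNT_F_EMPTY_PATH);
}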
*/ - s->s_flags = flags & ~SB_POSIXACL; + s->s_flags = fc->sb_flags & ~SB_POSIXACL; s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** @@ -587,19 +552,19 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags root_info->lower_path = path; s->s_flags |= SB_ACTIVE; - return dget(s->s_root); + fc->root = dget(s->s_root); + return 0; out_free: path_put(&path); out1: deactivate_locked_super(s); out: - if (sbi) { + if (sbi) ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); - kmem_cache_free(ecryptfs_sb_info_cache, sbi); - } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); - return ERR_PTR(rc); + return rc; } /** @@ -618,10 +583,54 @@ static void ecryptfs_kill_block_super(struct super_block *sb) kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } +static void ecryptfs_free_fc(struct fs_context *fc) +{ + struct ecryptfs_fs_context *ctx = fc->fs_private; + struct ecryptfs_sb_info *sbi = fc->s_fs_info; + + kfree(ctx); + + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } +} + +static const struct fs_context_operations ecryptfs_context_ops = { + .free = ecryptfs_free_fc, + .parse_param = ecryptfs_parse_param, + .get_tree = ecryptfs_get_tree, + .reconfigure = NULL, +}; + +static int ecryptfs_init_fs_context(struct fs_context *fc) +{ + struct ecryptfs_fs_context *ctx; + struct ecryptfs_sb_info *sbi = NULL; + + ctx = kzalloc(sizeof(struct ecryptfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + kfree(ctx); + ctx = NULL; + return -ENOMEM; + } + + ecryptfs_init_mount_crypt_stat(&sbi->mount_crypt_stat); + + fc->fs_private = ctx; + fc->s_fs_info = sbi; + fc->ops = &ecryptfs_context_ops; + return 0; +} + static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", - .mount = ecryptfs_mount, + .init_fs_context = ecryptfs_init_fs_context, + .parameters = ecryptfs_fs_param_spec, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index ceda5555971a..60f0ac8744b5 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -23,47 +23,29 @@ #include "ecryptfs_kernel.h" /* - * ecryptfs_get_locked_page - * - * Get one page from cache or lower f/s, return error otherwise. - * - * Returns locked and up-to-date page (if ok), with increased - * refcnt. - */ -struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) -{ - struct page *page = read_mapping_page(inode->i_mapping, index, NULL); - if (!IS_ERR(page)) - lock_page(page); - return page; -} - -/** - * ecryptfs_writepage - * @page: Page that is locked before this call is made - * @wbc: Write-back control structure - * - * Returns zero on success; non-zero otherwise - * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. 
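The hunks in crypto.c, mmap.c and read_write.c consistently swap page-based helpers for their folio counterparts. The access pattern they converge on is roughly the following; this is a sketch under the assumption of order-0 (single-page) folios, which is all eCryptfs handles, and is not a hunk from this patch.

/*
 * Sketch of the folio access pattern used throughout this conversion:
 * byte offset 0 covers the whole folio when the folio is a single page.
 */
#include <linux/highmem.h>	/* kmap_local_folio(), kunmap_local() */
#include <linux/pagemap.h>	/* folio_mark_uptodate(), folio_size() */
#include <linux/cacheflush.h>	/* flush_dcache_folio() */
#include <linux/string.h>

static void touch_folio_sketch(struct folio *folio)
{
	char *virt = kmap_local_folio(folio, 0);	/* was kmap_local_page(page) */

	memset(virt, 0, folio_size(folio));
	kunmap_local(virt);
	flush_dcache_folio(folio);			/* was flush_dcache_page(page) */
	folio_mark_uptodate(folio);			/* was SetPageUptodate(page) */
}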
*/ -static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +static int ecryptfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - int rc; - - rc = ecryptfs_encrypt_page(page); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error encrypting " - "page (upper index [0x%.16lx])\n", page->index); - ClearPageUptodate(page); - goto out; + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + error = ecryptfs_encrypt_page(folio); + if (error) { + ecryptfs_printk(KERN_WARNING, + "Error encrypting folio (index [0x%.16lx])\n", + folio->index); + folio_clear_uptodate(folio); + mapping_set_error(mapping, error); + } + folio_unlock(folio); } - SetPageUptodate(page); -out: - unlock_page(page); - return rc; + + return error; } static void strip_xattr_flag(char *page_virt, @@ -97,7 +79,7 @@ static void strip_xattr_flag(char *page_virt, /** * ecryptfs_copy_up_encrypted_with_header - * @page: Sort of a ``virtual'' representation of the encrypted lower + * @folio: Sort of a ``virtual'' representation of the encrypted lower * file. The actual lower file does not have the metadata in * the header. This is locked. * @crypt_stat: The eCryptfs inode's cryptographic context @@ -106,7 +88,7 @@ static void strip_xattr_flag(char *page_virt, * seeing, with the header information inserted. */ static int -ecryptfs_copy_up_encrypted_with_header(struct page *page, +ecryptfs_copy_up_encrypted_with_header(struct folio *folio, struct ecryptfs_crypt_stat *crypt_stat) { loff_t extent_num_in_page = 0; @@ -115,9 +97,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, int rc = 0; while (extent_num_in_page < num_extents_per_page) { - loff_t view_extent_num = ((((loff_t)page->index) + loff_t view_extent_num = ((loff_t)folio->index * num_extents_per_page) - + extent_num_in_page); + + extent_num_in_page; size_t num_header_extents_at_front = (crypt_stat->metadata_size / crypt_stat->extent_size); @@ -125,21 +107,21 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_local_page(page); + page_virt = kmap_local_folio(folio, 0); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { size_t written; rc = ecryptfs_read_xattr_region( - page_virt, page->mapping->host); + page_virt, folio->mapping->host); strip_xattr_flag(page_virt + 16, crypt_stat); ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); } kunmap_local(page_virt); - flush_dcache_page(page); + flush_dcache_folio(folio); if (rc) { printk(KERN_ERR "%s: Error reading xattr " "region; rc = [%d]\n", __func__, rc); @@ -152,9 +134,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, - crypt_stat->metadata_size); rc = ecryptfs_read_lower_page_segment( - page, (lower_offset >> PAGE_SHIFT), + folio, (lower_offset >> PAGE_SHIFT), (lower_offset & ~PAGE_MASK), - crypt_stat->extent_size, page->mapping->host); + crypt_stat->extent_size, folio->mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "extent at offset [%lld] in the lower " @@ -180,55 +162,50 @@ out: */ static int ecryptfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; - int rc = 0; + &ecryptfs_inode_to_private(inode)->crypt_stat; + int err = 0; if 
(!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_SIZE, - page->mapping->host); + err = ecryptfs_read_lower_page_segment(folio, folio->index, 0, + folio_size(folio), inode); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { - rc = ecryptfs_copy_up_encrypted_with_header(page, - crypt_stat); - if (rc) { + err = ecryptfs_copy_up_encrypted_with_header(folio, + crypt_stat); + if (err) { printk(KERN_ERR "%s: Error attempting to copy " "the encrypted content from the lower " "file whilst inserting the metadata " - "from the xattr into the header; rc = " - "[%d]\n", __func__, rc); + "from the xattr into the header; err = " + "[%d]\n", __func__, err); goto out; } } else { - rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_SIZE, - page->mapping->host); - if (rc) { - printk(KERN_ERR "Error reading page; rc = " - "[%d]\n", rc); + err = ecryptfs_read_lower_page_segment(folio, + folio->index, 0, folio_size(folio), + inode); + if (err) { + printk(KERN_ERR "Error reading page; err = " + "[%d]\n", err); goto out; } } } else { - rc = ecryptfs_decrypt_page(page); - if (rc) { + err = ecryptfs_decrypt_page(folio); + if (err) { ecryptfs_printk(KERN_ERR, "Error decrypting page; " - "rc = [%d]\n", rc); + "err = [%d]\n", err); goto out; } } out: - if (rc) - ClearPageUptodate(page); - else - SetPageUptodate(page); - ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n", - page->index); - unlock_page(page); - return rc; + ecryptfs_printk(KERN_DEBUG, "Unlocking folio with index = [0x%.16lx]\n", + folio->index); + folio_end_read(folio, err == 0); + return err; } /* @@ -285,7 +262,7 @@ static int ecryptfs_write_begin(struct file *file, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - &folio->page, index, 0, PAGE_SIZE, mapping->host); + folio, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", @@ -297,7 +274,7 @@ static int ecryptfs_write_begin(struct file *file, } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { rc = ecryptfs_copy_up_encrypted_with_header( - &folio->page, crypt_stat); + folio, crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting " "to copy the encrypted content " @@ -311,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file, folio_mark_uptodate(folio); } else { rc = ecryptfs_read_lower_page_segment( - &folio->page, index, 0, PAGE_SIZE, + folio, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " @@ -328,7 +305,7 @@ static int ecryptfs_write_begin(struct file *file, folio_zero_range(folio, 0, PAGE_SIZE); folio_mark_uptodate(folio); } else if (len < PAGE_SIZE) { - rc = ecryptfs_decrypt_page(&folio->page); + rc = ecryptfs_decrypt_page(folio); if (rc) { printk(KERN_ERR "%s: Error decrypting " "page at index [%ld]; " @@ -477,7 +454,7 @@ static int ecryptfs_write_end(struct file *file, "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, - &folio->page, 0, to); + folio, 0, to); if (!rc) { rc = copied; fsstack_copy_inode_size(ecryptfs_inode, @@ -499,7 +476,7 @@ static int ecryptfs_write_end(struct file *file, "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = 
ecryptfs_encrypt_page(&folio->page); + rc = ecryptfs_encrypt_page(folio); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " "index [0x%.16lx])\n", index); @@ -548,9 +525,10 @@ const struct address_space_operations ecryptfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, #endif - .writepage = ecryptfs_writepage, + .writepages = ecryptfs_writepages, .read_folio = ecryptfs_read_folio, .write_begin = ecryptfs_write_begin, .write_end = ecryptfs_write_end, + .migrate_folio = filemap_migrate_folio, .bmap = ecryptfs_bmap, }; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 3458f153a588..b3b451c2b941 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -41,30 +41,29 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, /** * ecryptfs_write_lower_page_segment * @ecryptfs_inode: The eCryptfs inode - * @page_for_lower: The page containing the data to be written to the + * @folio_for_lower: The folio containing the data to be written to the * lower file - * @offset_in_page: The offset in the @page_for_lower from which to + * @offset_in_page: The offset in the @folio_for_lower from which to * start writing the data - * @size: The amount of data from @page_for_lower to write to the + * @size: The amount of data from @folio_for_lower to write to the * lower file * * Determines the byte offset in the file for the given page and * offset within the page, maps the page, and makes the call to write - * the contents of @page_for_lower to the lower inode. + * the contents of @folio_for_lower to the lower inode. * * Returns zero on success; non-zero otherwise */ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, - struct page *page_for_lower, + struct folio *folio_for_lower, size_t offset_in_page, size_t size) { char *virt; loff_t offset; int rc; - offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) - + offset_in_page); - virt = kmap_local_page(page_for_lower); + offset = (loff_t)folio_for_lower->index * PAGE_SIZE + offset_in_page; + virt = kmap_local_folio(folio_for_lower, 0); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; @@ -93,7 +92,6 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct page *ecryptfs_page; struct ecryptfs_crypt_stat *crypt_stat; char *ecryptfs_page_virt; loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); @@ -111,6 +109,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, else pos = offset; while (pos < (offset + size)) { + struct folio *ecryptfs_folio; pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT); size_t start_offset_in_page = (pos & ~PAGE_MASK); size_t num_bytes = (PAGE_SIZE - start_offset_in_page); @@ -130,17 +129,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; } - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, - ecryptfs_page_idx); - if (IS_ERR(ecryptfs_page)) { - rc = PTR_ERR(ecryptfs_page); + ecryptfs_folio = read_mapping_folio(ecryptfs_inode->i_mapping, + ecryptfs_page_idx, NULL); + if (IS_ERR(ecryptfs_folio)) { + rc = PTR_ERR(ecryptfs_folio); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " "mapping; rc = [%d]\n", __func__, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = 
kmap_local_page(ecryptfs_page); + folio_lock(ecryptfs_folio); + ecryptfs_page_virt = kmap_local_folio(ecryptfs_folio, 0); /* * pos: where we're now writing, offset: where the request was @@ -164,17 +164,17 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, data_offset += num_bytes; } kunmap_local(ecryptfs_page_virt); - flush_dcache_page(ecryptfs_page); - SetPageUptodate(ecryptfs_page); - unlock_page(ecryptfs_page); + flush_dcache_folio(ecryptfs_folio); + folio_mark_uptodate(ecryptfs_folio); + folio_unlock(ecryptfs_folio); if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) - rc = ecryptfs_encrypt_page(ecryptfs_page); + rc = ecryptfs_encrypt_page(ecryptfs_folio); else rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, - ecryptfs_page, + ecryptfs_folio, start_offset_in_page, data_offset); - put_page(ecryptfs_page); + folio_put(ecryptfs_folio); if (rc) { printk(KERN_ERR "%s: Error encrypting " "page; rc = [%d]\n", __func__, rc); @@ -228,7 +228,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, /** * ecryptfs_read_lower_page_segment - * @page_for_ecryptfs: The page into which data for eCryptfs will be + * @folio_for_ecryptfs: The folio into which data for eCryptfs will be * written * @page_index: Page index in @page_for_ecryptfs from which to start * writing @@ -243,7 +243,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, * * Returns zero on success; non-zero otherwise */ -int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, +int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode) @@ -252,12 +252,12 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, loff_t offset; int rc; - offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap_local_page(page_for_ecryptfs); + offset = (loff_t)page_index * PAGE_SIZE + offset_in_page; + virt = kmap_local_folio(folio_for_ecryptfs, 0); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; kunmap_local(virt); - flush_dcache_page(page_for_ecryptfs); + flush_dcache_folio(folio_for_ecryptfs); return rc; } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 586446e02ef7..ec23da8405ff 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -51,7 +51,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, * * VariableName-12345678-1234-1234-1234-1234567891bc */ -bool efivarfs_valid_name(const char *str, int len) +static bool efivarfs_valid_name(const char *str, int len) { const char *s = str + len - EFI_VARIABLE_GUID_LEN; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index d71d2e08422f..74f0602a9e01 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -60,7 +60,6 @@ bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, extern const struct file_operations efivarfs_file_operations; extern const struct inode_operations efivarfs_dir_inode_operations; -extern bool efivarfs_valid_name(const char *str, int len); extern struct inode *efivarfs_get_inode(struct super_block *sb, const struct inode *dir, int mode, dev_t dev, bool is_removable); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a929f1b613be..beba15673be8 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -144,9 +144,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) const unsigned char *s = qstr->name; unsigned int len = qstr->len; - if 
(!efivarfs_valid_name(s, len)) - return -EINVAL; - while (len-- > EFI_VARIABLE_GUID_LEN) hash = partial_name_hash(*s++, hash); diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 3cc89bb624f0..f7d43c847ee9 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -22,7 +22,7 @@ #include "internal.h" -MODULE_IMPORT_NS(EFIVAR); +MODULE_IMPORT_NS("EFIVAR"); static bool validate_device_path(efi_char16_t *var_name, int match, u8 *buffer, diff --git a/fs/efs/super.c b/fs/efs/super.c index e4421c10caeb..c59086b7eabf 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -15,7 +15,6 @@ #include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/fs_context.h> -#include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> @@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; -enum { - Opt_explicit_open, -}; - -static const struct fs_parameter_spec efs_param_spec[] = { - fsparam_flag ("explicit-open", Opt_explicit_open), - {} -}; - /* * File system definition and registration. */ @@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = { .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = efs_init_fs_context, - .parameters = efs_param_spec, }; MODULE_ALIAS_FS("efs"); @@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); - return -EINVAL; + return invalf(fc, "device does not support %d byte blocks\n", + EFS_BLOCKSIZE); } /* read the vh (volume header) block */ @@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) return 0; } -static void efs_free_fc(struct fs_context *fc) -{ - kfree(fc->fs_private); -} - static int efs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, efs_fill_super); } -static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - int token; - struct fs_parse_result result; - - token = fs_parse(fc, efs_param_spec, param, &result); - if (token < 0) - return token; - return 0; -} - static int efs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; return 0; } -struct efs_context { - unsigned long s_mount_opts; -}; - static const struct fs_context_operations efs_context_opts = { - .parse_param = efs_parse_param, .get_tree = efs_get_tree, .reconfigure = efs_reconfigure, - .free = efs_free_fc, }; /* @@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = { */ static int efs_init_fs_context(struct fs_context *fc) { - struct efs_context *ctx; - - ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - fc->fs_private = ctx; fc->ops = &efs_context_opts; return 0; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 61debd799cf9..0cd6b5c4df98 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -10,10 +10,10 @@ void erofs_unmap_metabuf(struct erofs_buf *buf) { - if (buf->kmap_type == EROFS_KMAP) - kunmap_local(buf->base); + if (!buf->base) + return; + kunmap_local(buf->base); buf->base = NULL; - buf->kmap_type = EROFS_NO_KMAP; } void erofs_put_metabuf(struct erofs_buf *buf) @@ -38,20 +38,13 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, } if (!folio || !folio_contains(folio, index)) { erofs_put_metabuf(buf); - folio = read_mapping_folio(buf->mapping, index, NULL); + folio = read_mapping_folio(buf->mapping, index, buf->file); if (IS_ERR(folio)) return folio; } 
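The erofs_buf changes above drop the explicit kmap_type bookkeeping: whether the buffer is mapped is now implied by buf->base alone. For reference, a typical read–use–release cycle against this interface looks roughly like the sketch below; it is illustrative, assumes the in-tree "internal.h" definitions, and is not a hunk from this patch.

/*
 * Sketch of typical metadata-buffer usage with the simplified interface:
 * mapping state is tracked by buf->base rather than a kmap_type field.
 */
#include "internal.h"	/* struct erofs_buf, erofs_bread(), EROFS_KMAP */

static int read_meta_sketch(struct super_block *sb, erofs_off_t pos)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *ptr;

	erofs_init_metabuf(&buf, sb);			/* bdev, backing file or fscache */
	ptr = erofs_bread(&buf, pos, EROFS_KMAP);	/* read the block + kmap_local it */
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	/* ... parse the on-disk metadata at ptr ... */
	erofs_put_metabuf(&buf);			/* kunmap_local + put the page */
	return 0;
}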
buf->page = folio_file_page(folio, index); - - if (buf->kmap_type == EROFS_NO_KMAP) { - if (type == EROFS_KMAP) - buf->base = kmap_local_page(buf->page); - buf->kmap_type = type; - } else if (buf->kmap_type != type) { - DBG_BUGON(1); - return ERR_PTR(-EFAULT); - } + if (!buf->base && type == EROFS_KMAP) + buf->base = kmap_local_page(buf->page); if (type == EROFS_NO_KMAP) return NULL; return buf->base + (offset & ~PAGE_MASK); @@ -61,10 +54,12 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (erofs_is_fileio_mode(sbi)) - buf->mapping = file_inode(sbi->fdev)->i_mapping; - else if (erofs_is_fscache_mode(sb)) - buf->mapping = sbi->s_fscache->inode->i_mapping; + buf->file = NULL; + if (erofs_is_fileio_mode(sbi)) { + buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ + buf->mapping = buf->file->f_mapping; + } else if (erofs_is_fscache_mode(sb)) + buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; } @@ -184,19 +179,13 @@ out: } static void erofs_fill_from_devinfo(struct erofs_map_dev *map, - struct erofs_device_info *dif) + struct super_block *sb, struct erofs_device_info *dif) { + map->m_sb = sb; + map->m_dif = dif; map->m_bdev = NULL; - map->m_fp = NULL; - if (dif->file) { - if (S_ISBLK(file_inode(dif->file)->i_mode)) - map->m_bdev = file_bdev(dif->file); - else - map->m_fp = dif->file; - } - map->m_daxdev = dif->dax_dev; - map->m_dax_part_off = dif->dax_part_off; - map->m_fscache = dif->fscache; + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) + map->m_bdev = file_bdev(dif->file); } int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) @@ -206,12 +195,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) erofs_off_t startoff, length; int id; - map->m_bdev = sb->s_bdev; - map->m_daxdev = EROFS_SB(sb)->dax_dev; - map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; - map->m_fscache = EROFS_SB(sb)->s_fscache; - map->m_fp = EROFS_SB(sb)->fdev; - + erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); + map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */ if (map->m_deviceid) { down_read(&devs->rwsem); dif = idr_find(&devs->tree, map->m_deviceid - 1); @@ -224,7 +209,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); up_read(&devs->rwsem); } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); @@ -237,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - erofs_fill_from_devinfo(map, dif); + erofs_fill_from_devinfo(map, sb, dif); break; } } @@ -307,7 +292,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->offset = map.m_la; if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_daxdev; + iomap->dax_dev = mdev.m_dif->dax_dev; else iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; @@ -336,7 +321,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = mdev.m_pa; if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dax_part_off; + iomap->addr += mdev.m_dif->dax_part_off; } return 0; } @@ -350,7 +335,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, struct erofs_buf buf = { .page = kmap_to_page(ptr), .base = ptr, - .kmap_type 
= EROFS_KMAP, }; DBG_BUGON(iomap->type != IOMAP_INLINE); @@ -411,22 +395,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif - if (iocb->ki_flags & IOCB_DIRECT) { - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = bdev_logical_block_size(bdev) - 1; - else - blksize_mask = i_blocksize(inode) - 1; - - if ((iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to)) & blksize_mask) - return -EINVAL; - + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0, NULL, 0); - } return filemap_read(iocb, to, 0); } @@ -473,8 +444,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) #define erofs_file_mmap generic_file_readonly_mmap #endif +static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + const struct iomap_ops *ops = &erofs_iomap_ops; + + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) +#ifdef CONFIG_EROFS_FS_ZIP + ops = &z_erofs_iomap_report_ops; +#else + return generic_file_llseek(file, offset, whence); +#endif + + if (whence == SEEK_HOLE) + offset = iomap_seek_hole(inode, offset, ops); + else if (whence == SEEK_DATA) + offset = iomap_seek_data(inode, offset, ops); + else + return generic_file_llseek(file, offset, whence); + + if (offset < 0) + return offset; + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + const struct file_operations erofs_file_fops = { - .llseek = generic_file_llseek, + .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, .mmap = erofs_file_mmap, .get_unmapped_area = thp_get_unmapped_area, diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 3af96b1e2c2a..33f8539dda4a 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -9,6 +9,7 @@ struct erofs_fileio_rq { struct bio_vec bvecs[BIO_MAX_VECS]; struct bio bio; struct kiocb iocb; + struct super_block *sb; }; struct erofs_fileio { @@ -52,8 +53,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT; rq->iocb.ki_ioprio = get_current_ioprio(); rq->iocb.ki_complete = erofs_fileio_ki_complete; - rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? 
- IOCB_DIRECT : 0; + if (test_opt(&EROFS_SB(rq->sb)->opt, DIRECT_IO) && + rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) + rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); @@ -67,7 +69,8 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) GFP_KERNEL | __GFP_NOFAIL); bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); - rq->iocb.ki_filp = mdev->m_fp; + rq->iocb.ki_filp = mdev->m_dif->file; + rq->sb = mdev->m_sb; return rq; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fda16eedafb5..ce3d8737df85 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -198,7 +198,7 @@ struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL); bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ); - io->io.private = mdev->m_fscache->cookie; + io->io.private = mdev->m_dif->fscache->cookie; io->io.end_io = erofs_fscache_bio_endio; refcount_set(&io->io.ref, 1); return &io->bio; @@ -316,7 +316,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) if (!io) return -ENOMEM; iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count); - ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie, + ret = erofs_fscache_read_io_async(mdev.m_dif->fscache->cookie, mdev.m_pa + (pos - map.m_la), io); erofs_fscache_req_io_put(io); @@ -657,7 +657,7 @@ int erofs_fscache_register_fs(struct super_block *sb) if (IS_ERR(fscache)) return PTR_ERR(fscache); - sbi->s_fscache = fscache; + sbi->dif0.fscache = fscache; return 0; } @@ -665,14 +665,14 @@ void erofs_fscache_unregister_fs(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - erofs_fscache_unregister_cookie(sbi->s_fscache); + erofs_fscache_unregister_cookie(sbi->dif0.fscache); if (sbi->domain) erofs_fscache_domain_put(sbi->domain); else fscache_relinquish_volume(sbi->volume, NULL, false); - sbi->s_fscache = NULL; + sbi->dif0.fscache = NULL; sbi->volume = NULL; sbi->domain = NULL; } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index db29190656eb..d4b89407822a 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -318,6 +318,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); + struct block_device *bdev = inode->i_sb->s_bdev; bool compressed = erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout); @@ -330,15 +331,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, /* * Return the DIO alignment restrictions if requested. * - * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and - * compressed files, so in these cases we report no DIO support. + * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode + * and uncompressed inodes, otherwise we report no DIO support. 
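The DIO alignment values filled in just below are surfaced to applications through statx(2). A small userspace sketch of querying them follows; it assumes kernel and libc headers recent enough to carry STATX_DIOALIGN, and error handling is trimmed.

/* Sketch: read back the DIO alignment EROFS reports for bdev-backed,
 * uncompressed files. */
#define _GNU_SOURCE
#include <fcntl.h>	/* AT_FDCWD */
#include <sys/stat.h>	/* statx(), struct statx, STATX_DIOALIGN */
#include <stdio.h>

static void show_dio_align(const char *path)
{
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_DIOALIGN, &stx) == 0 &&
	    (stx.stx_mask & STATX_DIOALIGN))
		printf("mem align %u, offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
}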
*/ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { stat->result_mask |= STATX_DIOALIGN; - if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) { - stat->dio_mem_align = - bdev_logical_block_size(inode->i_sb->s_bdev); - stat->dio_offset_align = stat->dio_mem_align; + if (bdev && !compressed) { + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); } } generic_fillattr(idmap, request_mask, inode, stat); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4efd578d7c62..686d835eb533 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -20,18 +20,12 @@ #include <linux/iomap.h> #include "erofs_fs.h" -/* redefine pr_fmt "erofs: " */ -#undef pr_fmt -#define pr_fmt(fmt) "erofs: " fmt - -__printf(3, 4) void _erofs_err(struct super_block *sb, - const char *function, const char *fmt, ...); +__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...); #define erofs_err(sb, fmt, ...) \ - _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) -__printf(3, 4) void _erofs_info(struct super_block *sb, - const char *function, const char *fmt, ...); + _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__) #define erofs_info(sb, fmt, ...) \ - _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) + _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__) + #ifdef CONFIG_EROFS_FS_DEBUG #define DBG_BUGON BUG_ON #else @@ -113,6 +107,7 @@ struct erofs_xattr_prefix_item { }; struct erofs_sb_info { + struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -130,13 +125,9 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ - struct file *fdev; struct inode *packed_inode; struct erofs_dev_context *devs; - struct dax_device *dax_dev; - u64 dax_part_off; u64 total_blocks; - u32 primarydevice_blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -172,7 +163,6 @@ struct erofs_sb_info { /* fscache support */ struct fscache_volume *volume; - struct erofs_fscache *s_fscache; struct erofs_domain *domain; char *fsid; char *domain_id; @@ -186,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_POSIX_ACL 0x00000020 #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 +#define EROFS_MOUNT_DIRECT_IO 0x00000100 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) @@ -193,7 +184,7 @@ struct erofs_sb_info { static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) { - return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev; + return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } static inline bool erofs_is_fscache_mode(struct super_block *sb) @@ -208,12 +199,6 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -/* basic unit of the workstation of a super_block */ -struct erofs_workgroup { - pgoff_t index; - struct lockref lockref; -}; - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ @@ -221,9 +206,9 @@ enum erofs_kmap_type { struct erofs_buf { struct address_space *mapping; + struct file *file; struct page *page; void *base; - enum erofs_kmap_type kmap_type; }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) @@ -369,11 +354,9 @@ enum { }; struct erofs_map_dev { - struct erofs_fscache *m_fscache; + struct super_block *m_sb; + struct 
erofs_device_info *m_dif; struct block_device *m_bdev; - struct dax_device *m_daxdev; - struct file *m_fp; - u64 m_dax_part_off; erofs_off_t m_pa; unsigned int m_deviceid; @@ -456,20 +439,17 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -void erofs_workgroup_put(struct erofs_workgroup *grp); -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index); -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp); -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) + +extern atomic_long_t erofs_global_shrink_cnt; void erofs_shrinker_register(struct super_block *sb); void erofs_shrinker_unregister(struct super_block *sb); int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void z_erofs_exit_subsystem(void); -int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp); +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, + unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); void *z_erofs_get_gbuf(unsigned int requiredpages); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 666873f745da..f5956474bfde 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -18,37 +18,22 @@ static struct kmem_cache *erofs_inode_cachep __read_mostly; -void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) +void _erofs_printk(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; + int level; va_start(args, fmt); - vaf.fmt = fmt; + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - - if (sb) - pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); - else - pr_err("%s: %pV", func, &vaf); - va_end(args); -} - -void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - if (sb) - pr_info("(device %s): %pV", sb->s_id, &vaf); + printk("%c%cerofs (device %s): %pV", + KERN_SOH_ASCII, level, sb->s_id, &vaf); else - pr_info("%pV", &vaf); + printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf); va_end(args); } @@ -191,10 +176,14 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(file)) return PTR_ERR(file); - dif->file = file; - if (!erofs_is_fileio_mode(sbi)) + if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), &dif->dax_part_off, NULL, NULL); + } else if (!S_ISREG(file_inode(file)->i_mode)) { + fput(file); + return -EINVAL; + } + dif->file = file; } dif->blocks = le32_to_cpu(dis->blocks); @@ -214,7 +203,7 @@ static int erofs_scan_devices(struct super_block *sb, struct erofs_device_info *dif; int id, err = 0; - sbi->total_blocks = sbi->primarydevice_blocks; + sbi->total_blocks = sbi->dif0.blocks; if (!erofs_sb_has_device_table(sbi)) ondisk_extradevs = 0; else @@ -318,7 +307,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -375,14 +364,8 @@ static void erofs_default_options(struct erofs_sb_info *sbi) } enum { - Opt_user_xattr, - Opt_acl, - Opt_cache_strategy, - Opt_dax, - Opt_dax_enum, - Opt_device, - Opt_fsid, - Opt_domain_id, + Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_err }; @@ -409,6 +392,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("device", Opt_device), fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), + fsparam_flag_no("directio", Opt_directio), {} }; @@ -522,6 +506,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); break; #endif + case Opt_directio: +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE + if (result.boolean) + set_opt(&sbi->opt, DIRECT_IO); + else + clear_opt(&sbi->opt, DIRECT_IO); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; default: return -ENOPARAM; } @@ -613,9 +607,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -EINVAL; } - sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off, - NULL, NULL); + sbi->dif0.dax_dev = fs_dax_get_by_bdev(sb->s_bdev, + &sbi->dif0.dax_part_off, NULL, NULL); } err = erofs_read_superblock(sb); @@ -627,14 +620,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) errorfc(fc, "unsupported blksize for fscache mode"); return -EINVAL; } - if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + + if (erofs_is_fileio_mode(sbi)) { + sb->s_blocksize = 1 << sbi->blkszbits; + sb->s_blocksize_bits = sbi->blkszbits; + } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { errorfc(fc, "failed to set erofs blksize"); return -EINVAL; } } if (test_opt(&sbi->opt, DAX_ALWAYS)) { - if (!sbi->dax_dev) { + if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); } else if (sbi->blkszbits != PAGE_SHIFT) { @@ -705,16 +702,23 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - ret = get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, + IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? + GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { + struct file *file; + if (!fc->source) return invalf(fc, "No source specified"); - sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); - if (IS_ERR(sbi->fdev)) - return PTR_ERR(sbi->fdev); + file = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + sbi->dif0.file = file; - return get_tree_nodev(fc, erofs_fc_fill_super); + if (S_ISREG(file_inode(sbi->dif0.file)->i_mode) && + sbi->dif0.file->f_mapping->a_ops->read_folio) + return get_tree_nodev(fc, erofs_fc_fill_super); } #endif return ret; @@ -765,19 +769,24 @@ static void erofs_free_dev_context(struct erofs_dev_context *devs) kfree(devs); } -static void erofs_fc_free(struct fs_context *fc) +static void erofs_sb_free(struct erofs_sb_info *sbi) { - struct erofs_sb_info *sbi = fc->s_fs_info; - - if (!sbi) - return; - erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); kfree(sbi->domain_id); + if (sbi->dif0.file) + fput(sbi->dif0.file); kfree(sbi); } +static void erofs_fc_free(struct fs_context *fc) +{ + struct erofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) /* free here if an error occurs before transferring to sb */ + erofs_sb_free(sbi); +} + static const struct fs_context_operations erofs_context_ops = { .parse_param = erofs_fc_parse_param, .get_tree = erofs_fc_get_tree, @@ -811,19 +820,14 @@ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev) + if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || + sbi->dif0.file) kill_anon_super(sb); else kill_block_super(sb); - - erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev, NULL); + fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->fsid); - kfree(sbi->domain_id); - if (sbi->fdev) - fput(sbi->fdev); - kfree(sbi); + erofs_sb_free(sbi); sb->s_fs_info = NULL; } @@ -949,6 +953,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",dax=always"); if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); + if (erofs_is_fileio_mode(sbi) && test_opt(opt, DIRECT_IO)) + seq_puts(seq, ",directio"); #ifdef CONFIG_EROFS_FS_ONDEMAND if (sbi->fsid) seq_printf(seq, ",fsid=%s", sbi->fsid); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 63cffd0fd261..19d586273b70 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -10,6 +10,7 @@ enum { attr_feature, + attr_drop_caches, attr_pointer_ui, attr_pointer_bool, }; @@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = { \ #ifdef CONFIG_EROFS_FS_ZIP EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +EROFS_ATTR_FUNC(drop_caches, 0200); #endif static struct attribute *erofs_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), + ATTR_LIST(drop_caches), #endif NULL, }; @@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, return -EINVAL; *(bool *)ptr = !!t; return len; +#ifdef CONFIG_EROFS_FS_ZIP + case attr_drop_caches: + ret = 
kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t < 1 || t > 3) + return -EINVAL; + + if (t & 2) + z_erofs_shrink_scan(sbi, ~0UL); + if (t & 1) + invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); + return len; +#endif } return 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8936790618c6..254f6ad2c336 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); * A: Field should be accessed / updated in atomic for parallelized code. */ struct z_erofs_pcluster { - struct erofs_workgroup obj; struct mutex lock; + struct lockref lockref; /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; + /* I: start block address of this pcluster */ + erofs_off_t index; + /* L: the maximum decompression size of this round */ unsigned int length; @@ -108,7 +111,7 @@ struct z_erofs_decompressqueue { static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) { - return !pcl->obj.index; + return !pcl->index; } static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) @@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) { return fo->mapping == MNGD_MAPPING(sbi); @@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, pcl->obj.index + i); + page = find_get_page(mc, pcl->index + i); if (!page) { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); } - spin_lock(&pcl->obj.lockref.lock); + spin_lock(&pcl->lockref.lock); if (!pcl->compressed_bvecs[i].page) { pcl->compressed_bvecs[i].page = page ? 
page : newpage; - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); continue; } - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); if (page) put_page(page); @@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) } /* (erofs_shrinker) disconnect cached encoded data with pclusters */ -int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) +static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); unsigned int pclusterpages = z_erofs_pclusterpages(pcl); struct folio *folio; int i; @@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) return true; ret = false; - spin_lock(&pcl->obj.lockref.lock); - if (pcl->obj.lockref.count <= 0) { + spin_lock(&pcl->lockref.lock); + if (pcl->lockref.count <= 0) { DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) == folio) { @@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) } } } - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); return ret; } @@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, if (exclusive) { /* give priority for inplaceio to use file pages first */ - spin_lock(&pcl->obj.lockref.lock); + spin_lock(&pcl->lockref.lock); while (fe->icur > 0) { if (pcl->compressed_bvecs[--fe->icur].page) continue; pcl->compressed_bvecs[fe->icur] = *bvec; - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); return 0; } - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && @@ -710,31 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, return ret; } -static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) +static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *pcl = f->pcl; - z_erofs_next_pcluster_t *owned_head = &f->owned_head; - - /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) == Z_EROFS_PCLUSTER_NIL) { - *owned_head = &pcl->next; - /* so we can attach this pcluster to our submission chain. 
*/ - f->mode = Z_EROFS_PCLUSTER_FOLLOWED; - return; + if (lockref_get_not_zero(&pcl->lockref)) + return true; + + spin_lock(&pcl->lockref.lock); + if (__lockref_is_dead(&pcl->lockref)) { + spin_unlock(&pcl->lockref.lock); + return false; } - /* type 2, it belongs to an ongoing chain */ - f->mode = Z_EROFS_PCLUSTER_INFLIGHT; + if (!pcl->lockref.count++) + atomic_long_dec(&erofs_global_shrink_cnt); + spin_unlock(&pcl->lockref.lock); + return true; } static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); bool ztailpacking = map->m_flags & EROFS_MAP_META; - struct z_erofs_pcluster *pcl; - struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl, *pre; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || @@ -748,8 +747,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - spin_lock_init(&pcl->obj.lockref.lock); - pcl->obj.lockref.count = 1; /* one ref for this request */ + lockref_init(&pcl->lockref, 1); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; @@ -767,19 +765,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { - pcl->obj.index = 0; /* which indicates ztailpacking */ + pcl->index = 0; /* which indicates ztailpacking */ } else { - pcl->obj.index = erofs_blknr(sb, map->m_pa); - - grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); - if (IS_ERR(grp)) { - err = PTR_ERR(grp); - goto err_out; + pcl->index = erofs_blknr(sb, map->m_pa); + while (1) { + xa_lock(&sbi->managed_pslots); + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + NULL, pcl, GFP_KERNEL); + if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { + xa_unlock(&sbi->managed_pslots); + break; + } + /* try to legitimize the current in-tree one */ + xa_unlock(&sbi->managed_pslots); + cond_resched(); } - - if (grp != &pcl->obj) { - fe->pcl = container_of(grp, - struct z_erofs_pcluster, obj); + if (xa_is_err(pre)) { + err = xa_err(pre); + goto err_out; + } else if (pre) { + fe->pcl = pre; err = -EEXIST; goto err_out; } @@ -799,23 +804,31 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); - struct erofs_workgroup *grp = NULL; + struct z_erofs_pcluster *pcl = NULL; int ret; DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(sb, blknr); + while (1) { + rcu_read_lock(); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + if (!pcl || z_erofs_get_pcluster(pcl)) { + DBG_BUGON(pcl && blknr != pcl->index); + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + } } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; } - if (grp) { - fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (pcl) { + fe->pcl = pcl; ret = -EEXIST; } else { ret = z_erofs_register_pcluster(fe); @@ -823,7 +836,15 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - z_erofs_try_to_claim_pcluster(fe); + /* check if this pcluster hasn't 
been linked into any chain. */ + if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL, + fe->owned_head) == Z_EROFS_PCLUSTER_NIL) { + /* .. so it can be attached to our submission chain */ + fe->owned_head = &fe->pcl->next; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; + } else { /* otherwise, it belongs to an inflight chain */ + fe->mode = Z_EROFS_PCLUSTER_INFLIGHT; + } } else if (ret) { return ret; } @@ -862,12 +883,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head) struct z_erofs_pcluster, rcu)); } -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) { - struct z_erofs_pcluster *const pcl = - container_of(grp, struct z_erofs_pcluster, obj); + if (pcl->lockref.count) + return false; + + /* + * Note that all cached folios should be detached before deleted from + * the XArray. Otherwise some folios could be still attached to the + * orphan old pcluster when the new one is available in the tree. + */ + if (erofs_try_to_free_all_cached_folios(sbi, pcl)) + return false; + + /* + * It's impossible to fail after the pcluster is freezed, but in order + * to avoid some race conditions, add a DBG_BUGON to observe this. + */ + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); - call_rcu(&pcl->rcu, z_erofs_rcu_callback); + lockref_mark_dead(&pcl->lockref); + return true; +} + +static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl) +{ + bool free; + + spin_lock(&pcl->lockref.lock); + free = __erofs_try_to_release_pcluster(sbi, pcl); + spin_unlock(&pcl->lockref.lock); + if (free) { + atomic_long_dec(&erofs_global_shrink_cnt); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); + } + return free; +} + +unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, + unsigned long nr_shrink) +{ + struct z_erofs_pcluster *pcl; + unsigned int freed = 0; + unsigned long index; + + xa_lock(&sbi->managed_pslots); + xa_for_each(&sbi->managed_pslots, index, pcl) { + /* try to shrink each valid pcluster */ + if (!erofs_try_to_release_pcluster(sbi, pcl)) + continue; + xa_unlock(&sbi->managed_pslots); + + ++freed; + if (!--nr_shrink) + return freed; + xa_lock(&sbi->managed_pslots); + } + xa_unlock(&sbi->managed_pslots); + return freed; +} + +static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, + struct z_erofs_pcluster *pcl, bool try_free) +{ + bool free = false; + + if (lockref_put_or_lock(&pcl->lockref)) + return; + + DBG_BUGON(__lockref_is_dead(&pcl->lockref)); + if (!--pcl->lockref.count) { + if (try_free && xa_trylock(&sbi->managed_pslots)) { + free = __erofs_try_to_release_pcluster(sbi, pcl); + xa_unlock(&sbi->managed_pslots); + } + atomic_long_add(!free, &erofs_global_shrink_cnt); + } + spin_unlock(&pcl->lockref.lock); + if (free) + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) @@ -888,7 +984,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) * any longer if the pcluster isn't hosted by ourselves. 
*/ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) - erofs_workgroup_put(&pcl->obj); + z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); fe->pcl = NULL; } @@ -1190,6 +1286,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, int i, j, jtop, err2; struct page *page; bool overlapped; + bool try_free = true; mutex_lock(&pcl->lock); be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; @@ -1247,9 +1344,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, /* managed folios are still left in compressed_bvecs[] */ for (i = 0; i < pclusterpages; ++i) { page = be->compressed_pages[i]; - if (!page || - erofs_folio_is_managed(sbi, page_folio(page))) + if (!page) + continue; + if (erofs_folio_is_managed(sbi, page_folio(page))) { + try_free = false; continue; + } (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } @@ -1295,6 +1395,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); mutex_unlock(&pcl->lock); + + if (z_erofs_is_inline_pcluster(pcl)) + z_erofs_free_pcluster(pcl); + else + z_erofs_put_pcluster(sbi, pcl, try_free); return err; } @@ -1317,10 +1422,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(be.pcl->next); err = z_erofs_decompress_pcluster(&be, err) ?: err; - if (z_erofs_is_inline_pcluster(be.pcl)) - z_erofs_free_pcluster(be.pcl); - else - erofs_workgroup_put(&be.pcl->obj); } return err; } @@ -1402,9 +1503,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, bvec->bv_offset = 0; bvec->bv_len = PAGE_SIZE; repeat: - spin_lock(&pcl->obj.lockref.lock); + spin_lock(&pcl->lockref.lock); zbv = pcl->compressed_bvecs[nr]; - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); if (!zbv.page) goto out_allocfolio; @@ -1466,23 +1567,23 @@ repeat: folio_put(folio); out_allocfolio: page = __erofs_allocpage(&f->pagepool, gfp, true); - spin_lock(&pcl->obj.lockref.lock); + spin_lock(&pcl->lockref.lock); if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { if (page) erofs_pagepool_add(&f->pagepool, page); - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); cond_resched(); goto repeat; } pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); - spin_unlock(&pcl->obj.lockref.lock); + spin_unlock(&pcl->lockref.lock); bvec->bv_page = page; if (!page) return; folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { + filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; @@ -1614,7 +1715,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = erofs_pos(sb, pcl->obj.index), + .m_pa = erofs_pos(sb, pcl->index), }; (void)erofs_map_dev(sb, &mdev); @@ -1690,9 +1791,9 @@ drain_io: erofs_fscache_submit_bio(bio); else submit_bio(bio); - if (memstall) - psi_memstall_leave(&pflags); } + if (memstall) + psi_memstall_leave(&pflags); /* * although background is preferred, no one is pending for submission. 
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1253a8456e59..4535f2f0a014 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -10,8 +10,6 @@ struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; - void *kaddr; - unsigned long lcn; /* compression extent information gathered */ u8 type, headtype; @@ -33,14 +31,11 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, struct z_erofs_lcluster_index *di; unsigned int advise; - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); - - m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(di)) + return PTR_ERR(di); m->lcn = lcn; - di = m->kaddr; + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; @@ -53,8 +48,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = m->delta[0] & - ~Z_EROFS_LI_D0_CBLKCNT; + m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); @@ -110,9 +104,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; - int i; - u8 *in, type; bool big_pcluster; + u8 *in, type; + int i; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; @@ -121,6 +115,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + if (IS_ERR(in)) + return PTR_ERR(in); + /* it doesn't equal to round_up(..) 
*/ m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); @@ -128,9 +126,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; bytes = pos & ((vcnt << amortizedshift) - 1); - - in = m->kaddr - bytes; - + in -= bytes; i = bytes >> amortizedshift; lo = decode_compactedbits(lobits, in, encodebits * i, &type); @@ -223,7 +219,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, unsigned int amortizedshift; erofs_off_t pos; - if (lcn >= totalidx) + if (lcn >= totalidx || vi->z_logical_clusterbits > 14) return -EINVAL; m->lcn = lcn; @@ -255,10 +251,6 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, amortizedshift = 2; out: pos += lcn * (1 << amortizedshift); - m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - pos, EROFS_KMAP); - if (IS_ERR(m->kaddr)) - return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); } @@ -398,7 +390,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; - do { + while (1) { /* handle the last EOF pcluster (no next HEAD lcluster) */ if ((lcn << lclusterbits) >= inode->i_size) { map->m_llen = inode->i_size - map->m_la; @@ -410,14 +402,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return err; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - DBG_BUGON(!m->delta[1] && - m->clusterofs != 1 << lclusterbits); + /* work around invalid d1 generated by pre-1.0 mkfs */ + if (unlikely(!m->delta[1])) { + m->delta[1] = 1; + DBG_BUGON(1); + } } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { - /* go on until the next HEAD lcluster */ if (lcn != headlcn) - break; + break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; } else { erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", @@ -426,8 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return -EOPNOTSUPP; } lcn += m->delta[1]; - } while (m->delta[1]); - + } map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; return 0; } diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 37afe2024840..0dd65cefce33 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2024 Alibaba Cloud */ #include "internal.h" @@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages, module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444); module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444); -static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ -/* protected by 'erofs_sb_list_lock' */ -static unsigned int shrinker_run_no; +atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ -/* protects the mounted 'erofs_sb_list' */ +/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */ static DEFINE_SPINLOCK(erofs_sb_list_lock); static LIST_HEAD(erofs_sb_list); +static unsigned int shrinker_run_no; static struct shrinker *erofs_shrinker_info; static unsigned int z_erofs_gbuf_id(void) @@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool) } } -static bool erofs_workgroup_get(struct erofs_workgroup *grp) -{ - if (lockref_get_not_zero(&grp->lockref)) - return true; - - spin_lock(&grp->lockref.lock); - if (__lockref_is_dead(&grp->lockref)) { - spin_unlock(&grp->lockref.lock); - return false; - } - - if (!grp->lockref.count++) - atomic_long_dec(&erofs_global_shrink_cnt); - spin_unlock(&grp->lockref.lock); - return true; -} - -struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, - pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_workgroup *grp; - -repeat: - rcu_read_lock(); - grp = xa_load(&sbi->managed_pslots, index); - if (grp) { - if (!erofs_workgroup_get(grp)) { - /* prefer to relax rcu read side */ - rcu_read_unlock(); - goto repeat; - } - - DBG_BUGON(index != grp->index); - } - rcu_read_unlock(); - return grp; -} - -struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, - struct erofs_workgroup *grp) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct erofs_workgroup *pre; - - DBG_BUGON(grp->lockref.count < 1); -repeat: - xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, - NULL, grp, GFP_KERNEL); - if (pre) { - if (xa_is_err(pre)) { - pre = ERR_PTR(xa_err(pre)); - } else if (!erofs_workgroup_get(pre)) { - /* try to legitimize the current in-tree one */ - xa_unlock(&sbi->managed_pslots); - cond_resched(); - goto repeat; - } - grp = pre; - } - xa_unlock(&sbi->managed_pslots); - return grp; -} - -static void __erofs_workgroup_free(struct erofs_workgroup *grp) -{ - atomic_long_dec(&erofs_global_shrink_cnt); - erofs_workgroup_free_rcu(grp); -} - -void erofs_workgroup_put(struct erofs_workgroup *grp) -{ - if (lockref_put_or_lock(&grp->lockref)) - return; - - DBG_BUGON(__lockref_is_dead(&grp->lockref)); - if (grp->lockref.count == 1) - atomic_long_inc(&erofs_global_shrink_cnt); - --grp->lockref.count; - spin_unlock(&grp->lockref.lock); -} - -static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp) -{ - int free = false; - - spin_lock(&grp->lockref.lock); - if (grp->lockref.count) - goto out; - - /* - * Note that all cached pages should be detached before deleted from - * the XArray. Otherwise some cached pages could be still attached to - * the orphan old workgroup when the new one is available in the tree. - */ - if (erofs_try_to_free_all_cached_folios(sbi, grp)) - goto out; - - /* - * It's impossible to fail after the workgroup is freezed, - * however in order to avoid some race conditions, add a - * DBG_BUGON to observe this in advance. 
- */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - - lockref_mark_dead(&grp->lockref); - free = true; -out: - spin_unlock(&grp->lockref.lock); - if (free) - __erofs_workgroup_free(grp); - return free; -} - -static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink) -{ - struct erofs_workgroup *grp; - unsigned int freed = 0; - unsigned long index; - - xa_lock(&sbi->managed_pslots); - xa_for_each(&sbi->managed_pslots, index, grp) { - /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp)) - continue; - xa_unlock(&sbi->managed_pslots); - - ++freed; - if (!--nr_shrink) - return freed; - xa_lock(&sbi->managed_pslots); - } - xa_unlock(&sbi->managed_pslots); - return freed; -} - void erofs_shrinker_register(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -369,9 +230,10 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - /* clean up all remaining workgroups in memory */ - erofs_shrink_workstation(sbi, ~0UL); - + while (!xa_empty(&sbi->managed_pslots)) { + z_erofs_shrink_scan(sbi, ~0UL); + cond_resched(); + } spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); @@ -418,9 +280,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; - - freed += erofs_shrink_workstation(sbi, nr - freed); - + freed += z_erofs_shrink_scan(sbi, nr - freed); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ p = p->next; diff --git a/fs/eventfd.c b/fs/eventfd.c index 22c934f3a080..76129bfcd663 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct eventfd_ctx *ctx; - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - ctx = eventfd_ctx_fileget(fd_file(f)); - fdput(f); - return ctx; + return eventfd_ctx_fileget(fd_file(f)); } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..f9898e60dd8b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -420,7 +420,9 @@ static bool busy_loop_ep_timeout(unsigned long start_time, static bool ep_busy_loop_on(struct eventpoll *ep) { - return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on(); + return !!READ_ONCE(ep->busy_poll_usecs) || + READ_ONCE(ep->prefer_busy_poll) || + net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) @@ -455,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. 
*/ + if (prefer_busy_poll) + napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } @@ -538,6 +542,22 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, } } +static void ep_suspend_napi_irqs(struct eventpoll *ep) +{ + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + napi_suspend_irqs(napi_id); +} + +static void ep_resume_napi_irqs(struct eventpoll *ep) +{ + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + napi_resume_irqs(napi_id); +} + #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) @@ -555,6 +575,14 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, return -EOPNOTSUPP; } +static void ep_suspend_napi_irqs(struct eventpoll *ep) +{ +} + +static void ep_resume_napi_irqs(struct eventpoll *ep) +{ +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ /* @@ -786,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep) static void ep_free(struct eventpoll *ep) { + ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); @@ -823,7 +852,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1002,7 +1032,7 @@ static struct file *epi_fget(const struct epitem *epi) struct file *file; file = epi->ffd.file; - if (!atomic_long_inc_not_zero(&file->f_count)) + if (!file_ref_get(&file->f_ref)) file = NULL; return file; } @@ -1372,7 +1402,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1603,7 +1636,8 @@ allocate: spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); @@ -2003,8 +2037,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * trying again in search of more luck. 
*/ res = ep_send_events(ep, events, maxevents); - if (res) + if (res) { + if (res > 0) + ep_suspend_napi_irqs(ep); return res; + } } if (timed_out) @@ -2254,25 +2291,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, { int error; int full_check = 0; - struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; - error = -EBADF; - f = fdget(epfd); - if (!fd_file(f)) - goto error_return; + CLASS(fd, f)(epfd); + if (fd_empty(f)) + return -EBADF; /* Get the "struct file *" for the target file */ - tf = fdget(fd); - if (!fd_file(tf)) - goto error_fput; + CLASS(fd, tf)(fd); + if (fd_empty(tf)) + return -EBADF; /* The target file descriptor must support poll */ - error = -EPERM; if (!file_can_poll(fd_file(tf))) - goto error_tgt_fput; + return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) @@ -2391,12 +2425,6 @@ error_tgt_fput: loop_check_gen++; mutex_unlock(&epnested_mutex); } - - fdput(tf); -error_fput: - fdput(f); -error_return: - return error; } @@ -2424,8 +2452,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { - int error; - struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -2437,17 +2463,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - f = fdget(epfd); - if (!fd_file(f)) + CLASS(fd, f)(epfd); + if (fd_empty(f)) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ - error = -EINVAL; if (!is_file_epoll(fd_file(f))) - goto error_fput; + return -EINVAL; /* * At this point it is safe to assume that the "private_data" contains @@ -2456,11 +2481,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, ep = fd_file(f)->private_data; /* Time to fish for events ... */ - error = ep_poll(ep, events, maxevents, to); - -error_fput: - fdput(f); - return error; + return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, diff --git a/fs/exec.c b/fs/exec.c index 1843366be6ff..2f0acef8908e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -883,7 +883,8 @@ EXPORT_SYMBOL(transfer_args_to_stack); */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { - struct file *file; + int err; + struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, @@ -908,12 +909,14 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || - path_noexec(&file->f_path)) { - fput(file); + path_noexec(&file->f_path)) return ERR_PTR(-EACCES); - } - return file; + err = deny_write_access(file); + if (err) + return ERR_PTR(err); + + return no_free_ptr(file); } /** @@ -923,7 +926,8 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) * * Returns ERR_PTR on failure or allocated struct file on success. * - * As this is a wrapper for the internal do_open_execat(). Also see + * As this is a wrapper for the internal do_open_execat(), callers + * must call allow_write_access() before fput() on release. Also see * do_close_execat(). 
*/ struct file *open_exec(const char *name) @@ -990,7 +994,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; - mm_init_cid(mm); + mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1189,16 +1193,6 @@ static int unshare_sighand(struct task_struct *me) return 0; } -char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) -{ - task_lock(tsk); - /* Always NUL terminated and zero-padded */ - strscpy_pad(buf, tsk->comm, buf_size); - task_unlock(tsk); - return buf; -} -EXPORT_SYMBOL_GPL(__get_task_comm); - /* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads. @@ -1496,8 +1490,10 @@ static int prepare_bprm_creds(struct linux_binprm *bprm) /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { - if (file) - fput(file); + if (!file) + return; + allow_write_access(file); + fput(file); } static void free_bprm(struct linux_binprm *bprm) @@ -1810,6 +1806,7 @@ static int exec_binprm(struct linux_binprm *bprm) bprm->file = bprm->interpreter; bprm->interpreter = NULL; + allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 7446bf09a04a..3103b932b674 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -82,11 +82,8 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent if (ei->type != TYPE_DIR) return -EPERM; - if (ei->entry == -1) - exfat_chain_set(&dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); - else - exfat_chain_set(&dir, ei->start_clu, - EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + exfat_chain_set(&dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); dentries_per_clu = sbi->dentries_per_clu; max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, @@ -125,7 +122,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); - break; + goto out; } if (type != TYPE_FILE && type != TYPE_DIR) { @@ -135,21 +132,6 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent num_ext = ep->dentry.file.num_ext; dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); - exfat_get_entry_time(sbi, &dir_entry->crtime, - ep->dentry.file.create_tz, - ep->dentry.file.create_time, - ep->dentry.file.create_date, - ep->dentry.file.create_time_cs); - exfat_get_entry_time(sbi, &dir_entry->mtime, - ep->dentry.file.modify_tz, - ep->dentry.file.modify_time, - ep->dentry.file.modify_date, - ep->dentry.file.modify_time_cs); - exfat_get_entry_time(sbi, &dir_entry->atime, - ep->dentry.file.access_tz, - ep->dentry.file.access_time, - ep->dentry.file.access_date, - 0); *uni_name.name = 0x0; err = exfat_get_uniname_from_ext_entry(sb, &clu, i, @@ -166,9 +148,8 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent ep = exfat_get_dentry(sb, &clu, i + 1, &bh); if (!ep) return -EIO; - dir_entry->size = - le64_to_cpu(ep->dentry.stream.valid_size); - dir_entry->entry = dentry; + dir_entry->entry = i; + dir_entry->dir = clu; brelse(bh); ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi); @@ -189,6 +170,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } +out: dir_entry->namebuf.lfn[0] = '\0'; *cpos = EXFAT_DEN_TO_B(dentry); return 0; 
@@ -276,7 +258,7 @@ get_new: if (!nb->lfn[0]) goto end_of_dir; - i_pos = ((loff_t)ei->start_clu << 32) | (de.entry & 0xffffffff); + i_pos = ((loff_t)de.dir.dir << 32) | (de.entry & 0xffffffff); tmp = exfat_iget(sb, i_pos); if (tmp) { inum = tmp->i_ino; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 3cdc1de362a9..78be6964a8a0 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -204,7 +204,9 @@ struct exfat_entry_set_cache { #define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh) struct exfat_dir_entry { + /* the cluster where file dentry is located */ struct exfat_chain dir; + /* the index of file dentry in ->dir */ int entry; unsigned int type; unsigned int start_clu; @@ -290,7 +292,9 @@ struct exfat_sb_info { * EXFAT file system inode in-memory data */ struct exfat_inode_info { + /* the cluster where file dentry is located */ struct exfat_chain dir; + /* the index of file dentry in ->dir */ int entry; unsigned int type; unsigned short attr; @@ -508,6 +512,8 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries); +#define exfat_get_dentry_set_by_ei(es, sb, ei) \ + exfat_get_dentry_set(es, sb, &(ei)->dir, (ei)->entry, ES_ALL_ENTRIES) int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 773c320d68f3..9e5492ac409b 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -216,6 +216,16 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain if (err) goto dec_used_clus; + + if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { + /* + * The cluster chain includes a loop, scan the + * bitmap to get the number of used clusters. 
+ */ + exfat_count_used_clusters(sb, &sbi->used_clusters); + + return 0; + } } while (clu != EXFAT_EOF_CLUSTER); } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index a25d7eb789f4..05b51e721783 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -545,6 +545,7 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) while (pos < new_valid_size) { u32 len; struct folio *folio; + unsigned long off; len = PAGE_SIZE - (pos & (PAGE_SIZE - 1)); if (pos + len > new_valid_size) @@ -554,6 +555,9 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (err) goto out; + off = offset_in_folio(folio, pos); + folio_zero_new_buffers(folio, off, off + len); + err = ops->write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; @@ -563,6 +567,8 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) cond_resched(); } + return 0; + out: return err; } @@ -584,6 +590,16 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret < 0) goto unlock; + if (iocb->ki_flags & IOCB_DIRECT) { + unsigned long align = pos | iov_iter_alignment(iter); + + if (!IS_ALIGNED(align, i_blocksize(inode)) && + !IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) { + ret = -EINVAL; + goto unlock; + } + } + if (pos > valid_size) { ret = exfat_extend_valid_size(file, pos); if (ret < 0 && ret != -ENOSPC) { diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index d724de8f57bf..96952d4acb50 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -43,7 +43,7 @@ int __exfat_write_inode(struct inode *inode, int sync) exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ - if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES)) + if (exfat_get_dentry_set_by_ei(&es, sb, ei)) return -EIO; ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 2c4c44229352..099f80645072 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -288,8 +288,22 @@ static int exfat_check_max_dentries(struct inode *inode) return 0; } -/* find empty directory entry. - * if there isn't any empty slot, expand cluster chain. +/* + * Find an empty directory entry set. + * + * If there isn't any empty slot, expand cluster chain. 
+ * + * in: + * inode: inode of the parent directory + * num_entries: specifies how many dentries in the empty directory entry set + * + * out: + * p_dir: the cluster where the empty directory entry set is located + * es: The found empty directory entry set + * + * return: + * the directory entry index in p_dir is returned on succeeds + * -error code is returned on failure */ static int exfat_find_empty_entry(struct inode *inode, struct exfat_chain *p_dir, int num_entries, @@ -311,10 +325,13 @@ static int exfat_find_empty_entry(struct inode *inode, ei->hint_femp.eidx = EXFAT_HINT_NONE; } + exfat_chain_set(p_dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, num_entries, es)) < 0) { - if (dentry == -EIO) - break; + if (dentry != -ENOSPC) + return dentry; if (exfat_check_max_dentries(inode)) return -ENOSPC; @@ -345,6 +362,7 @@ static int exfat_find_empty_entry(struct inode *inode, if (ei->start_clu == EXFAT_EOF_CLUSTER) { ei->start_clu = clu.dir; p_dir->dir = clu.dir; + hint_femp.eidx = 0; } /* append to the FAT chain */ @@ -377,7 +395,10 @@ static int exfat_find_empty_entry(struct inode *inode, inode->i_blocks += sbi->cluster_size >> 9; } - return dentry; + p_dir->dir = exfat_sector_to_cluster(sbi, es->bh[0]->b_blocknr); + p_dir->size -= dentry / sbi->dentries_per_clu; + + return dentry & (sbi->dentries_per_clu - 1); } /* @@ -385,14 +406,11 @@ static int exfat_find_empty_entry(struct inode *inode, * Zero if it was successful; otherwise nonzero. */ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, - struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, - int lookup) + struct exfat_uni_name *p_uniname, int lookup) { int namelen; int lossy = NLS_NAME_NO_LOSSY; struct super_block *sb = inode->i_sb; - struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_inode_info *ei = EXFAT_I(inode); int pathlen = strlen(path); /* @@ -431,24 +449,19 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, if ((lossy && !lookup) || !namelen) return (lossy & NLS_NAME_OVERLEN) ? 
-ENAMETOOLONG : -EINVAL; - exfat_chain_set(p_dir, ei->start_clu, - EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); - return 0; } static inline int exfat_resolve_path(struct inode *inode, - const unsigned char *path, struct exfat_chain *dir, - struct exfat_uni_name *uni) + const unsigned char *path, struct exfat_uni_name *uni) { - return __exfat_resolve_path(inode, path, dir, uni, 0); + return __exfat_resolve_path(inode, path, uni, 0); } static inline int exfat_resolve_path_for_lookup(struct inode *inode, - const unsigned char *path, struct exfat_chain *dir, - struct exfat_uni_name *uni) + const unsigned char *path, struct exfat_uni_name *uni) { - return __exfat_resolve_path(inode, path, dir, uni, 1); + return __exfat_resolve_path(inode, path, uni, 1); } static inline loff_t exfat_make_i_pos(struct exfat_dir_entry *info) @@ -457,8 +470,7 @@ static inline loff_t exfat_make_i_pos(struct exfat_dir_entry *info) } static int exfat_add_entry(struct inode *inode, const char *path, - struct exfat_chain *p_dir, unsigned int type, - struct exfat_dir_entry *info) + unsigned int type, struct exfat_dir_entry *info) { int ret, dentry, num_entries; struct super_block *sb = inode->i_sb; @@ -470,7 +482,7 @@ static int exfat_add_entry(struct inode *inode, const char *path, int clu_size = 0; unsigned int start_clu = EXFAT_FREE_CLUSTER; - ret = exfat_resolve_path(inode, path, p_dir, &uniname); + ret = exfat_resolve_path(inode, path, &uniname); if (ret) goto out; @@ -481,7 +493,7 @@ static int exfat_add_entry(struct inode *inode, const char *path, } /* exfat_find_empty_entry must be called before alloc_cluster() */ - dentry = exfat_find_empty_entry(inode, p_dir, num_entries, &es); + dentry = exfat_find_empty_entry(inode, &info->dir, num_entries, &es); if (dentry < 0) { ret = dentry; /* -EIO or -ENOSPC */ goto out; @@ -508,7 +520,6 @@ static int exfat_add_entry(struct inode *inode, const char *path, if (ret) goto out; - info->dir = *p_dir; info->entry = dentry; info->flags = ALLOC_NO_FAT_CHAIN; info->type = type; @@ -541,7 +552,6 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, { struct super_block *sb = dir->i_sb; struct inode *inode; - struct exfat_chain cdir; struct exfat_dir_entry info; loff_t i_pos; int err; @@ -552,8 +562,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&EXFAT_SB(sb)->s_lock); exfat_set_volume_dirty(sb); - err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE, - &info); + err = exfat_add_entry(dir, dentry->d_name.name, TYPE_FILE, &info); if (err) goto unlock; @@ -601,10 +610,13 @@ static int exfat_find(struct inode *dir, struct qstr *qname, return -ENOENT; /* check the validity of directory name in the given pathname */ - ret = exfat_resolve_path_for_lookup(dir, qname->name, &cdir, &uni_name); + ret = exfat_resolve_path_for_lookup(dir, qname->name, &uni_name); if (ret) return ret; + exfat_chain_set(&cdir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(dir), sbi), ei->flags); + /* check the validation of hint_stat and initialize it if required */ if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { ei->hint_stat.clu = cdir.dir; @@ -618,15 +630,16 @@ static int exfat_find(struct inode *dir, struct qstr *qname, if (dentry < 0) return dentry; /* -error value */ - info->dir = cdir; - info->entry = dentry; - info->num_subdirs = 0; - /* adjust cdir to the optimized value */ cdir.dir = hint_opt.clu; if (cdir.flags & ALLOC_NO_FAT_CHAIN) cdir.size -= dentry / sbi->dentries_per_clu; dentry = hint_opt.eidx; + + 
info->dir = cdir; + info->entry = dentry; + info->num_subdirs = 0; + if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES)) return -EIO; ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); @@ -637,14 +650,26 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->size = le64_to_cpu(ep2->dentry.stream.valid_size); info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); + + info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu); + if (!is_valid_cluster(sbi, info->start_clu) && info->size) { + exfat_warn(sb, "start_clu is invalid cluster(0x%x)", + info->start_clu); + info->size = 0; + info->valid_size = 0; + } + + if (info->valid_size > info->size) { + exfat_warn(sb, "valid_size(%lld) is greater than size(%lld)", + info->valid_size, info->size); + info->valid_size = info->size; + } + if (info->size == 0) { info->flags = ALLOC_NO_FAT_CHAIN; info->start_clu = EXFAT_EOF_CLUSTER; - } else { + } else info->flags = ep2->dentry.stream.flags; - info->start_clu = - le32_to_cpu(ep2->dentry.stream.start_clu); - } exfat_get_entry_time(sbi, &info->crtime, ep->dentry.file.create_tz, @@ -766,26 +791,23 @@ unlock: /* remove an entry, BUT don't truncate */ static int exfat_unlink(struct inode *dir, struct dentry *dentry) { - struct exfat_chain cdir; struct super_block *sb = dir->i_sb; struct inode *inode = dentry->d_inode; struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_entry_set_cache es; - int entry, err = 0; + int err = 0; if (unlikely(exfat_forced_shutdown(sb))) return -EIO; mutex_lock(&EXFAT_SB(sb)->s_lock); - exfat_chain_dup(&cdir, &ei->dir); - entry = ei->entry; if (ei->dir.dir == DIR_DELETED) { exfat_err(sb, "abnormal access to deleted dentry"); err = -ENOENT; goto unlock; } - err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES); + err = exfat_get_dentry_set_by_ei(&es, sb, ei); if (err) { err = -EIO; goto unlock; @@ -824,7 +846,6 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct super_block *sb = dir->i_sb; struct inode *inode; struct exfat_dir_entry info; - struct exfat_chain cdir; loff_t i_pos; int err; loff_t size = i_size_read(dir); @@ -834,8 +855,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&EXFAT_SB(sb)->s_lock); exfat_set_volume_dirty(sb); - err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR, - &info); + err = exfat_add_entry(dir, dentry->d_name.name, TYPE_DIR, &info); if (err) goto unlock; @@ -915,21 +935,18 @@ static int exfat_check_dir_empty(struct super_block *sb, static int exfat_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - struct exfat_chain cdir, clu_to_free; + struct exfat_chain clu_to_free; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_entry_set_cache es; - int entry, err; + int err; if (unlikely(exfat_forced_shutdown(sb))) return -EIO; mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); - exfat_chain_dup(&cdir, &ei->dir); - entry = ei->entry; - if (ei->dir.dir == DIR_DELETED) { exfat_err(sb, "abnormal access to deleted dentry"); err = -ENOENT; @@ -947,7 +964,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } - err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES); + err = exfat_get_dentry_set_by_ei(&es, sb, ei); if (err) { err = -EIO; goto unlock; @@ -982,15 +999,14 @@ unlock: return err; } -static int exfat_rename_file(struct 
inode *inode, struct exfat_chain *p_dir, - int oldentry, struct exfat_uni_name *p_uniname, - struct exfat_inode_info *ei) +static int exfat_rename_file(struct inode *parent_inode, + struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei) { int ret, num_new_entries; struct exfat_dentry *epold, *epnew; - struct super_block *sb = inode->i_sb; + struct super_block *sb = parent_inode->i_sb; struct exfat_entry_set_cache old_es, new_es; - int sync = IS_DIRSYNC(inode); + int sync = IS_DIRSYNC(parent_inode); if (unlikely(exfat_forced_shutdown(sb))) return -EIO; @@ -999,7 +1015,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (num_new_entries < 0) return num_new_entries; - ret = exfat_get_dentry_set(&old_es, sb, p_dir, oldentry, ES_ALL_ENTRIES); + ret = exfat_get_dentry_set_by_ei(&old_es, sb, ei); if (ret) { ret = -EIO; return ret; @@ -1009,9 +1025,10 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (old_es.num_entries < num_new_entries) { int newentry; + struct exfat_chain dir; - newentry = exfat_find_empty_entry(inode, p_dir, num_new_entries, - &new_es); + newentry = exfat_find_empty_entry(parent_inode, &dir, + num_new_entries, &new_es); if (newentry < 0) { ret = newentry; /* -EIO or -ENOSPC */ goto put_old_es; @@ -1034,8 +1051,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, if (ret) goto put_old_es; - exfat_remove_entries(inode, &old_es, ES_IDX_FILE); - ei->dir = *p_dir; + exfat_remove_entries(parent_inode, &old_es, ES_IDX_FILE); + ei->dir = dir; ei->entry = newentry; } else { if (exfat_get_entry_type(epold) == TYPE_FILE) { @@ -1043,7 +1060,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, ei->attr |= EXFAT_ATTR_ARCHIVE; } - exfat_remove_entries(inode, &old_es, ES_IDX_FIRST_FILENAME + 1); + exfat_remove_entries(parent_inode, &old_es, ES_IDX_FIRST_FILENAME + 1); exfat_init_ext_entry(&old_es, num_new_entries, p_uniname); } return exfat_put_dentry_set(&old_es, sync); @@ -1053,26 +1070,24 @@ put_old_es: return ret; } -static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, - int oldentry, struct exfat_chain *p_newdir, +static int exfat_move_file(struct inode *parent_inode, struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei) { int ret, newentry, num_new_entries; struct exfat_dentry *epmov, *epnew; - struct super_block *sb = inode->i_sb; struct exfat_entry_set_cache mov_es, new_es; + struct exfat_chain newdir; num_new_entries = exfat_calc_num_entries(p_uniname); if (num_new_entries < 0) return num_new_entries; - ret = exfat_get_dentry_set(&mov_es, sb, p_olddir, oldentry, - ES_ALL_ENTRIES); + ret = exfat_get_dentry_set_by_ei(&mov_es, parent_inode->i_sb, ei); if (ret) return -EIO; - newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries, - &new_es); + newentry = exfat_find_empty_entry(parent_inode, &newdir, + num_new_entries, &new_es); if (newentry < 0) { ret = newentry; /* -EIO or -ENOSPC */ goto put_mov_es; @@ -1091,18 +1106,16 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, *epnew = *epmov; exfat_init_ext_entry(&new_es, num_new_entries, p_uniname); - exfat_remove_entries(inode, &mov_es, ES_IDX_FILE); - - exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size, - p_newdir->flags); + exfat_remove_entries(parent_inode, &mov_es, ES_IDX_FILE); + ei->dir = newdir; ei->entry = newentry; - ret = exfat_put_dentry_set(&new_es, IS_DIRSYNC(inode)); + ret = exfat_put_dentry_set(&new_es, 
IS_DIRSYNC(parent_inode)); if (ret) goto put_mov_es; - return exfat_put_dentry_set(&mov_es, IS_DIRSYNC(inode)); + return exfat_put_dentry_set(&mov_es, IS_DIRSYNC(parent_inode)); put_mov_es: exfat_put_dentry_set(&mov_es, false); @@ -1116,19 +1129,12 @@ static int __exfat_rename(struct inode *old_parent_inode, struct dentry *new_dentry) { int ret; - int dentry; - struct exfat_chain olddir, newdir; - struct exfat_chain *p_dir = NULL; struct exfat_uni_name uni_name; - struct exfat_dentry *ep; struct super_block *sb = old_parent_inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); const unsigned char *new_path = new_dentry->d_name.name; struct inode *new_inode = new_dentry->d_inode; struct exfat_inode_info *new_ei = NULL; - unsigned int new_entry_type = TYPE_UNUSED; - int new_entry = 0; - struct buffer_head *new_bh = NULL; /* check the validity of pointer parameters */ if (new_path == NULL || strlen(new_path) == 0) @@ -1139,11 +1145,6 @@ static int __exfat_rename(struct inode *old_parent_inode, return -ENOENT; } - exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu, - EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi), - EXFAT_I(old_parent_inode)->flags); - dentry = ei->entry; - /* check whether new dir is existing directory and empty */ if (new_inode) { ret = -EIO; @@ -1154,17 +1155,8 @@ static int __exfat_rename(struct inode *old_parent_inode, goto out; } - p_dir = &(new_ei->dir); - new_entry = new_ei->entry; - ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh); - if (!ep) - goto out; - - new_entry_type = exfat_get_entry_type(ep); - brelse(new_bh); - /* if new_inode exists, update ei */ - if (new_entry_type == TYPE_DIR) { + if (S_ISDIR(new_inode->i_mode)) { struct exfat_chain new_clu; new_clu.dir = new_ei->start_clu; @@ -1180,26 +1172,22 @@ static int __exfat_rename(struct inode *old_parent_inode, } /* check the validity of directory name in the given new pathname */ - ret = exfat_resolve_path(new_parent_inode, new_path, &newdir, - &uni_name); + ret = exfat_resolve_path(new_parent_inode, new_path, &uni_name); if (ret) goto out; exfat_set_volume_dirty(sb); - if (olddir.dir == newdir.dir) - ret = exfat_rename_file(new_parent_inode, &olddir, dentry, - &uni_name, ei); + if (new_parent_inode == old_parent_inode) + ret = exfat_rename_file(new_parent_inode, &uni_name, ei); else - ret = exfat_move_file(new_parent_inode, &olddir, dentry, - &newdir, &uni_name, ei); + ret = exfat_move_file(new_parent_inode, &uni_name, ei); if (!ret && new_inode) { struct exfat_entry_set_cache es; /* delete entries of new_dir */ - ret = exfat_get_dentry_set(&es, sb, p_dir, new_entry, - ES_ALL_ENTRIES); + ret = exfat_get_dentry_set_by_ei(&es, sb, new_ei); if (ret) { ret = -EIO; goto del_out; @@ -1212,7 +1200,7 @@ static int __exfat_rename(struct inode *old_parent_inode, goto del_out; /* Free the clusters if new_inode is a dir(as if exfat_rmdir) */ - if (new_entry_type == TYPE_DIR && + if (S_ISDIR(new_inode->i_mode) && new_ei->start_clu != EXFAT_EOF_CLUSTER) { /* new_ei, new_clu_to_free */ struct exfat_chain new_clu_to_free; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4f2dd4ab4486..0c899cfba578 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -382,14 +382,24 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags) { const struct export_operations *nop = inode->i_sb->s_export_op; + enum fid_type type; if (!exportfs_can_encode_fh(nop, flags)) return -EOPNOTSUPP; if (!nop && (flags & EXPORT_FH_FID)) - return 
exportfs_encode_ino64_fid(inode, fid, max_len); + type = exportfs_encode_ino64_fid(inode, fid, max_len); + else + type = nop->encode_fh(inode, fid->raw, max_len, parent); + + if (type > 0 && FILEID_USER_FLAGS(type)) { + pr_warn_once("%s: unexpected fh type value 0x%x from fstype %s.\n", + __func__, type, inode->i_sb->s_type->name); + return -EINVAL; + } + + return type; - return nop->encode_fh(inode, fid->raw, max_len, parent); } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); @@ -436,6 +446,9 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, char nbuf[NAME_MAX+1]; int err; + if (fileid_type < 0 || FILEID_USER_FLAGS(fileid_type)) + return ERR_PTR(-EINVAL); + /* * Try to get any dentry for the given file handle from the filesystem. */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 591fb3f710be..8042ad873808 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), - ext4_end_bitmap_read); + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef6a3c8f3a9a..02d47a64e8d1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -418,7 +418,7 @@ struct fname { __u32 inode; __u8 name_len; __u8 file_type; - char name[]; + char name[] __counted_by(name_len); }; /* @@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; - int len; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + ent_name->len + 1; - new_fn = kzalloc(len, GFP_KERNEL); + new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), + GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..74f2071189b2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1729,6 +1729,10 @@ struct ext4_sb_info { */ struct work_struct s_sb_upd_work; + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -1865,14 +1869,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb, return false; } -static inline void ext4_simulate_fail_bh(struct super_block *sb, - struct buffer_head *bh, - unsigned long code) -{ - if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) - clear_buffer_uptodate(bh); -} - /* * Error number codes for s_{first,last}_error_errno * @@ -3100,9 +3096,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - 
bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); @@ -3855,6 +3851,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 34e25eee6521..a07a98a4b97a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); - err = ext4_read_bh(bh, 0, NULL); + err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } @@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN, 0); + EXTENT_STATUS_WRITTEN, false); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -4158,7 +4158,7 @@ insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_insert_extent(inode, hole_start, len, ~0, - EXTENT_STATUS_HOLE, 0); + EXTENT_STATUS_HOLE, false); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) @@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode); BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4541,6 +4541,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c786691dabd3..ae29832aab1e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -848,7 +848,7 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags) + unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; @@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, flags, inode->i_ino); + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; @@ -945,7 +945,7 @@ error: resv_used += pending; if (resv_used) ext4_da_update_reserve_space(inode, resv_used, - flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + delalloc_reserve_used); if (err1 || err2 || err3 < 0) goto retry; diff --git a/fs/ext4/extents_status.h 
b/fs/ext4/extents_status.h index 4424232de298..8f9c008d11e8 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags); + unsigned int status, + bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b33664f6ce2a..26c4fc37edcf 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode) return; restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_unlock(&sbi->s_fc_lock); return; } @@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl } spin_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - if (has_transaction && - (!is_ineligible || - (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid)))) + if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); spin_unlock(&sbi->s_fc_lock); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..3bd96c3d4cd0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) - return size; - return ext4_handle_inode_extension(inode, pos, size, size); + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -564,12 +565,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - ext4_journal_stop(handle); + if (ret) + goto out; } if (ilock_shared && !unwritten) @@ -599,6 +597,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. 
+ */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) @@ -692,6 +697,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + int ret; + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else @@ -884,6 +903,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index df853c4d3a8c..383c6edea6dd 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) return fmr->fmr_physical + fmr->fmr_length; } +static int ext4_getfsmap_meta_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* Return relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical < info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_start || + p->fmr_physical + p->fmr_length <= fs_end) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, @@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, ext4_getfsmap_datadev_helper, info); if (error) goto err; @@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb, /* Report any gaps at the end of the bg */ info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, + 0, info); if (error) goto err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7f1a5f90dbbd..21d228073d79 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ 
trace_ext4_load_inode_bitmap(sb, block_group); - ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7404f0935c90..7de327fa7b1c 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, } if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, 0); + map->m_pblk, status, false); return retval; } @@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, flags); + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); return retval; } @@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); - bh = sb_getblk(inode->i_sb, map.m_pblk); + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { @@ -1307,8 +1314,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. 
Second, it forces lock @@ -1423,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2985,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); @@ -3029,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); + zero_len = pos - old_size; + } - if (disksize_changed) { - handle_t *handle; + if (!disksize_changed && !zero_len) + return copied; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); return copied; } @@ -3444,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic writes so that we fall back to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic writes, we never fall back to buffered I/O. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -4497,10 +4529,10 @@ make_io: * Read the block from disk.
*/ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; @@ -4974,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } @@ -5426,6 +5459,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5436,12 +5477,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, @@ -5578,6 +5624,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1c77400bd88e..7b9ce71c1c81 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1330,7 +1330,6 @@ group_extend_out: case EXT4_IOC_MOVE_EXT: { struct move_extent me; - struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || @@ -1342,30 +1341,26 @@ group_extend_out: return -EFAULT; me.moved_len = 0; - donor = fdget(me.donor_fd); - if (!fd_file(donor)) + CLASS(fd, donor)(me.donor_fd); + if (fd_empty(donor)) return -EBADF; - if (!(fd_file(donor)->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto mext_out; - } + if (!(fd_file(donor)->f_mode & FMODE_WRITE)) + return -EBADF; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); - err = -EOPNOTSUPP; - goto mext_out; + return -EOPNOTSUPP; } else if (IS_DAX(inode)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with DAX"); - err = -EOPNOTSUPP; - goto mext_out; + return -EOPNOTSUPP; } err = mnt_want_write_file(filp); if (err) - goto mext_out; + return err; err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, 
me.len, &me.moved_len); @@ -1374,8 +1369,6 @@ group_extend_out: if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; -mext_out: - fdput(donor); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d73e38323879..b25a27c86696 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); - mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); @@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, } out_dbg: - mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } @@ -6999,13 +6999,14 @@ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, + ext4_grpblk_t first, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; - ext4_grpblk_t next; + ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; @@ -7016,10 +7017,19 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = max(e4b.bd_info->bb_first_free, start); + start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d8553f1498d3..f8280de3e882 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -259,6 +259,7 @@ ext4_mballoc_query_range( ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bd946d0c71b7..d64c04ed061a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } lock_buffer(*bh); - ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL); + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); if (ret) goto warn_exit; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b64661ea6e0e..898443e98efc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) unlock_buffer(bh); continue; } - ext4_read_bh_nowait(bh, 0, NULL); + ext4_read_bh_nowait(bh, 0, NULL, false); nr++; } while (block++, (bh = bh->b_this_page) != head); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 790db7eac6c2..536d56d15072 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) - return (struct buffer_head *) frame; + return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); @@ -1952,7 +1952,7 
@@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - return (struct ext4_dir_entry_2 *) bh2; + return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); @@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, else split = count/2; + if (WARN_ON_ONCE(split == 0)) { + /* Should never happen, but avoid out-of-bounds access below */ + ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, + "bad indexed directory? hash=%08x:%08x count=%d move=%u", + hinfo->hash, hinfo->minor_hash, count, move); + err = -EFSCORRUPTED; + goto out; + } + hash2 = map[split].hash; - continued = split > 0 ? hash2 == map[split - 1].hash : 0; + continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); @@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return de; journal_error: + ext4_std_error(dir->i_sb, err); +out: brelse(*bh); brelse(bh2); *bh = NULL; - ext4_std_error(dir->i_sb, err); return ERR_PTR(err); } @@ -2395,11 +2405,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#if IS_ENABLED(CONFIG_UNICODE) - if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - utf8_validate(sb->s_encoding, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; -#endif retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) @@ -3411,7 +3418,6 @@ retry: inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3427,6 +3433,9 @@ retry: disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..69b8a7221a2b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -417,11 +417,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) + if (io->io_bio == NULL) { io_submit_init_bio(io, bh); + io->io_bio->bi_write_hint = inode->i_write_hint; + } if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a2704f064361..72f77f78ae8d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { brelse(bh); return NULL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..785809f33ff4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -161,8 +161,14 @@ MODULE_ALIAS("ext3"); static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { + if (simu_fail) { + clear_buffer_uptodate(bh); + unlock_buffer(bh); + return; + } + 
/* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we @@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, unlock_buffer(bh); return; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); } -int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io return 0; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { - ext4_read_bh_nowait(bh, op_flags, NULL); + ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } - return ext4_read_bh(bh, op_flags, NULL); + return ext4_read_bh(bh, op_flags, NULL, false); } /* @@ -266,7 +273,7 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) if (likely(bh)) { if (trylock_buffer(bh)) - ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } @@ -346,9 +353,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb, __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { - return le16_to_cpu(bg->bg_free_inodes_count_lo) | + return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -402,9 +409,9 @@ void ext4_free_group_clusters_set(struct super_block *sb, void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { - bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); + WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, @@ -2096,16 +2103,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, } #define EXT4_SET_CTX(name) \ -static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ -static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ @@ -3030,6 +3037,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("mb_optimize_scan=1"); } + if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) + SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3709,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1; if (!ret) { - start_time = ktime_get_real_ns(); + start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 
0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { - elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3774,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg) cont_thread: while (true) { - next_wakeup = MAX_JIFFY_OFFSET; + bool next_wakeup_initialized = false; + next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); @@ -3788,8 +3799,11 @@ cont_thread: lr_request); if (time_before(jiffies, elr->lr_next_sched)) { - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { @@ -3817,16 +3831,18 @@ cont_thread: elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || - (MAX_JIFFY_OFFSET == next_wakeup)) { + if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } @@ -4425,6 +4441,36 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * @sb: super block + * TODO: Later add support for bigalloc + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(sb->s_blocksize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -5336,6 +5382,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -6301,7 +6348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) - return 0; + return -EIO; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6518,8 +6565,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (test_opt2(sb, ABORT)) - ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && + !test_opt(sb, DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); + err = -EINVAL; + goto restore_opts; + } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); @@ -6689,6 +6740,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); + /* + * Handle aborting the filesystem as the last thing during remount to + * avoid obscure errors during remount when some option changes fail to + * apply due to shutdown filesystem. + */ + if (test_opt2(sb, ABORT)) + ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + return 0; restore_opts: @@ -7329,7 +7388,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 8bffdeccdbc3..1fbc0607363b 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -296,9 +296,8 @@ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f76460b721f..efda9a022981 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -32,7 +32,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, f2fs_build_fault_attr(sbi, 0, 0); if (!end_io) f2fs_flush_merged_writes(sbi); - f2fs_handle_critical_error(sbi, reason, end_io); + f2fs_handle_critical_error(sbi, reason); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..a2478c2afb3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; @@ -1676,7 +1679,8 @@ next_block: /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; - if (flag != F2FS_GET_BLOCK_DIO || !is_hole) + /* DIO READ and hole case, should not map the blocks.
*/ + if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; @@ -1818,16 +1822,6 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } -static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) -{ - return (bytes >> inode->i_blkbits); -} - -static inline u64 blks_to_bytes(struct inode *inode, u64 blks) -{ - return (blks << inode->i_blkbits); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1853,7 +1847,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1885,7 +1879,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1901,30 +1895,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, return (err < 0 ? err : 0); } -static loff_t max_inode_blocks(struct inode *inode) -{ - loff_t result = ADDRS_PER_INODE(inode); - loff_t leaf_count = ADDRS_PER_BLOCK(inode); - - /* two direct node blocks */ - result += (leaf_count * 2); - - /* two indirect node blocks */ - leaf_count *= NIDS_PER_BLOCK; - result += (leaf_count * 2); - - /* one double indirect node block */ - leaf_count *= NIDS_PER_BLOCK; - result += leaf_count; - - return result; -} - int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; - sector_t start_blk, last_blk; + sector_t start_blk, last_blk, blk_len, max_len; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; @@ -1966,16 +1941,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (bytes_to_blks(inode, len) == 0) - len = blks_to_bytes(inode, 1); - - start_blk = bytes_to_blks(inode, start); - last_blk = bytes_to_blks(inode, start + len - 1); + start_blk = F2FS_BYTES_TO_BLK(start); + last_blk = F2FS_BYTES_TO_BLK(start + len - 1); + blk_len = last_blk - start_blk + 1; + max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; - map.m_len = bytes_to_blks(inode, len); + map.m_len = blk_len; map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; @@ -1992,13 +1966,23 @@ next: if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, - max_inode_blocks(inode))) + if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } + /* + * current extent may cross boundary of inquiry, increase len to + * requery. 
+ */ + if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && + map.m_lblk + map.m_len - 1 == last_blk && + blk_len != max_len) { + blk_len = max_len; + goto next; + } + compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || @@ -2030,14 +2014,14 @@ skip_fill: } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; - size += blks_to_bytes(inode, appended_blks); + size += F2FS_BLK_TO_BYTES(appended_blks); start_blk += appended_blks; compr_cluster = false; } else { - logical = blks_to_bytes(inode, start_blk); + logical = F2FS_BLK_TO_BYTES(start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? - blks_to_bytes(inode, map.m_pblk) : 0; - size = blks_to_bytes(inode, map.m_len); + F2FS_BLK_TO_BYTES(map.m_pblk) : 0; + size = F2FS_BLK_TO_BYTES(map.m_len); flags = 0; if (compr_cluster) { @@ -2045,13 +2029,13 @@ skip_fill: count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; - size += blks_to_bytes(inode, 1); + size += F2FS_BLKSIZE; } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } - start_blk += bytes_to_blks(inode, size); + start_blk += F2FS_BYTES_TO_BLK(size); } prep_next: @@ -2089,7 +2073,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, struct readahead_control *rac) { struct bio *bio = *bio_ret; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -2099,8 +2083,8 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -2200,7 +2184,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; @@ -2209,8 +2193,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2385,10 +2369,10 @@ static int f2fs_mpage_readpages(struct inode *inode, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; + pgoff_t index; #endif unsigned nr_pages = rac ? 
readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; - pgoff_t index; int ret = 0; map.m_pblk = 0; @@ -2406,9 +2390,9 @@ static int f2fs_mpage_readpages(struct inode *inode, prefetchw(&folio->flags); } +#ifdef CONFIG_F2FS_FS_COMPRESSION index = folio_index(folio); -#ifdef CONFIG_F2FS_FS_COMPRESSION if (!f2fs_compressed_file(inode)) goto read_single_page; @@ -3441,6 +3425,11 @@ restart: if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { + if (IS_DEVICE_ALIASING(inode)) { + err = -ENODATA; + goto out; + } + if (locked) { err = f2fs_reserve_block(&dn, index); goto out; @@ -3971,7 +3960,7 @@ static int check_swap_activate(struct swap_info_struct *sis, * to be very smart. */ cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); + last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; @@ -4214,8 +4203,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, pgoff_t next_pgofs = 0; int err; - map.m_lblk = bytes_to_blks(inode, offset); - map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_lblk = F2FS_BYTES_TO_BLK(offset); + map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -4226,7 +4215,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (err) return err; - iomap->offset = blks_to_bytes(inode, map.m_lblk); + iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file @@ -4246,21 +4235,21 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; - iomap->addr = blks_to_bytes(inode, map.m_pblk); + iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; if (map.m_pblk == NULL_ADDR) { - iomap->length = blks_to_bytes(inode, next_pgofs) - - iomap->offset; + iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - + iomap->offset; iomap->type = IOMAP_HOLE; } else if (map.m_pblk == NEW_ADDR) { - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_UNWRITTEN; } else { f2fs_bug_on(F2FS_I_SB(inode), 1); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 546b8ba91261..468828288a4a 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -60,6 +60,70 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_DEBUG_FS +static void update_multidevice_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_dev_stats *dev_stats = si->dev_stats; + int i, j; + + if (!f2fs_is_multi_device(sbi)) + return; + + memset(dev_stats, 0, sizeof(struct f2fs_dev_stats) * sbi->s_ndevs); + for (i = 0; i < sbi->s_ndevs; i++) { + unsigned int start_segno, end_segno; + block_t start_blk, end_blk; + + if (i == 0) { + start_blk = MAIN_BLKADDR(sbi); + end_blk = FDEV(i).end_blk + 1 - SEG0_BLKADDR(sbi); + } else { + start_blk = FDEV(i).start_blk; + end_blk = FDEV(i).end_blk + 1; + } + + start_segno = GET_SEGNO(sbi, start_blk); + end_segno = GET_SEGNO(sbi, end_blk); + + for (j = start_segno; j < end_segno; j++) { + 
unsigned int seg_blks, sec_blks; + + seg_blks = get_seg_entry(sbi, j)->valid_blocks; + + /* update segment stats */ + if (IS_CURSEG(sbi, j)) + dev_stats[i].devstats[0][DEVSTAT_INUSE]++; + else if (seg_blks == BLKS_PER_SEG(sbi)) + dev_stats[i].devstats[0][DEVSTAT_FULL]++; + else if (seg_blks != 0) + dev_stats[i].devstats[0][DEVSTAT_DIRTY]++; + else if (!test_bit(j, FREE_I(sbi)->free_segmap)) + dev_stats[i].devstats[0][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[0][DEVSTAT_PREFREE]++; + + if (!__is_large_section(sbi) || + (j % SEGS_PER_SEC(sbi)) != 0) + continue; + + sec_blks = get_sec_entry(sbi, j)->valid_blocks; + + /* update section stats */ + if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + dev_stats[i].devstats[1][DEVSTAT_INUSE]++; + else if (sec_blks == BLKS_PER_SEC(sbi)) + dev_stats[i].devstats[1][DEVSTAT_FULL]++; + else if (sec_blks != 0) + dev_stats[i].devstats[1][DEVSTAT_DIRTY]++; + else if (!test_bit(GET_SEC_FROM_SEG(sbi, j), + FREE_I(sbi)->free_secmap)) + dev_stats[i].devstats[1][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[1][DEVSTAT_PREFREE]++; + } + } +} + static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -214,6 +278,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_blks[type] += blks; } + update_multidevice_stats(sbi); + for (i = 0; i < MAX_CALL_TYPE; i++) si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); @@ -498,6 +564,36 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + if (f2fs_is_multi_device(sbi)) { + seq_puts(s, "Multidevice stats:\n"); + seq_printf(s, " [seg: %8s %8s %8s %8s %8s]", + "inuse", "dirty", "full", "free", "prefree"); + if (__is_large_section(sbi)) + seq_printf(s, " [sec: %8s %8s %8s %8s %8s]\n", + "inuse", "dirty", "full", "free", "prefree"); + else + seq_puts(s, "\n"); + + for (i = 0; i < sbi->s_ndevs; i++) { + seq_printf(s, " #%-2d %8u %8u %8u %8u %8u", i, + si->dev_stats[i].devstats[0][DEVSTAT_INUSE], + si->dev_stats[i].devstats[0][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[0][DEVSTAT_FULL], + si->dev_stats[i].devstats[0][DEVSTAT_FREE], + si->dev_stats[i].devstats[0][DEVSTAT_PREFREE]); + if (!__is_large_section(sbi)) { + seq_puts(s, "\n"); + continue; + } + seq_printf(s, " %8u %8u %8u %8u %8u\n", + si->dev_stats[i].devstats[1][DEVSTAT_INUSE], + si->dev_stats[i].devstats[1][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[1][DEVSTAT_FULL], + si->dev_stats[i].devstats[1][DEVSTAT_FREE], + si->dev_stats[i].devstats[1][DEVSTAT_PREFREE]); + } + seq_puts(s, "\n"); + } seq_printf(s, "CP calls: %d (BG: %d)\n", si->cp_call_count[TOTAL_CALL], si->cp_call_count[BACKGROUND]); @@ -598,9 +694,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - data: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + seq_printf(s, " - quota data: %4d in quota files:%4d\n", si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); @@ -665,6 +761,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + struct f2fs_dev_stats *dev_stats; unsigned long flags; int i; @@ 
-672,6 +769,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) if (!si) return -ENOMEM; + dev_stats = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_stats) * + sbi->s_ndevs, GFP_KERNEL); + if (!dev_stats) { + kfree(si); + return -ENOMEM; + } + + si->dev_stats = dev_stats; + si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -724,6 +830,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + kfree(si->dev_stats); kfree(si); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 62ac440d9416..347b3b647834 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -24,6 +24,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_info ei; + int devi; get_read_extent_info(&ei, i_ext); @@ -38,7 +39,36 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) ei.blk, ei.fofs, ei.len); return false; } - return true; + + if (!IS_DEVICE_ALIASING(inode)) + return true; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (FDEV(devi).start_blk != ei.blk || + FDEV(devi).end_blk != ei.blk + ei.len - 1) + continue; + + if (devi == 0) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) is an alias of meta device", + __func__, inode->i_ino); + return false; + } + + if (bdev_is_zoned(FDEV(devi).bdev)) { + f2fs_warn(sbi, + "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] maps to zoned block device", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; + } + return true; + } + + f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] is inconsistent w/ any devices", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; } static void __set_extent_info(struct extent_info *ei, @@ -76,6 +106,9 @@ static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) static bool __may_extent_tree(struct inode *inode, enum extent_type type) { + if (IS_DEVICE_ALIASING(inode) && type == EX_READ) + return true; + /* * for recovered files during mount do not create extents * if shrinker is not registered. 
@@ -346,21 +379,22 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et) + struct extent_tree *et, unsigned int nr_shrink) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = atomic_read(&et->node_cnt); + unsigned int count; node = rb_first_cached(&et->root); - while (node) { + + for (count = 0; node && count < nr_shrink; count++) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); __release_extent_node(sbi, et, en); node = next; } - return count - atomic_read(&et->node_cnt); + return count; } static void __drop_largest_extent(struct extent_tree *et, @@ -401,6 +435,11 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt) || !ei.len) goto skip; + if (IS_DEVICE_ALIASING(inode)) { + et->largest = ei; + goto skip; + } + en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); if (en) { @@ -463,6 +502,11 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } + if (IS_DEVICE_ALIASING(inode)) { + ret = false; + goto out; + } + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); if (!en) goto out; @@ -579,6 +623,30 @@ do_insert: return en; } +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int nr_shrink = type == EX_READ ? + READ_EXTENT_CACHE_SHRINK_NUMBER : + AGE_EXTENT_CACHE_SHRINK_NUMBER; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + while (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, nr_shrink); + write_unlock(&et->lock); + } + + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + return node_cnt; +} + static void __update_extent_tree_range(struct inode *inode, struct extent_info *tei, enum extent_type type) { @@ -649,7 +717,9 @@ static void __update_extent_tree_range(struct inode *inode, } if (end < org_end && (type != EX_READ || - org_end - end >= F2FS_MIN_EXTENT_LEN)) { + (org_end - end >= F2FS_MIN_EXTENT_LEN && + atomic_read(&et->node_cnt) < + sbi->max_read_extent_count))) { if (parts) { __set_extent_info(&ei, end, org_end - end, @@ -717,9 +787,6 @@ static void __update_extent_tree_range(struct inode *inode, } } - if (is_inode_flag_set(inode, FI_NO_EXTENT)) - __free_extent_tree(sbi, et); - if (et->largest_updated) { et->largest_updated = false; updated = true; @@ -737,6 +804,9 @@ update_age_extent_cache: out_read_extent_cache: write_unlock(&et->lock); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __destroy_extent_node(inode, EX_READ); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } @@ -899,10 +969,14 @@ static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et); + node_cnt += __free_extent_tree(sbi, et, + nr_shrink - node_cnt - tree_cnt); write_unlock(&et->lock); } - f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + if (atomic_read(&et->node_cnt)) + goto unlock_out; + list_del_init(&et->list); radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); @@ -1041,23 +1115,6 @@ unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int 
nr_shrink return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); } -static unsigned int __destroy_extent_node(struct inode *inode, - enum extent_type type) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; - unsigned int node_cnt = 0; - - if (!et || !atomic_read(&et->node_cnt)) - return 0; - - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et); - write_unlock(&et->lock); - - return node_cnt; -} - void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); @@ -1066,7 +1123,6 @@ void f2fs_destroy_extent_node(struct inode *inode) static void __drop_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; @@ -1074,7 +1130,6 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) return; write_lock(&et->lock); - __free_extent_tree(sbi, et); if (type == EX_READ) { set_inode_flag(inode, FI_NO_EXTENT); if (et->largest.len) { @@ -1083,6 +1138,9 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) } } write_unlock(&et->lock); + + __destroy_extent_node(inode, type); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } @@ -1156,6 +1214,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; sbi->last_age_weight = LAST_AGE_WEIGHT; + sbi->max_read_extent_count = DEF_MAX_READ_EXTENT_COUNT; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 33f5449dc22d..6f2cbf4c5740 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -213,6 +213,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_CASEFOLD 0x00001000 #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 +#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -634,6 +635,9 @@ enum { #define DEF_HOT_DATA_AGE_THRESHOLD 262144 #define DEF_WARM_DATA_AGE_THRESHOLD 2621440 +/* default max read extent count per inode */ +#define DEF_MAX_READ_EXTENT_COUNT 10240 + /* extent cache type */ enum extent_type { EX_READ, @@ -1018,7 +1022,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) -enum { +enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ @@ -1063,7 +1067,6 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ - unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -1619,6 +1622,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; atomic64_t allocated_data_blocks; /* for block age extent_cache */ + unsigned int max_read_extent_count; /* max read extent count per inode */ /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; @@ -1758,6 +1762,7 @@ 
struct f2fs_sb_info { unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ + unsigned int first_zoned_segno; /* first zoned segno */ /* For write statistics */ u64 sectors_written_start; @@ -3046,6 +3051,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ #define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) @@ -3061,6 +3067,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that are appropriate for non-directories/regular files. */ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) +#define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) + static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) @@ -3632,8 +3640,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); @@ -3754,7 +3761,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -int f2fs_get_segment_temp(int seg_type); +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3771,8 +3779,7 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); @@ -3783,6 +3790,8 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno); #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 @@ -3935,6 +3944,19 @@ void f2fs_destroy_recovery_cache(void); * debug.c */ #ifdef CONFIG_F2FS_STAT_FS +enum { + DEVSTAT_INUSE, + DEVSTAT_DIRTY, + DEVSTAT_FULL, + DEVSTAT_FREE, + DEVSTAT_PREFREE, + DEVSTAT_MAX, +}; + +struct f2fs_dev_stats { + unsigned int 
devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ +}; + struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; @@ -3998,6 +4020,7 @@ struct f2fs_stat_info { unsigned int block_count[2]; unsigned int inplace_count; unsigned long long base_mem, cache_mem, page_mem; + struct f2fs_dev_stats *dev_stats; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) @@ -4510,6 +4533,7 @@ F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); +F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ae54c4c72fe..aa9679b3d8e4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -725,6 +725,11 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); + if (IS_DEVICE_ALIASING(inode) && from) { + err = -EINVAL; + goto out_err; + } + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= max_file_blocks(inode)) @@ -739,6 +744,21 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } + if (IS_DEVICE_ALIASING(inode)) { + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_info ei = et->largest; + unsigned int i; + + for (i = 0; i < ei.len; i++) + f2fs_invalidate_blocks(sbi, ei.blk + i); + + dec_valid_block_count(sbi, inode, ei.len); + f2fs_update_time(sbi, REQ_TIME); + + f2fs_put_page(ipage, 1); + goto out; + } + if (f2fs_has_inline_data(inode)) { f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); @@ -774,7 +794,7 @@ free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); - +out_err: trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -863,7 +883,11 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return true; if (f2fs_compressed_file(inode)) return true; - if (f2fs_has_inline_data(inode)) + /* + * only force direct read to use buffered IO, for direct write, + * it expects inline data conversion before committing IO. + */ + if (f2fs_has_inline_data(inode) && rw == READ) return true; /* disallow direct IO if any of devices has unaligned blksize */ @@ -992,7 +1016,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EPERM; if ((attr->ia_valid & ATTR_SIZE)) { - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || + IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) && !IS_ALIGNED(attr->ia_size, @@ -1790,7 +1815,8 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: - if (has_not_enough_free_secs(sbi, 0, + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? 
+ ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); @@ -1860,7 +1886,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ @@ -2343,9 +2369,12 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, if (readonly) goto out; - /* grab sb->s_umount to avoid racing w/ remount() */ + /* + * grab sb->s_umount to avoid racing w/ remount() and other shutdown + * paths. + */ if (need_lock) - down_read(&sbi->sb->s_umount); + down_write(&sbi->sb->s_umount); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); @@ -2354,7 +2383,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, clear_opt(sbi, DISCARD); if (need_lock) - up_read(&sbi->sb->s_umount); + up_write(&sbi->sb->s_umount); f2fs_update_time(sbi, REQ_TIME); out: @@ -2861,7 +2890,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) @@ -3038,32 +3067,27 @@ out: static int __f2fs_ioc_move_range(struct file *filp, struct f2fs_move_range *range) { - struct fd dst; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; - dst = fdget(range->dst_fd); - if (!fd_file(dst)) + CLASS(fd, dst)(range->dst_fd); + if (fd_empty(dst)) return -EBADF; - if (!(fd_file(dst)->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto err_out; - } + if (!(fd_file(dst)->f_mode & FMODE_WRITE)) + return -EBADF; err = mnt_want_write_file(filp); if (err) - goto err_out; + return err; err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst), range->pos_out, range->len); mnt_drop_write_file(filp); -err_out: - fdput(dst); return err; } @@ -3296,6 +3320,9 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (IS_DEVICE_ALIASING(inode)) + return -EINVAL; + if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", __func__, inode->i_ino, fi->i_gc_failures); @@ -3326,6 +3353,9 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (!pin && IS_DEVICE_ALIASING(inode)) + return -EOPNOTSUPP; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -3391,6 +3421,12 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) return put_user(pin, (u32 __user *)arg); } +static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) +{ + return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 
1 : 0, + (u32 __user *)arg); +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -3792,7 +3828,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, to_reserved = cluster_size - compr_blocks - reserved; /* for the case all blocks in cluster were reserved */ - if (to_reserved == 1) { + if (reserved && to_reserved == 1) { dn->ofs_in_node += cluster_size; goto next; } @@ -4490,6 +4526,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_decompress_file(filp); case F2FS_IOC_COMPRESS_FILE: return f2fs_ioc_compress_file(filp); + case F2FS_IOC_GET_DEV_ALIAS_FILE: + return f2fs_ioc_get_dev_alias_file(filp, arg); default: return -ENOTTY; } @@ -4647,7 +4685,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_count(to), READ); /* In LFS mode, if there is inflight dio, wait for its completion */ - if (f2fs_lfs_mode(F2FS_I_SB(inode))) + if (f2fs_lfs_mode(F2FS_I_SB(inode)) && + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE)) inode_dio_wait(inode); if (f2fs_should_use_dio(inode, iocb, to)) { @@ -4764,7 +4803,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, else return 0; - map.m_may_create = true; + if (!IS_DEVICE_ALIASING(inode)) + map.m_may_create = true; if (dio) { map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); @@ -4820,8 +4860,8 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, { struct inode *inode = iter->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); - enum temp_type temp = f2fs_get_segment_temp(seg_type); + enum log_type type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); + enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); submit_bio(bio); @@ -5201,6 +5241,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_COMPRESS_OPTION: case F2FS_IOC_DECOMPRESS_FILE: case F2FS_IOC_COMPRESS_FILE: + case F2FS_IOC_GET_DEV_ALIAS_FILE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9322a7200e31..3e1b6d2ff3a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -257,6 +257,8 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) switch (sbi->gc_mode) { case GC_IDLE_CB: + case GC_URGENT_LOW: + case GC_URGENT_MID: gc_mode = GC_CB; break; case GC_IDLE_GREEDY: @@ -361,20 +363,15 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; unsigned char u; - unsigned int i; unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); - for (i = 0; i < usable_segs_per_sec; i++) - mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); vblocks = get_valid_blocks(sbi, segno, true); - - mtime = div_u64(mtime, usable_segs_per_sec); vblocks = div_u64(vblocks, usable_segs_per_sec); u = BLKS_TO_SEGS(sbi, vblocks * 100); @@ -519,10 +516,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, struct victim_sel_policy *p, unsigned int segno) { struct sit_info *sit_i = 
SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; - unsigned int i; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (p->gc_mode == GC_AT && @@ -530,9 +524,8 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, return; } - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, start + i)->mtime; - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2914b678bf8f..5c1eaf55e127 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -35,6 +35,7 @@ #define LIMIT_BOOST_ZONED_GC 25 /* percentage over total user space of boosted gc for zoned devices */ #define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3 #define BOOST_GC_MULTIPLE 5 +#define ZONED_PIN_SEC_REQUIRED_COUNT 1 #define DEF_GC_FAILED_PINNED_FILES 2048 #define MAX_GC_FAILED_PINNED_FILES USHRT_MAX diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1ed86df343a5..282fd320bdb3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -372,6 +372,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (IS_DEVICE_ALIASING(inode)) { + if (!f2fs_sb_has_device_alias(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + __func__, inode->i_ino); + return false; + } + if (!f2fs_is_pinned_file(inode)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + __func__, inode->i_ino); + return false; + } + } + return true; } @@ -775,8 +788,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; - if (!f2fs_is_checkpoint_ready(sbi)) + if (!f2fs_is_checkpoint_ready(sbi)) { + f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; + } /* * We need to balance fs here to prevent from producing dirty node pages @@ -823,7 +838,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); - f2fs_destroy_extent_tree(inode); + if (!IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; @@ -879,6 +895,9 @@ retry: goto retry; } + if (IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); + if (err) { f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59b13ff243fa..0b900a7a48e5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -905,6 +905,16 @@ static int truncate_node(struct dnode_of_data *dn) if (err) return err; + if (ni.blk_addr != NEW_ADDR && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { + f2fs_err_ratelimited(sbi, + "nat entry is corrupted, run fsck to fix it, ino:%u, " + "nid:%u, blkaddr:%u", ni.ino, ni.nid, ni.blk_addr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } + /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); @@ -1056,7 +1066,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + nid[0] = get_nid(dn->inode_page, offset[0], true); if (!nid[0]) return 0; @@ -1167,7 
+1177,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = get_nid(page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1199,13 +1209,10 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && - ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + if (offset[1] == 0 && get_nid(page, offset[0], true)) { lock_page(page); BUG_ON(page->mapping != NODE_MAPPING(sbi)); - f2fs_wait_on_page_writeback(page, NODE, true, true); - ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; - set_page_dirty(page); + set_nid(page, offset[0], 0, true); unlock_page(page); } offset[1] = 0; @@ -1331,7 +1338,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) err = -EFSCORRUPTED; dec_valid_node_count(sbi, dn->inode, !ofs); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + f2fs_warn_ratelimited(sbi, + "f2fs_new_node_page: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + new_ni.ino, new_ni.nid, new_ni.blk_addr, + new_ni.version, new_ni.flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); goto fail; } #endif diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e4d81b8705d1..f35be2c48e3c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -899,13 +899,8 @@ skip: * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) { - int err2 = f2fs_fix_curseg_write_pointer(sbi); - - if (!err2) - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; + if (!err) { + err = f2fs_check_and_fix_write_pointer(sbi); ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1766254279d2..eade36c5ef13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,16 +1290,18 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, wait_list, issued); return 0; } - - /* - * Issue discard for conventional zones only if the device - * supports discard. - */ - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; } #endif + /* + * stop issuing discard for any of below cases: + * 1. device is conventional zone, but it doesn't support discard. + * 2. device is a regular device, after snapshot it doesn't support + * discard.
+ */ + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; @@ -2711,7 +2713,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else - segno = max(first_zoned_segno(sbi), *newseg); + segno = max(sbi->first_zoned_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif @@ -2723,7 +2725,7 @@ find_other_zone: if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { - hint = GET_SEC_FROM_SEG(sbi, first_zoned_segno(sbi)); + hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, @@ -2926,7 +2928,8 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -3237,7 +3240,8 @@ retry: if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1); + err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), + true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; @@ -3581,18 +3585,35 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } } -int f2fs_get_segment_temp(int seg_type) +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type type) { - if (IS_HOT(seg_type)) - return HOT; - else if (IS_WARM(seg_type)) - return WARM; - return COLD; + struct curseg_info *curseg = CURSEG_I(sbi, type); + enum temp_type temp = COLD; + + switch (curseg->seg_type) { + case CURSEG_HOT_NODE: + case CURSEG_HOT_DATA: + temp = HOT; + break; + case CURSEG_WARM_NODE: + case CURSEG_WARM_DATA: + temp = WARM; + break; + case CURSEG_COLD_NODE: + case CURSEG_COLD_DATA: + temp = COLD; + break; + default: + f2fs_bug_on(sbi, 1); + } + + return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { - int type = 0; + enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: @@ -3608,7 +3629,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) f2fs_bug_on(fio->sbi, true); } - fio->temp = f2fs_get_segment_temp(type); + fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } @@ -3793,10 +3814,35 @@ void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, } } +static int log_type_to_seg_type(enum log_type type) +{ + int seg_type = CURSEG_COLD_DATA; + + switch (type) { + case CURSEG_HOT_DATA: + case CURSEG_WARM_DATA: + case CURSEG_COLD_DATA: + case CURSEG_HOT_NODE: + case CURSEG_WARM_NODE: + case CURSEG_COLD_NODE: + seg_type = (int)type; + break; + case CURSEG_COLD_DATA_PINNED: + case CURSEG_ALL_DATA_ATGC: + seg_type = CURSEG_COLD_DATA; + break; + default: + break; + } + return seg_type; +} + static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio); - bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); + enum log_type type = __get_segment_type(fio); + int seg_type = log_type_to_seg_type(type); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && + 
seg_type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); @@ -3977,8 +4023,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } - f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); + f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -4778,12 +4824,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; - if (i < NR_PERSISTENT_LOG) - array[i].seg_type = CURSEG_HOT_DATA + i; - else if (i == CURSEG_COLD_DATA_PINNED) - array[i].seg_type = CURSEG_COLD_DATA; - else if (i == CURSEG_ALL_DATA_ATGC) - array[i].seg_type = CURSEG_COLD_DATA; + array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); @@ -5207,7 +5248,7 @@ static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; @@ -5312,12 +5353,12 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; } -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { - ret = fix_curseg_write_pointer(sbi, i); + ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } @@ -5340,7 +5381,7 @@ static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, return check_zone_write_pointer(args->sbi, args->fdev, zone); } -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; @@ -5360,6 +5401,20 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) +{ + int ret; + + if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb)) + return 0; + + f2fs_notice(sbi, "Checking entire write pointers"); + ret = fix_curseg_write_pointer(sbi); + if (!ret) + ret = check_write_pointer(sbi); + return ret; +} + /* * Return the number of usable blocks in a segment. 
The number of blocks * returned is always equal to the number of blocks in a segment for @@ -5396,12 +5451,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( return BLKS_PER_SEG(sbi); } #else -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) -{ - return 0; -} - -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } @@ -5430,6 +5480,35 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) return SEGS_PER_SEC(sbi); } +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); + unsigned int secno = 0, start = 0; + unsigned int total_valid_blocks = 0; + unsigned long long mtime = 0; + unsigned int i = 0; + + secno = GET_SEC_FROM_SEG(sbi, segno); + start = GET_SEG_FROM_SEC(sbi, secno); + + if (!__is_large_section(sbi)) + return get_seg_entry(sbi, start + i)->mtime; + + for (i = 0; i < usable_segs_per_sec; i++) { + /* for large section, only check the mtime of valid segments */ + struct seg_entry *se = get_seg_entry(sbi, start+i); + + mtime += se->mtime * se->valid_blocks; + total_valid_blocks += se->valid_blocks; + } + + if (total_valid_blocks == 0) + return INVALID_MTIME; + + return div_u64(mtime, total_valid_blocks); +} + /* * Update min, max modified time for cost-benefit GC algorithm */ @@ -5443,13 +5522,9 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { - unsigned int i; unsigned long long mtime = 0; - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, segno + i)->mtime; - - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 71adb4a43bec..943be4f1d6d2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ #define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ +#define INVALID_MTIME ULLONG_MAX /* no valid blocks in a segment/section */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) @@ -32,10 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) -#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) -#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) - #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ @@ -524,8 +522,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments + - SM_I(sbi)->additional_reserved_segments; + return SM_I(sbi)->reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) @@ -559,18 +556,21 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, - unsigned int node_blocks, unsigned int dent_blocks) + unsigned int 
node_blocks, unsigned int data_blocks, + unsigned int dent_blocks) { - unsigned segno, left_blocks; + unsigned int segno, left_blocks, blocks; int i; - /* check current node sections in the worst case. */ - for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + /* check current data/node sections in the worst case. */ + for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); - if (node_blocks > left_blocks) + + blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; + if (blocks > left_blocks) return false; } @@ -584,8 +584,9 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, } /* - * calculate needed sections for dirty node/dentry - * and call has_curseg_enough_space + * calculate needed sections for dirty node/dentry and call + * has_curseg_enough_space, please note that, it needs to account + * dirty data as well in lfs mode when checkpoint is disabled. */ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) @@ -594,19 +595,30 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int total_data_blocks = 0; unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int data_secs = 0; unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int data_blocks = 0; + + if (f2fs_lfs_mode(sbi) && + unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); + data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); + data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + } if (lower_p) - *lower_p = node_secs + dent_secs; + *lower_p = node_secs + dent_secs + data_secs; if (upper_p) *upper_p = node_secs + dent_secs + - (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + + (data_blocks ? 
1 : 0); if (curseg_p) *curseg_p = has_curseg_enough_space(sbi, - node_blocks, dent_blocks); + node_blocks, data_blocks, dent_blocks); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -637,12 +649,30 @@ static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, return !has_not_enough_free_secs(sbi, freed, needed); } +static inline bool has_enough_free_blks(struct f2fs_sb_info *sbi) +{ + unsigned int total_free_blocks = 0; + unsigned int avail_user_block_count; + + spin_lock(&sbi->stat_lock); + + avail_user_block_count = get_available_block_count(sbi, NULL, true); + total_free_blocks = avail_user_block_count - (unsigned int)valid_user_blocks(sbi); + + spin_unlock(&sbi->stat_lock); + + return total_free_blocks > 0; +} + static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (likely(has_enough_free_secs(sbi, 0, 0))) return true; + if (!f2fs_lfs_mode(sbi) && + likely(has_enough_free_blks(sbi))) + return true; return false; } @@ -957,13 +987,3 @@ wake_up: dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } - -static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi) -{ - int devi; - - for (devi = 0; devi < sbi->s_ndevs; devi++) - if (bdev_is_zoned(FDEV(devi).bdev)) - return GET_SEGNO(sbi, FDEV(devi).start_blk); - return 0; -} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87ab5696bd48..fc7d463dee15 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -150,6 +150,8 @@ enum { Opt_mode, Opt_fault_injection, Opt_fault_type, + Opt_lazytime, + Opt_nolazytime, Opt_quota, Opt_noquota, Opt_usrquota, @@ -226,6 +228,8 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_fault_type, "fault_type=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, @@ -834,6 +838,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: + if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + f2fs_err(sbi, "device aliasing requires extent cache"); + return -EINVAL; + } clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: @@ -918,6 +926,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "fault_type options not supported"); break; #endif + case Opt_lazytime: + sb->s_flags |= SB_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~SB_LAZYTIME; + break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: @@ -1158,7 +1172,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(ext[ext_cnt], name); + ret = strscpy(ext[ext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; @@ -1187,7 +1205,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(noext[noext_cnt], name); + ret = strscpy(noext[noext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); break; @@ -1738,6 +1760,18 @@ static int f2fs_freeze(struct super_block *sb) static int f2fs_unfreeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + /* + * It will update discard_max_bytes of mounted lvm device to zero + * after creating snapshot on this lvm 
device, let's drop all + * remained discards. + * We don't need to disable real-time discard because discard_max_bytes + * will recover after removal of snapshot. + */ + if (test_opt(sbi, DISCARD) && !f2fs_hw_support_discard(sbi)) + f2fs_issue_discard_timeout(sbi); + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } @@ -2474,6 +2508,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + adjust_unusable_cap_perc(sbi); if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); @@ -2518,7 +2553,6 @@ skip: (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); - adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_checkpoint: @@ -3322,7 +3356,7 @@ loff_t max_file_blocks(struct inode *inode) * fit within U32_MAX + 1 data units. */ - result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); + result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); return result; } @@ -4155,8 +4189,7 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context) +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) { struct super_block *sb = sbi->sb; bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; @@ -4168,10 +4201,12 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, if (!f2fs_hw_is_readonly(sbi)) { save_stop_reason(sbi, reason); - if (irq_context && !shutdown) - schedule_work(&sbi->s_error_work); - else - f2fs_record_stop_reason(sbi); + /* + * always create an asynchronous task to record stop_reason + * in order to avoid potential deadlock when running into + * f2fs_record_stop_reason() synchronously. + */ + schedule_work(&sbi->s_error_work); } /* @@ -4217,6 +4252,16 @@ static void f2fs_record_error_work(struct work_struct *work) f2fs_record_stop_reason(sbi); } +static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi) +{ + int devi; + + for (devi = 0; devi < sbi->s_ndevs; devi++) + if (bdev_is_zoned(FDEV(devi).bdev)) + return GET_SEGNO(sbi, FDEV(devi).start_blk); + return 0; +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4617,6 +4662,9 @@ try_onemore: /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); + /* get segno of first zoned block device */ + sbi->first_zoned_segno = get_first_zoned_segno(sbi); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) @@ -4738,26 +4786,23 @@ try_onemore: reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, - * check zoned block devices' write pointer consistency. + * write pointer consistency of cursegs and other zones are already + * checked and fixed during recovery. However, if recovery fails, + * write pointers are left untouched, and retry-mount should check + * them here. 
*/ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) { - int err2; - - f2fs_notice(sbi, "Checking entire write pointers"); - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; - } + if (skip_recovery) + err = f2fs_check_and_fix_write_pointer(sbi); if (err) goto free_meta; + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + err = f2fs_init_inmem_curseg(sbi); if (err) goto sync_free_meta; - /* f2fs_recover_fsync_data() cleared this already */ - clear_sbi_flag(sbi, SBI_POR_DOING); - if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) @@ -4991,9 +5036,6 @@ static int __init init_f2fs_fs(void) err = f2fs_init_shrinker(); if (err) goto free_sysfs; - err = register_filesystem(&f2fs_fs_type); - if (err) - goto free_shrinker; f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) @@ -5016,7 +5058,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_casefold_cache(); if (err) goto free_compress_cache; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_casefold_cache; return 0; +free_casefold_cache: + f2fs_destroy_casefold_cache(); free_compress_cache: f2fs_destroy_compress_cache(); free_compress_mempool: @@ -5031,8 +5078,6 @@ free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); -free_shrinker: f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); @@ -5056,6 +5101,7 @@ fail: static void __exit exit_f2fs_fs(void) { + unregister_filesystem(&f2fs_fs_type); f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); @@ -5064,7 +5110,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c56e8c873935..6b99dc49f776 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -501,9 +501,7 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks - - SEGS_TO_BLKS(sbi, - SM_I(sbi)->additional_reserved_segments))) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -789,6 +787,13 @@ out: return count; } + if (!strcmp(a->attr.name, "max_read_extent_count")) { + if (t > UINT_MAX) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + if (!strcmp(a->attr.name, "ipu_policy")) { if (t >= BIT(F2FS_IPU_MAX)) return -EINVAL; @@ -1054,6 +1059,8 @@ F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +/* read extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); #ifdef CONFIG_BLK_DEV_ZONED F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); @@ -1244,6 +1251,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), + ATTR_LIST(max_read_extent_count), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1313,6 +1321,7 @@ F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); F2FS_SB_FEATURE_RO_ATTR(readonly, 
RO); +F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_encryption), @@ -1329,6 +1338,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_casefold), ATTR_LIST(sb_compression), ATTR_LIST(sb_readonly), + ATTR_LIST(sb_device_alias), NULL, }; ATTRIBUTE_GROUPS(f2fs_sb_feat); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6423e1dedf14..15bf32c21ac0 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1037,7 +1037,7 @@ error_inode: if (corrupt < 0) { fat_fs_error(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __func__, sinfo.i_pos); + __func__, new_i_pos); } goto out; } diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..49884fa3c81d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> @@ -375,6 +374,9 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, u64 __user *argp = (u64 __user *)arg; u64 hint; + if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) + return -EPERM; + if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) @@ -397,6 +399,9 @@ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return -EBADF; + /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. @@ -570,24 +575,21 @@ static int check_fcntl_cmd(unsigned cmd) SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget_raw(fd); - long err = -EBADF; + CLASS(fd_raw, f)(fd); + long err; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); -out1: - fdput(f); -out: return err; } @@ -596,21 +598,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock64 flock; - long err = -EBADF; + long err; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out1; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out1; + return err; switch (cmd) { case F_GETLK64: @@ -635,9 +637,6 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out1: - fdput(f); -out: return err; } #endif @@ -733,21 +732,21 @@ static int fixup_compat_flock(struct flock *flock) static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); struct flock flock; - long err = -EBADF; + long err; - if (!fd_file(f)) - return err; + if (fd_empty(f)) + return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) - goto out_put; + return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) - goto out_put; + return err; switch (cmd) { case F_GETLK: @@ -790,8 +789,6 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } -out_put: - 
fdput(f); return err; } diff --git a/fs/fhandle.c b/fs/fhandle.c index 82df28d45cd7..3e092ae6d142 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -31,6 +31,14 @@ static long do_sys_name_to_handle(const struct path *path, if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; + /* + * A request to encode a connectable handle for a disconnected dentry + * is unexpected since AT_EMPTY_PATH is not allowed. + */ + if (fh_flags & EXPORT_FH_CONNECTABLE && + WARN_ON(path->dentry->d_flags & DCACHE_DISCONNECTED)) + return -EINVAL; + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; @@ -45,7 +53,7 @@ static long do_sys_name_to_handle(const struct path *path, /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; - /* we ask for a non connectable maybe decodeable file handle */ + /* Encode a possibly decodeable/connectable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, &handle_dwords, fh_flags); @@ -67,8 +75,23 @@ static long do_sys_name_to_handle(const struct path *path, * non variable part of the file_handle */ handle_bytes = 0; - } else + } else { + /* + * When asked to encode a connectable file handle, encode this + * property in the file handle itself, so that we later know + * how to decode it. + * For sanity, also encode in the file handle if the encoded + * object is a directory and verify this during decode, because + * decoding directory file handles is quite different than + * decoding connectable non-directory file handles. + */ + if (fh_flags & EXPORT_FH_CONNECTABLE) { + handle->handle_type |= FILEID_IS_CONNECTABLE; + if (d_is_dir(path->dentry)) + fh_flags |= FILEID_IS_DIR; + } retval = 0; + } /* copy the mount id */ if (unique_mntid) { if (put_user(real_mount(path->mnt)->mnt_id_unique, @@ -109,15 +132,30 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, { struct path path; int lookup_flags; - int fh_flags; + int fh_flags = 0; int err; if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | - AT_HANDLE_MNT_ID_UNIQUE)) + AT_HANDLE_MNT_ID_UNIQUE | AT_HANDLE_CONNECTABLE)) return -EINVAL; + /* + * AT_HANDLE_FID means there is no intention to decode file handle + * AT_HANDLE_CONNECTABLE means there is an intention to decode a + * connected fd (with known path), so these flags are conflicting. + * AT_EMPTY_PATH could be used along with a dfd that refers to a + * disconnected non-directory, which cannot be used to encode a + * connectable file handle, because its parent is unknown. + */ + if (flag & AT_HANDLE_CONNECTABLE && + flag & (AT_HANDLE_FID | AT_EMPTY_PATH)) + return -EINVAL; + else if (flag & AT_HANDLE_FID) + fh_flags |= EXPORT_FH_FID; + else if (flag & AT_HANDLE_CONNECTABLE) + fh_flags |= EXPORT_FH_CONNECTABLE; + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; - fh_flags = (flag & AT_HANDLE_FID) ? 
EXPORT_FH_FID : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); @@ -139,28 +177,16 @@ static int get_path_from_fd(int fd, struct path *root) path_get(root); spin_unlock(&fs->lock); } else { - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); - fdput(f); } return 0; } -enum handle_to_path_flags { - HANDLE_CHECK_PERMS = (1 << 0), - HANDLE_CHECK_SUBTREE = (1 << 1), -}; - -struct handle_to_path_ctx { - struct path root; - enum handle_to_path_flags flags; - unsigned int fh_flags; -}; - static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; @@ -208,7 +234,13 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry) if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) retval = 1; - WARN_ON_ONCE(d != root && d != root->d_sb->s_root); + /* + * exportfs_decode_fh_raw() does not call acceptable() callback with + * a disconnected directory dentry, so we should have reached either + * mount fd directory or sb root. + */ + if (ctx->fh_flags & EXPORT_FH_DIR_ONLY) + WARN_ON_ONCE(d != root && d != root->d_sb->s_root); dput(d); return retval; } @@ -218,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; + struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh_raw(mnt, - (struct fid *)handle->f_handle, - handle_dwords, handle->handle_type, - ctx->fh_flags, - vfs_dentry_acceptable, ctx); - if (IS_ERR_OR_NULL(path->dentry)) { - if (path->dentry == ERR_PTR(-ENOMEM)) + dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + ctx->fh_flags, vfs_dentry_acceptable, + ctx); + if (IS_ERR_OR_NULL(dentry)) { + if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } + path->dentry = dentry; path->mnt = mntget(mnt); return 0; } -/* - * Allow relaxed permissions of file handles if the caller has the - * ability to mount the filesystem or create a bind-mount of the - * provided @mountdirfd. - * - * In both cases the caller may be able to get an unobstructed way to - * the encoded file handle. If the caller is only able to create a - * bind-mount we need to verify that there are no locked mounts on top - * of it that could prevent us from getting to the encoded file. - * - * In principle, locked mounts can prevent the caller from mounting the - * filesystem but that only applies to procfs and sysfs neither of which - * support decoding file handles. - */ -static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, - unsigned int o_flags) +static inline int may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) { struct path *root = &ctx->root; + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + /* - * Restrict to O_DIRECTORY to provide a deterministic API that avoids a - * confusing api in the face of disconnected non-dir dentries. + * Allow relaxed permissions of file handles if the caller has + * the ability to mount the filesystem or create a bind-mount of + * the provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed + * way to the encoded file handle. 
If the caller is only able to + * create a bind-mount we need to verify that there are no + * locked mounts on top of it that could prevent us from getting + * to the encoded file. + * + * In principle, locked mounts can prevent the caller from + * mounting the filesystem but that only applies to procfs and + * sysfs neither of which support decoding file handles. + * + * Restrict to O_DIRECTORY to provide a deterministic API that + * avoids a confusing api in the face of disconnected non-dir + * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) - return false; + return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; @@ -271,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else - return false; + return -EPERM; /* Are we able to override DAC permissions? */ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) - return false; + return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; - return true; + return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, @@ -288,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct file_handle f_handle; struct file_handle *handle = NULL; struct handle_to_path_ctx ctx = {}; + const struct export_operations *eops; retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) goto out_err; - if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { - retval = -EPERM; + eops = ctx.root.mnt->mnt_sb->s_export_op; + if (eops && eops->permission) + retval = eops->permission(&ctx, o_flags); + else + retval = may_decode_fh(&ctx, o_flags); + if (retval) goto out_path; - } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; @@ -307,6 +348,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, retval = -EINVAL; goto out_path; } + if (f_handle.handle_type < 0 || + FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { + retval = -EINVAL; + goto out_path; + } + handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { @@ -322,6 +369,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_handle; } + /* + * If handle was encoded with AT_HANDLE_CONNECTABLE, verify that we + * are decoding an fd with connected path, which is accessible from + * the mount fd path. 
+ */ + if (f_handle.handle_type & FILEID_IS_CONNECTABLE) { + ctx.fh_flags |= EXPORT_FH_CONNECTABLE; + ctx.flags |= HANDLE_CHECK_SUBTREE; + } + if (f_handle.handle_type & FILEID_IS_DIR) + ctx.fh_flags |= EXPORT_FH_DIR_ONLY; + /* Filesystem code should not be exposed to user flags */ + handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); out_handle: @@ -336,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; - struct path path; + struct path path __free(path_put) = {}; struct file *file; - int fd; + const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - fd = get_unused_fd_flags(open_flag); - if (fd < 0) { - path_put(&path); + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) return fd; - } - file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) { - put_unused_fd(fd); - retval = PTR_ERR(file); - } else { - retval = fd; - fd_install(fd, file); - } - path_put(&path); - return retval; + + eops = path.mnt->mnt_sb->s_export_op; + if (eops->open) + file = eops->open(&path, open_flag); + else + file = file_open_root(&path, "", open_flag, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + fd_install(fd, file); + return take_fd(fd); } /** diff --git a/fs/file.c b/fs/file.c index eb093e736972..d868cdb95d1e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -20,10 +20,74 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> +#include <linux/file_ref.h> #include <net/sock.h> +#include <linux/init_task.h> #include "internal.h" +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) +{ + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. 
+ */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} +EXPORT_SYMBOL_GPL(__file_ref_put); + unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ @@ -89,18 +153,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". - * - * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied - * by that "1024/sizeof(ptr)" before, we already know there are sufficient - * clear low bits. Clang seems to realize that, gcc ends up being confused. - * - * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, - * let's consider it documentation (and maybe a test-case for gcc to improve - * its code generation ;) */ -static struct fdtable * alloc_fdtable(unsigned int nr) +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -108,22 +165,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. Since we called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); - nr = ALIGN(nr, BITS_PER_LONG); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) @@ -152,14 +219,14 @@ out_arr: out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. 
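To make the alloc_fdtable() sizing policy above concrete, a stand-alone userspace restatement of the same arithmetic. fdtable_slots() is a hypothetical name, nr_open stands in for sysctl_nr_open, BITS_PER_LONG is assumed to be 64, and returning 0 stands in for -EMFILE:

#include <stdio.h>

static unsigned int fdtable_slots(unsigned int slots_wanted,
                                  unsigned int nr_open, int is_32bit)
{
        unsigned int nr = 1;

        while (nr < slots_wanted)               /* roundup_pow_of_two() */
                nr <<= 1;
        if (is_32bit && nr < 256)               /* skip 64 and 128 on 32-bit */
                nr = 256;
        if (nr > nr_open) {                     /* clamp to sysctl_nr_open,    */
                nr = nr_open & ~63u;            /* rounded down to whole words */
                if (nr < slots_wanted)
                        return 0;               /* -EMFILE in the kernel */
        }
        return nr;
}

int main(void)
{
        /* 300 wanted slots, 64-bit, default nr_open of 1024*1024 -> 512 */
        printf("%u\n", fdtable_slots(300, 1024 * 1024, 0));
        return 0;
}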
*/ static int expand_fdtable(struct files_struct *files, unsigned int nr) @@ -169,7 +236,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); + new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. @@ -178,16 +245,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) synchronize_rcu(); spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... - */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); @@ -196,15 +255,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); - return 1; + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) @@ -212,50 +270,50 @@ static int expand_files(struct files_struct *files, unsigned int nr) __acquires(files->file_lock) { struct fdtable *fdt; - int expanded = 0; + int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return expanded; - - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; + return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); - expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } + /* Can we expand? 
*/ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; - expanded = expand_fdtable(files, nr); + error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); - return expanded; -} - -static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) -{ - __set_bit(fd, fdt->close_on_exec); + return error; } -static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - if (test_bit(fd, fdt->close_on_exec)) - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } -static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); @@ -264,7 +322,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); - __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) @@ -306,7 +366,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - int error; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) @@ -338,17 +397,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - error = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - error = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -389,10 +441,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); - return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) @@ -413,7 +461,7 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); @@ -470,6 +518,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; 
if (bitbit >= maxfd) @@ -496,7 +553,7 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) + if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* @@ -504,36 +561,22 @@ repeat: * will limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif out: spin_unlock(&files->file_lock); @@ -599,7 +642,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); + WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -713,7 +756,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, } /** - * __close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close @@ -721,8 +764,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. */ -int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -839,7 +884,7 @@ static struct file *__get_file_rcu(struct file __rcu **f) if (!file) return NULL; - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); @@ -853,8 +898,8 @@ static struct file *__get_file_rcu(struct file __rcu **f) OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* - * atomic_long_inc_not_zero() above provided a full memory - * barrier when we acquired a reference. + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still @@ -952,11 +997,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * We need to confirm it by incrementing the refcount * and then check the lookup again. * - * atomic_long_inc_not_zero() gives us a full memory - * barrier. We only really need an 'acquire' one to - * protect the loads below, but we don't have that. + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. 
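The find_next_fd() change earlier in this chunk first probes the open_fds word containing 'start' and only then consults the full_fds_bits summary (one bit per word of open_fds). A toy userspace restatement of that two-level search, assuming 64-bit words and a consistently maintained summary bitmap; first_free_fd() is a made-up name:

#include <stdint.h>

static unsigned int first_free_fd(const uint64_t *open_bits,
                                  const uint64_t *full_bits,
                                  unsigned int maxfd, unsigned int start)
{
        unsigned int nwords = maxfd / 64;
        unsigned int word = start / 64;
        /* Treat bits below 'start' as taken so they are never returned. */
        uint64_t w = open_bits[word] | ((1ULL << (start % 64)) - 1);

        if (~w)                 /* fast path: a free slot in the same word */
                return word * 64 + (unsigned int)__builtin_ctzll(~w);

        for (word++; word < nwords; word++) {
                if (full_bits[word / 64] & (1ULL << (word % 64)))
                        continue;       /* whole word known to be full */
                return word * 64 +
                       (unsigned int)__builtin_ctzll(~open_bits[word]);
        }
        return maxfd;           /* nothing free at or above 'start' */
}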
*/ - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) continue; /* @@ -1037,29 +1082,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *lookup_fdget_rcu(unsigned int fd) -{ - return __fget_files_rcu(current->files, fd, 0); - -} -EXPORT_SYMBOL_GPL(lookup_fdget_rcu); - -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) -{ - /* Must be called with rcu_read_lock held */ - struct files_struct *files; - struct file *file = NULL; - - task_lock(task); - files = task->files; - if (files) - file = __fget_files_rcu(files, fd, 0); - task_unlock(task); - - return file; -} - -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -1069,17 +1092,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int * task_lock(task); files = task->files; if (files) { + rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } + rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fdget_rcu); +EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1096,6 +1121,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu); * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. + * + * (As an exception to rule 2, you can call filp_close between fget_light and + * fput_light provided that you capture a real refcount with get_file before + * the call to filp_close, and ensure that this real refcount is fput *after* + * the fput_light call.) + * + * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { @@ -1176,13 +1208,8 @@ void __f_unlock_pos(struct file *f) void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } @@ -1204,17 +1231,9 @@ __releases(&files->file_lock) /* * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * not finished descriptor. + * + * POSIX is silent on the issue, we return -EBUSY. 
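close_range(), wired up as a direct SYSCALL_DEFINE3 in the previous chunk, covers the same bulk-close and bulk-cloexec cases from userspace. A short example; glibc 2.34+ exposes the wrapper, and the fallback flag value is only there for older headers:

#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>

#ifndef CLOSE_RANGE_CLOEXEC
#define CLOSE_RANGE_CLOEXEC     (1U << 2)
#endif

int main(void)
{
        /* Mark every descriptor from 3 upwards close-on-exec in one call,
         * e.g. before exec'ing a helper that should only inherit 0-2. */
        if (close_range(3, ~0U, CLOSE_RANGE_CLOEXEC) < 0)
                perror("close_range");
        return 0;
}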
*/ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); @@ -1223,11 +1242,7 @@ __releases(&files->file_lock) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..a32171d2b83f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,7 +9,6 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> @@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = { /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; +static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path user_path; + union { + struct path user_path; + freeptr_t bf_freeptr; + }; }; static inline struct backing_file *backing_file(struct file *f) @@ -68,7 +71,7 @@ static inline void file_free(struct file *f) put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); - kfree(backing_file(f)); + kmem_cache_free(bfilp_cachep, backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } @@ -125,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, @@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ + memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->f_ra, 0, sizeof(f->f_ra)); + + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + + f->f_op = NULL; + f->f_mapping = NULL; + f->private_data = NULL; + f->f_inode = NULL; + f->f_owner = NULL; +#ifdef CONFIG_EPOLL + f->f_ep = NULL; +#endif + + f->f_iocb_flags = 0; + f->f_pos = 0; + f->f_wb_err = 0; + f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. 
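That ordering matters because, under SLAB_TYPESAFE_BY_RCU, an RCU-side lookup can race with the object being freed and immediately reused for a new file: the lookup pins first and revalidates afterwards. A simplified sketch of that consumer pattern — lookup_pinned() and 'slot' are made-up names, and the in-tree __get_file_rcu()/__fget_files_rcu() retry the lookup and handle more corner cases instead of just failing:

static struct file *lookup_pinned(struct file __rcu **slot)
{
        struct file *file;

        rcu_read_lock();
        file = rcu_dereference(*slot);
        if (file) {
                if (!file_ref_get(&file->f_ref)) {
                        file = NULL;    /* raced with the final fput() */
                } else if (rcu_dereference(*slot) != file) {
                        /* The slab object was recycled for another file while
                         * we looked: drop the spurious reference, report a miss. */
                        fput(file);
                        file = NULL;
                }
        }
        rcu_read_unlock();
        return file;
}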
*/ - atomic_long_set(&f->f_count, 1); + file_ref_init(&f->f_ref, 1); return 0; } @@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) struct backing_file *ff; int error; - ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { - kfree(ff); + kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } @@ -459,6 +478,8 @@ static void ____fput(struct callback_head *work) __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -472,14 +493,13 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); - void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { + if (file_ref_put(&file->f_ref)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { @@ -512,7 +532,7 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) + if (file_ref_put(&file->f_ref)) __fput(file); } @@ -529,6 +549,11 @@ void __init files_init(void) filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + + args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); + bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), + &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3cd99e2dc6ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. 
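The next chunk makes this helper static and adds wbc_attach_fdatawrite_inode() as the entry point for __filemap_fdatawrite_range(). Its expected calling pattern, modelled on that caller, is roughly the following sketch (fdatawrite_range_sketch() is a made-up name and the writeback_control initialisation is illustrative, not copied from mm/filemap.c):

static int fdatawrite_range_sketch(struct address_space *mapping,
                                   loff_t start, loff_t end)
{
        struct writeback_control wbc = {
                .sync_mode      = WB_SYNC_ALL,
                .nr_to_write    = LONG_MAX,
                .range_start    = start,
                .range_end      = end,
        };
        int ret;

        wbc_attach_fdatawrite_inode(&wbc, mapping->host); /* attach cgroup wb context */
        ret = do_writepages(mapping, &wbc);
        wbc_detach_inode(&wbc);                           /* foreign-inode detection  */
        return ret;
}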
*/ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) @@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/fs/fs_context.c b/fs/fs_context.c index 98589aae5208..582d33e81117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc) if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; - for (i = 0; i <= 7; i++) + for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..16fa61ef56bf 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc, f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); + param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: @@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_fd); +int fs_param_is_file_or_string(struct p_log *log, + 
const struct fs_parameter_spec *p, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + switch (param->type) { + case fs_value_is_string: + return fs_param_is_string(log, p, param, result); + case fs_value_is_file: + result->uint_32 = param->dirfd; + if (result->uint_32 <= INT_MAX) + return 0; + break; + default: + break; + } + return fs_param_bad_value(log, param); +} +EXPORT_SYMBOL(fs_param_is_file_or_string); + int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/fs/fsopen.c b/fs/fsopen.c index 6cef3deccded..094a7f510edf 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -349,7 +349,6 @@ SYSCALL_DEFINE5(fsconfig, int, aux) { struct fs_context *fc; - struct fd f; int ret; int lookup_flags = 0; @@ -392,12 +391,11 @@ SYSCALL_DEFINE5(fsconfig, return -EOPNOTSUPP; } - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (fd_file(f)->f_op != &fscontext_fops) - goto out_f; + return -EINVAL; fc = fd_file(f)->private_data; if (fc->ops == &legacy_fs_context_ops) { @@ -407,17 +405,14 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: case FSCONFIG_CMD_CREATE_EXCL: - ret = -EOPNOTSUPP; - goto out_f; + return -EOPNOTSUPP; } } if (_key) { param.key = strndup_user(_key, 256); - if (IS_ERR(param.key)) { - ret = PTR_ERR(param.key); - goto out_f; - } + if (IS_ERR(param.key)) + return PTR_ERR(param.key); } switch (cmd) { @@ -496,7 +491,5 @@ SYSCALL_DEFINE5(fsconfig, } out_key: kfree(param.key); -out_f: - fdput(f); return ret; } diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index ce0ff7a9007b..2c372180d631 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -14,5 +14,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_SYSCTL) += sysctl.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 0b2da7b7e2ad..b39844d75a80 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -303,8 +303,8 @@ struct cuse_init_args { struct fuse_args_pages ap; struct cuse_init_in in; struct cuse_init_out out; - struct page *page; - struct fuse_page_desc desc; + struct folio *folio; + struct fuse_folio_desc desc; }; /** @@ -326,7 +326,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = &ia->out; - struct page *page = ap->pages[0]; + struct folio *folio = ap->folios[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; @@ -343,7 +343,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, + rc = cuse_parse_devinfo(folio_address(folio), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -411,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, kobject_uevent(&dev->kobj, KOBJ_ADD); out: kfree(ia); - __free_page(page); + folio_put(folio); return; err_cdev: @@ -429,7 +429,7 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct page *page; + struct folio *folio; struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -437,13 +437,14 @@ static int cuse_send_init(struct cuse_conn *cc) 
BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); rc = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto err; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (!ia) - goto err_free_page; + goto err_free_folio; ap = &ia->ap; ia->in.major = FUSE_KERNEL_VERSION; @@ -459,18 +460,18 @@ static int cuse_send_init(struct cuse_conn *cc) ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; ap->args.out_argvar = true; ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &ia->page; + ap->num_folios = 1; + ap->folios = &ia->folio; ap->descs = &ia->desc; - ia->page = page; + ia->folio = folio; ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); -err_free_page: - __free_page(page); +err_free_folio: + folio_put(folio); } err: return rc; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..9abbc2f2894f 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -774,16 +774,6 @@ out: return ret; } -static int fuse_dax_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - - return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); -} - static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, bool write) { @@ -1323,7 +1313,6 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) } static const struct address_space_operations fuse_dax_file_aops = { - .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1f64ae6d7a69..27ccae63495d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1028,17 +1028,27 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - - for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { + for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); + struct page *orig, *pagep; + + orig = pagep = &ap->folios[i]->page; - err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); + err = fuse_copy_page(cs, &pagep, offset, count, zeroing); if (err) return err; nbytes -= count; + + /* + * fuse_copy_page may have moved a page from a pipe instead of + * copying into our given page, so update the folios if it was + * replaced. 
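The CUSE and FUSE hunks around here are largely a mechanical page-to-folio conversion. For reference, the handful of equivalences they rely on, collected into one throwaway sketch (folio_demo() is a made-up name; everything assumes the order-0 folios FUSE uses, where a folio and its single page coincide):

static void folio_demo(void)
{
        /* alloc_page(GFP_KERNEL | __GFP_ZERO) becomes folio_alloc(..., 0) */
        struct folio *folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);

        if (!folio)
                return;

        /* page_address()          -> folio_address()       */
        memset(folio_address(folio), 0, folio_size(folio));
        /* SetPageUptodate()       -> folio_mark_uptodate() */
        folio_mark_uptodate(folio);
        /* __free_page()/put_page() -> folio_put()          */
        folio_put(folio);
}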
+ */ + if (pagep != orig) + ap->folios[i] = page_folio(pagep); } return 0; } @@ -1654,24 +1664,25 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { + struct folio *folio; struct page *page; unsigned int this_num; - err = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) + folio = filemap_grab_folio(mapping, index); + err = PTR_ERR(folio); + if (IS_ERR(folio)) goto out_iput; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + page = &folio->page; + this_num = min_t(unsigned, num, folio_size(folio) - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!PageUptodate(page) && !err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) { - zero_user_segment(page, this_num, PAGE_SIZE); - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && !err && offset == 0 && + (this_num == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, this_num, folio_size(folio)); + folio_mark_uptodate(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (err) goto out_iput; @@ -1703,7 +1714,7 @@ static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_retrieve_args *ra = container_of(args, typeof(*ra), ap.args); - release_pages(ra->ap.pages, ra->ap.num_pages); + release_pages(ra->ap.folios, ra->ap.num_folios); kfree(ra); } @@ -1717,7 +1728,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages; + unsigned int num_pages, cur_pages = 0; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1736,15 +1747,15 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); + args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) return -ENOMEM; ap = &ra->ap; - ap->pages = (void *) (ra + 1); - ap->descs = (void *) (ap->pages + num_pages); + ap->folios = (void *) (ra + 1); + ap->descs = (void *) (ap->folios + num_pages); args = &ap->args; args->nodeid = outarg->nodeid; @@ -1755,19 +1766,20 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && ap->num_pages < num_pages) { - struct page *page; + while (num && cur_pages < num_pages) { + struct folio *folio; unsigned int this_num; - page = find_get_page(mapping, index); - if (!page) + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].offset = offset; - ap->descs[ap->num_pages].length = this_num; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = this_num; + ap->num_folios++; + cur_pages++; offset = 0; num -= this_num; @@ -2371,13 +2383,12 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) int res; int oldfd; struct fuse_dev *fud = NULL; - struct fd f; if (get_user(oldfd, argp)) return -EFAULT; - f = fdget(oldfd); - if (!fd_file(f)) + CLASS(fd, f)(oldfd); + if (fd_empty(f)) return -EINVAL; /* @@ -2394,7 +2405,6 @@ static long 
fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) mutex_unlock(&fuse_mutex); } - fdput(f); return res; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54104dd48af7..e540d05549ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -366,7 +366,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; - u64 attr_version; + u64 attr_version, evict_ctr; int err; *inode = NULL; @@ -381,6 +381,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out; attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); @@ -398,7 +399,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), - attr_version); + attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); @@ -691,7 +692,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -822,7 +823,7 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, ATTR_TIMEOUT(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -1585,13 +1586,13 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct page *page) +static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { - .num_pages = 1, - .pages = &page, + .num_folios = 1, + .folios = &folio, .descs = &desc, }; char *link; @@ -1614,7 +1615,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) if (WARN_ON(res >= PAGE_SIZE)) return -EIO; - link = page_address(page); + link = folio_address(folio); link[res] = '\0'; return 0; @@ -1624,7 +1625,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - struct page *page; + struct folio *folio; int err; err = -EIO; @@ -1638,20 +1639,20 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) goto out_err; - page = alloc_page(GFP_KERNEL); + folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; - if (!page) + if (!folio) goto out_err; - err = fuse_readlink_page(inode, page); + err = fuse_readlink_page(inode, folio); if (err) { - __free_page(page); + folio_put(folio); goto out_err; } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, &folio->page); - return page_address(page); + return folio_address(folio); out_err: return ERR_PTR(err); @@ -1680,6 +1681,8 @@ static int 
fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -2028,7 +2031,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), - fuse_get_cache_mask(inode)); + fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) @@ -2231,7 +2234,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, &folio->page); + int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f33fbce86ae0..7d92a5479998 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -436,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) + if (idx_from >= curr_index + wpa->ia.ap.num_folios) n = n->rb_right; else if (idx_to < curr_index) n = n->rb_left; @@ -483,6 +483,21 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static inline bool fuse_folio_is_writeback(struct inode *inode, + struct folio *folio) +{ + pgoff_t last = folio_next_index(folio) - 1; + return fuse_range_is_writeback(inode, folio_index(folio), last); +} + +static void fuse_wait_on_folio_writeback(struct inode *inode, + struct folio *folio) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); +} + /* * Wait for all pending writepages on the inode to finish. 
* @@ -645,17 +660,20 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(ap->pages[i]); + folio_mark_dirty_lock(ap->folios[i]); if (ap->args.is_pinned) - unpin_user_page(ap->pages[i]); + unpin_folio(ap->folios[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -725,16 +743,16 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, - unsigned int npages) + unsigned int nfolios) { struct fuse_io_args *ia; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (ia) { ia->io = io; - ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, - &ia->ap.descs); - if (!ia->ap.pages) { + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.folios) { kfree(ia); ia = NULL; } @@ -744,7 +762,7 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, static void fuse_io_free(struct fuse_io_args *ia) { - kfree(ia->ap.pages); + kfree(ia->ap.folios); kfree(ia); } @@ -754,25 +772,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -843,33 +865,33 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, * reached the client fs yet. So the hole is not present there. 
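fuse_release_user_pages() earlier in this chunk now takes the number of bytes transferred so it can invalidate the vmap alias of a vmalloc'd kvec buffer; the matching flush for the write direction is added in fuse_get_user_pages() further down. The underlying rule for vmalloc'd buffers accessed through their pages, as a condensed sketch (kvec_vmap_sync() is a made-up helper; the header providing the flush/invalidate declarations may vary by architecture):

#include <linux/mm.h>           /* is_vmalloc_addr()                      */
#include <linux/highmem.h>      /* flush_/invalidate_kernel_vmap_range()  */

static void kvec_vmap_sync(void *buf, size_t len, bool to_pages)
{
        if (!is_vmalloc_addr(buf))
                return;                 /* no aliasing concern for lowmem */

        if (to_pages)
                /* CPU stores must be visible before the pages are read. */
                flush_kernel_vmap_range(buf, len);
        else
                /* Drop stale vmap cachelines before the CPU reads data
                 * that arrived through the page mappings. */
                invalidate_kernel_vmap_range(buf, len);
}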
*/ if (!fc->writeback_cache) { - loff_t pos = page_offset(ap->pages[0]) + num_read; + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = page_offset(page); - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio); + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.num_pages = 1, - .ap.pages = &page, + .ap.num_folios = 1, + .ap.folios = &folio, .ap.descs = &desc, }; ssize_t res; u64 attr_ver; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * With the temporary pages that are used to complete writeback, we can + * have writeback that extends beyond the lifetime of the folio. So + * make sure we read a properly synced folio. */ - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); attr_ver = fuse_get_attr_version(fm->fc); @@ -887,25 +909,24 @@ static int fuse_do_readpage(struct file *file, struct page *page) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } static int fuse_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int err; err = -EIO; if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); fuse_invalidate_atime(inode); out: - unlock_page(page); + folio_unlock(folio); return err; } @@ -919,8 +940,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < ap->num_pages; i++) - mapping = ap->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_folios; i++) + mapping = ap->folios[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -934,12 +955,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_pages; i++) { - struct folio *folio = page_folio(ap->pages[i]); - - folio_end_read(folio, !err); - folio_put(folio); - } + for (i = 0; i < ap->num_folios; i++) + folio_end_read(ap->folios[i], !err); if (ia->ff) fuse_file_put(ia->ff, false); @@ -951,8 +968,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; - loff_t pos = page_offset(ap->pages[0]); - size_t count = ap->num_pages << PAGE_SHIFT; + loff_t pos = folio_pos(ap->folios[0]); + /* Currently, all folios in FUSE are one page */ + size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -963,7 +981,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - ap->descs[ap->num_pages - 1].length--; + ap->descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -985,18 
+1003,36 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int i, max_pages, nr_pages = 0; + unsigned int max_pages, nr_pages; + pgoff_t first = readahead_index(rac); + pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; + wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); + max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); - for (;;) { + /* + * This is only accurate the first time through, since readahead_folio() + * doesn't update readahead_count() from the previous folio until the + * next call. Grab nr_pages here so we know how many pages we're going + * to have to process. This means that we will exit here with + * readahead_count() == folio_nr_pages(last_folio), but we will have + * consumed all of the folios, and read_pages() will call + * readahead_folio() again which will clean up the rac. + */ + nr_pages = readahead_count(rac); + + while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; + struct folio *folio; + unsigned cur_pages = min(max_pages, nr_pages); if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1006,23 +1042,19 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - nr_pages = readahead_count(rac) - nr_pages; - if (nr_pages > max_pages) - nr_pages = max_pages; - if (nr_pages == 0) - break; - ia = fuse_io_alloc(NULL, nr_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - nr_pages = __readahead_batch(rac, ap->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - fuse_wait_on_page_writeback(inode, - readahead_index(rac) + i); - ap->descs[i].length = PAGE_SIZE; + + while (ap->num_folios < cur_pages) { + folio = readahead_folio(rac); + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = folio_size(folio); + ap->num_folios++; } - ap->num_pages = nr_pages; fuse_send_readpages(ia, rac->file); + nr_pages -= cur_pages; } } @@ -1139,8 +1171,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, bool short_write; int err; - for (i = 0; i < ap->num_pages; i++) - fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + for (i = 0; i < ap->num_folios; i++) + fuse_wait_on_folio_writeback(inode, ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1154,24 +1186,24 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; - for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; if (err) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); } else { - if (count >= PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; + if (count >= folio_size(folio) - offset) + count -= folio_size(folio) - offset; else { if (short_write) - ClearPageUptodate(page); + folio_clear_uptodate(folio); count = 0; } offset = 0; } - if (ia->write.page_locked && (i == ap->num_pages - 1)) - unlock_page(page); - put_page(page); + if (ia->write.folio_locked && (i == ap->num_folios - 1)) + folio_unlock(folio); + folio_put(folio); } return err; @@ -1185,6 +1217,7 @@ static ssize_t 
fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); + unsigned int nr_pages = 0; size_t count = 0; int err; @@ -1193,7 +1226,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, do { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); @@ -1205,27 +1238,30 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (fault_in_iov_iter_readable(ii, bytes)) break; - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + flush_dcache_folio(folio); if (!tmp) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto again; } err = 0; - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = tmp; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].length = tmp; + ap->num_folios++; + nr_pages++; count += tmp; pos += tmp; @@ -1235,18 +1271,18 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, /* If we copied full page, mark it uptodate */ if (tmp == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); } else { - ia->write.page_locked = true; + ia->write.folio_locked = true; break; } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - ap->num_pages < max_pages && offset == 0); + nr_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1280,8 +1316,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); - if (!ap->pages) { + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->folios) { err = -ENOMEM; break; } @@ -1303,7 +1339,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) err = -EIO; } } - kfree(ap->pages); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); @@ -1430,11 +1466,7 @@ writethrough: task_io_account_write(count); - err = file_remove_privs(file); - if (err) - goto out; - - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; @@ -1468,52 +1500,89 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. 
+ * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } + } + + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. + */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; } - while (nbytes < *nbytesp && ap->num_pages < max_pages) { - unsigned npages; + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; size_t start; - struct page **pt_pages; - pt_pages = &ap->pages[ap->num_pages]; - ret = iov_iter_extract_pages(ii, &pt_pages, + ret = iov_iter_extract_pages(ii, &pages, *nbytesp - nbytes, - max_pages - ap->num_pages, + max_pages - nr_pages, 0, &start); if (ret < 0) break; nbytes += ret; - ret += start; - npages = DIV_ROUND_UP(ret, PAGE_SIZE); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); - ap->descs[ap->num_pages].offset = start; - fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); - ap->num_pages += npages; - ap->descs[ap->num_pages - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } + + nr_pages += nfolios; } + kfree(pages); + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + + ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) @@ -1521,6 +1590,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? 
ret : 0; @@ -1582,7 +1652,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1596,7 +1666,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; @@ -1650,7 +1720,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); @@ -1664,7 +1734,6 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; bool exclusive; @@ -1672,9 +1741,11 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) res = generic_write_checks(iocb, from); if (res > 0) { task_io_account_write(res); - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); @@ -1760,21 +1831,21 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + folio_put(ap->folios[i]); fuse_file_put(wpa->ia.ff, false); - kfree(ap->pages); + kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct page *page) +static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } @@ -1785,8 +1856,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_inode *fi = get_fuse_inode(inode); int i; - for (i = 0; i < ap->num_pages; i++) - fuse_writepage_finish_stat(inode, ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + fuse_writepage_finish_stat(inode, ap->folios[i]); wake_up(&fi->page_waitq); } @@ -1801,7 +1872,8 @@ __acquires(fi->lock) struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; - __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + /* Currently, all folios in FUSE are one page */ + __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; int err; fi->writectr++; @@ -1841,7 +1913,8 @@ __acquires(fi->lock) for (aux = wpa->next; aux; aux = next) { next = aux->next; aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]); + fuse_writepage_finish_stat(aux->inode, + aux->ia.ap.folios[0]); fuse_writepage_free(aux); } @@ -1876,11 +1949,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, struct fuse_writepage_args *wpa) { pgoff_t idx_from = 
wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - WARN_ON(!wpa->ia.ap.num_pages); + WARN_ON(!wpa->ia.ap.num_folios); while (*p) { struct fuse_writepage_args *curr; pgoff_t curr_index; @@ -1891,7 +1964,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, WARN_ON(curr->inode != wpa->inode); curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + curr->ia.ap.num_pages) + if (idx_from >= curr_index + curr->ia.ap.num_folios) p = &(*p)->rb_right; else if (idx_to < curr_index) p = &(*p)->rb_left; @@ -2023,9 +2096,9 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) wpa = kzalloc(sizeof(*wpa), GFP_NOFS); if (wpa) { ap = &wpa->ia.ap; - ap->num_pages = 0; - ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); - if (!ap->pages) { + ap->num_folios = 0; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->folios) { kfree(wpa); wpa = NULL; } @@ -2049,19 +2122,19 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t page_index) + struct folio *tmp_folio, uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; folio_copy(tmp_folio, folio); - ap->pages[page_index] = &tmp_folio->page; - ap->descs[page_index].offset = 0; - ap->descs[page_index].length = PAGE_SIZE; + ap->folios[folio_index] = tmp_folio; + ap->descs[folio_index].offset = 0; + ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2115,7 +2188,7 @@ static int fuse_writepage_locked(struct folio *folio) goto err_writepage_args; ap = &wpa->ia.ap; - ap->num_pages = 1; + ap->num_folios = 1; folio_start_writeback(folio); fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); @@ -2143,32 +2216,32 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; - unsigned int max_pages; + struct folio **orig_folios; + unsigned int max_folios; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct fuse_conn *fc = get_fuse_conn(data->inode); - struct page **pages; - struct fuse_page_desc *descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, data->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, + max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), fc->max_pages); - WARN_ON(npages <= data->max_pages); + WARN_ON(nfolios <= data->max_folios); - pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); - if (!pages) + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) return false; - memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); - memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); - kfree(ap->pages); - ap->pages = pages; + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->descs, 
sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; ap->descs = descs; - data->max_pages = npages; + data->max_folios = nfolios; return true; } @@ -2178,7 +2251,7 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; + int num_folios = wpa->ia.ap.num_folios; int i; spin_lock(&fi->lock); @@ -2186,8 +2259,8 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + for (i = 0; i < num_folios; i++) + folio_end_writeback(data->orig_folios[i]); } /* @@ -2198,15 +2271,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * swapping the new temp page with the old one. */ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) + struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; struct fuse_writepage_args *old_wpa; struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; + WARN_ON(new_ap->num_folios != 0); + new_ap->num_folios = 1; spin_lock(&fi->lock); old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); @@ -2220,9 +2293,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, WARN_ON(tmp->inode != new_wpa->inode); curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + if (curr_index == folio->index) { + WARN_ON(tmp->ia.ap.num_folios != 1); + swap(tmp->ia.ap.folios[0], new_ap->folios[0]); break; } } @@ -2235,18 +2308,19 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]); + fuse_writepage_finish_stat(new_wpa->inode, + folio); fuse_writepage_free(new_wpa); } return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_pages); + WARN_ON(!ap->num_folios); /* * Being under writeback is unlikely but possible. For example direct @@ -2254,23 +2328,23 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, * the pages are faulted with get_user_pages(), and then after the read * completed. */ - if (fuse_page_is_writeback(data->inode, page->index)) + if (fuse_folio_is_writeback(data->inode, folio)) return true; /* Reached max pages */ - if (ap->num_pages == fc->max_pages) + if (ap->num_folios == fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? 
*/ - if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) return true; return false; @@ -2288,7 +2362,14 @@ static int fuse_writepages_fill(struct folio *folio, struct folio *tmp_folio; int err; - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if (!data->ff) { + err = -EIO; + data->ff = fuse_write_file_get(fi); + if (!data->ff) + goto out_unlock; + } + + if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2307,7 +2388,7 @@ static int fuse_writepages_fill(struct folio *folio, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. + * request to the fi->writepages list and increment ap->num_folios. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ @@ -2319,13 +2400,13 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } fuse_file_get(wpa->ia.ff); - data->max_pages = 1; + data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages); - data->orig_pages[ap->num_pages] = &folio->page; + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); + data->orig_folios[ap->num_folios] = folio; err = 0; if (data->wpa) { @@ -2334,9 +2415,9 @@ static int fuse_writepages_fill(struct folio *folio, * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - ap->num_pages++; + ap->num_folios++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + } else if (fuse_writepage_add(wpa, folio)) { data->wpa = wpa; } else { folio_end_writeback(folio); @@ -2351,13 +2432,13 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; + err = -EIO; if (fuse_is_bad(inode)) - return -EIO; + goto out; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) @@ -2365,26 +2446,25 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; - data.ff = fuse_write_file_get(fi); - if (!data.ff) - return -EIO; + data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) + data.orig_folios = kcalloc(fc->max_pages, + sizeof(struct folio *), + GFP_NOFS); + if (!data.orig_folios) goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_pages); + WARN_ON(!data.wpa->ia.ap.num_folios); fuse_writepages_send(&data); } + if (data.ff) + fuse_file_put(data.ff, false); - kfree(data.orig_pages); + kfree(data.orig_folios); out: - fuse_file_put(data.ff, false); return err; } @@ -2423,7 +2503,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, &folio->page); + err = fuse_do_readfolio(file, folio); if (err) goto cleanup; success: @@ -2512,17 +2592,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t 
fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); return VM_FAULT_LOCKED; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6cc3d552b13..74744c6f2860 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -35,9 +35,6 @@ /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 -/** Maximum of max_pages received in init_out */ -#define FUSE_MAX_MAX_PAGES 256 - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -47,6 +44,9 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/** Maximum of max_pages received in init_out */ +extern unsigned int fuse_max_pages_limit; + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -285,8 +285,8 @@ struct fuse_arg { void *value; }; -/** FUSE page descriptor */ -struct fuse_page_desc { +/** FUSE folio descriptor */ +struct fuse_folio_desc { unsigned int length; unsigned int offset; }; @@ -309,16 +309,19 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool is_pinned:1; + bool invalidate_vmap:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { struct fuse_args args; - struct page **pages; - struct fuse_page_desc *descs; - unsigned int num_pages; + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int num_folios; }; struct fuse_release_args { @@ -857,6 +860,9 @@ struct fuse_conn { /** Passthrough support for read/write IO */ unsigned int passthrough:1; + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -884,6 +890,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Version counter for evict inode */ + atomic64_t evict_ctr; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -978,6 +987,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc) +{ + return atomic64_read(&fc->evict_ctr); +} + static inline bool fuse_stale_inode(const struct inode *inode, int generation, struct fuse_attr *attr) { @@ -995,25 +1009,25 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } -static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) +static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, + struct fuse_folio_desc **desc) { - struct page **pages; + struct folio **folios; - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); + folios = kzalloc(nfolios * (sizeof(struct folio *) + + sizeof(struct 
fuse_folio_desc)), flags); + *desc = (void *) (folios + nfolios); - return pages; + return folios; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) +static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs, + unsigned int index, + unsigned int nr_folios) { int i; - for (i = index; i < index + nr_pages; i++) + for (i = index; i < index + nr_folios; i++) descs[i].length = PAGE_SIZE - descs[i].offset; } @@ -1037,7 +1051,8 @@ extern const struct dentry_operations fuse_root_dentry_operations; */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + u64 evict_ctr); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); @@ -1062,7 +1077,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; - bool page_locked; + bool folio_locked; } write; }; struct fuse_args_pages ap; @@ -1127,7 +1142,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask); + u64 attr_valid, u32 cache_mask, + u64 evict_ctr); u32 fuse_get_cache_mask(struct inode *inode); @@ -1480,4 +1496,12 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); +#ifdef CONFIG_SYSCTL +extern int fuse_sysctl_register(void); +extern void fuse_sysctl_unregister(void); +#else +#define fuse_sysctl_register() (0) +#define fuse_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd3321e29a3e..3ce4f4e81d09 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -35,6 +35,8 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); +unsigned int fuse_max_pages_limit = 256; + unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); @@ -173,6 +175,14 @@ static void fuse_evict_inode(struct inode *inode) fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when + * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(fi->iocachectr != 0); @@ -206,17 +216,30 @@ static ino_t fuse_squash_ino(u64 ino64) void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask) + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. 
+ * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - /* Clear basic stats from invalid mask */ - set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -295,9 +318,9 @@ u32 fuse_get_cache_mask(struct inode *inode) return STATX_MTIME | STATX_CTIME | STATX_SIZE; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - struct fuse_statx *sx, - u64 attr_valid, u64 attr_version) +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -331,7 +354,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode_get_mtime(inode); - fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -372,6 +396,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { @@ -426,7 +457,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; @@ -487,8 +519,8 @@ retry: fi->nlookup++; spin_unlock(&fi->lock); done: - fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); - + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } @@ -940,11 +972,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_pages_limit = fuse_max_pages_limit; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -1001,7 +1034,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -1610,7 +1643,8 @@ static int fuse_fill_super_submount(struct super_block *sb, return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); - root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, 
+ fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. fuse_iget() does @@ -2063,8 +2097,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -2075,6 +2115,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 572ce8a82ceb..2d9abf48828f 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -10,6 +10,8 @@ #include <linux/fileattr.h> #include <linux/fsverity.h> +#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256 + static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) { @@ -140,7 +142,7 @@ static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, { struct fsverity_enable_arg enable; struct fsverity_enable_arg __user *uarg = (void __user *)arg; - const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE; + const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE; if (copy_from_user(&enable, uarg, sizeof(enable))) return -EFAULT; @@ -249,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) + if (!ap.folios || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. 
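For illustration only (not part of the patch): the fuse hunks above drop the compile-time FUSE_MAX_MAX_PAGES cap in favour of a module-wide fuse_max_pages_limit, which the new fs/fuse/sysctl.c (added further down in this diff) exposes as fs/fuse/max_pages_limit and which fuse_conn_init() copies into fc->max_pages_limit. A minimal sketch of how a server-advertised max_pages value would then be bounded; the exact clamping site during INIT negotiation is assumed here and is not shown in these hunks:

/* Editor's sketch, assuming the INIT reply handler clamps roughly like this */
static unsigned int fuse_clamp_max_pages(struct fuse_conn *fc,
					 unsigned int advertised)
{
	/* at least one page, at most the (now tunable) per-connection limit */
	return clamp_t(unsigned int, advertised, 1, fc->max_pages_limit);
}

Because fc->max_pages_limit is sampled in fuse_conn_init(), raising /proc/sys/fs/fuse/max_pages_limit only affects connections created afterwards.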
@@ -304,14 +306,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fm->fc->max_pages) goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) + while (ap.num_folios < max_pages) { + ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0); + if (!ap.folios[ap.num_folios]) goto out; - ap.num_pages++; + ap.num_folios++; } - /* okay, let's send it to the client */ ap.args.opcode = FUSE_IOCTL; ap.args.nodeid = ff->nodeid; @@ -325,8 +326,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } @@ -364,7 +365,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_local_page(ap.pages[0]); + vaddr = kmap_local_folio(ap.folios[0], 0); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -392,17 +393,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); + while (ap.num_folios) + folio_put(ap.folios[--ap.num_folios]); + kfree(ap.folios); return err ? 
err : outarg.result; } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 62aee8289d11..607ef735ad4a 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file) fuse_invalidate_atime(inode); } -static void fuse_file_modified(struct file *file) +static void fuse_passthrough_end_write(struct kiocb *iocb, ssize_t ret) { - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + fuse_write_update_attr(inode, iocb->ki_pos, ret); } ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -34,7 +34,6 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, .accessed = fuse_file_accessed, }; @@ -62,8 +61,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, @@ -88,15 +86,20 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, struct file *backing_file = fuse_file_passthrough(ff); struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = in, .accessed = fuse_file_accessed, }; + struct kiocb iocb; + ssize_t ret; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, - backing_file, ppos ? *ppos : 0, len, flags); + backing_file, *ppos, len, flags); - return backing_file_splice_read(backing_file, ppos, pipe, len, flags, - &ctx); + init_sync_kiocb(&iocb, in); + iocb.ki_pos = *ppos; + ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, &ctx); + *ppos = iocb.ki_pos; + + return ret; } ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, @@ -109,16 +112,18 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, ssize_t ret; struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = out, - .end_write = fuse_file_modified, + .end_write = fuse_passthrough_end_write, }; + struct kiocb iocb; pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, - backing_file, ppos ? 
*ppos : 0, len, flags); + backing_file, *ppos, len, flags); inode_lock(inode); - ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, - &ctx); + init_sync_kiocb(&iocb, out); + iocb.ki_pos = *ppos; + ret = backing_file_splice_write(pipe, backing_file, &iocb, len, flags, &ctx); + *ppos = iocb.ki_pos; inode_unlock(inode); return ret; @@ -130,7 +135,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) struct file *backing_file = fuse_file_passthrough(ff); struct backing_file_ctx ctx = { .cred = ff->cred, - .user_file = file, .accessed = fuse_file_accessed, }; @@ -234,7 +238,6 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) goto out; backing_sb = file_inode(file)->i_sb; - pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth); res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) goto out_fput; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c8..17ce9636a2b1 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, - u64 attr_version) + u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; @@ -233,7 +233,7 @@ retry: } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), - attr_version); + attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid) } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) + struct dir_context *ctx, u64 attr_version, + u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, buf += reclen; nbytes -= reclen; - ret = fuse_direntplus_link(file, direntplus, attr_version); + ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } @@ -331,26 +332,27 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct page *page; + struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; - struct fuse_page_desc desc = { .length = PAGE_SIZE }; - u64 attr_version = 0; + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + u64 attr_version = 0, evict_ctr = 0; bool locked; - page = alloc_page(GFP_KERNEL); - if (!page) + folio = folio_alloc(GFP_KERNEL, 0); + if (!folio) return -ENOMEM; plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &page; + ap->num_folios = 1; + ap->folios = &folio; ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -367,15 +369,16 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(page_address(page), res, - file, ctx, attr_version); + res = 
parse_dirplusfile(folio_address(folio), res, + file, ctx, attr_version, + evict_ctr); } else { - res = parse_dirfile(page_address(page), res, file, + res = parse_dirfile(folio_address(folio), res, file, ctx); } } - __free_page(page); + folio_put(folio); fuse_invalidate_atime(inode); return res; } diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c new file mode 100644 index 000000000000..b272bb333005 --- /dev/null +++ b/fs/fuse/sysctl.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/fuse/fuse_sysctl.c + * + * Sysctl interface to fuse parameters + */ +#include <linux/sysctl.h> + +#include "fuse_i.h" + +static struct ctl_table_header *fuse_table_header; + +/* Bound by fuse_init_out max_pages, which is a u16 */ +static unsigned int sysctl_fuse_max_pages_limit = 65535; + +static struct ctl_table fuse_sysctl_table[] = { + { + .procname = "max_pages_limit", + .data = &fuse_max_pages_limit, + .maxlen = sizeof(fuse_max_pages_limit), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &sysctl_fuse_max_pages_limit, + }, +}; + +int fuse_sysctl_register(void) +{ + fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table); + if (!fuse_table_header) + return -ENOMEM; + return 0; +} + +void fuse_sysctl_unregister(void) +{ + unregister_sysctl_table(fuse_table_header); + fuse_table_header = NULL; +} diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6404a189e989..82afe78ec542 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,7 +97,8 @@ struct virtio_fs_req_work { }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight); + struct fuse_req *req, bool in_flight, + gfp_t gfp); static const struct constant_table dax_param_enums[] = { {"always", FUSE_DAX_ALWAYS }, @@ -242,7 +243,7 @@ static ssize_t cpu_list_show(struct kobject *kobj, qid = fsvq->vq->index; for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid - VQ_REQUEST)) { + if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid)) { if (first) ret = snprintf(buf + pos, size - pos, "%u", cpu); else @@ -521,6 +522,7 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) return -EINVAL; } + dev_info(&vdev->dev, "discovered new tag: %s\n", fs->tag); return 0; } @@ -575,6 +577,8 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) /* Dispatch pending requests */ while (1) { + unsigned int flags; + spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->queued_reqs, struct fuse_req, list); @@ -585,7 +589,9 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - ret = virtio_fs_enqueue_req(fsvq, req, true); + flags = memalloc_nofs_save(); + ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); + memalloc_nofs_restore(flags); if (ret < 0) { if (ret == -ENOSPC) { spin_lock(&fsvq->lock); @@ -686,7 +692,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } /* Allocate and copy args into req->argbuf */ -static int copy_args_to_argbuf(struct fuse_req *req) +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) { struct fuse_args *args = req->args; unsigned int offset = 0; @@ -700,7 +706,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + fuse_len_args(num_out, args->out_args); - req->argbuf = kmalloc(len, GFP_ATOMIC); + req->argbuf = kmalloc(len, gfp); if (!req->argbuf) 
return -ENOMEM; @@ -760,7 +766,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; - struct page *page; + struct folio *folio; /* * TODO verify that server properly follows FUSE protocol @@ -772,12 +778,12 @@ static void virtio_fs_request_complete(struct fuse_req *req, if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { thislen = ap->descs[i].length; if (len < thislen) { WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); + folio = ap->folios[i]; + folio_zero_segment(folio, len, thislen); len = 0; } else { len -= thislen; @@ -870,23 +876,23 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f goto fallback; for_each_cpu(cpu, mask) - fs->mq_map[cpu] = q; + fs->mq_map[cpu] = q + VQ_REQUEST; } return; fallback: /* Attempt to map evenly in groups over the CPUs */ masks = group_cpus_evenly(fs->num_request_queues); - /* If even this fails we default to all CPUs use queue zero */ + /* If even this fails we default to all CPUs use first request queue */ if (!masks) { for_each_possible_cpu(cpu) - fs->mq_map[cpu] = 0; + fs->mq_map[cpu] = VQ_REQUEST; return; } for (q = 0; q < fs->num_request_queues; q++) { for_each_cpu(cpu, &masks[q]) - fs->mq_map[cpu] = q; + fs->mq_map[cpu] = q + VQ_REQUEST; } kfree(masks); } @@ -1267,15 +1273,15 @@ static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *r } /* Count number of scatter-gather elements required */ -static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { - this_len = min(page_descs[i].length, total_len); + for (i = 0; i < num_folios && total_len; i++) { + this_len = min(folio_descs[i].length, total_len); total_len -= this_len; } @@ -1294,8 +1300,8 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } if (!test_bit(FR_ISREPLY, &req->flags)) @@ -1308,27 +1314,27 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } return total_sgs; } -/* Add pages to scatter-gather list and return number of elements used */ -static unsigned int sg_init_fuse_pages(struct scatterlist *sg, - struct page **pages, - struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +/* Add folios to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_folios(struct scatterlist *sg, + struct folio **folios, + struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { + for (i = 0; i < num_folios && total_len; i++) { 
sg_init_table(&sg[i], 1); - this_len = min(page_descs[i].length, total_len); - sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + this_len = min(folio_descs[i].length, total_len); + sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset); total_len -= this_len; } @@ -1353,10 +1359,10 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) - total_sgs += sg_init_fuse_pages(&sg[total_sgs], - ap->pages, ap->descs, - ap->num_pages, - args[numargs - 1].size); + total_sgs += sg_init_fuse_folios(&sg[total_sgs], + ap->folios, ap->descs, + ap->num_folios, + args[numargs - 1].size); if (len_used) *len_used = len; @@ -1366,7 +1372,8 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, /* Add a request to a virtqueue and kick the device */ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight) + struct fuse_req *req, bool in_flight, + gfp_t gfp) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; @@ -1387,8 +1394,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { - sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); - sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); if (!sgs || !sg) { ret = -ENOMEM; goto out; @@ -1396,7 +1403,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } /* Use a bounce buffer since stack args cannot be mapped */ - ret = copy_args_to_argbuf(req); + ret = copy_args_to_argbuf(req, gfp); if (ret < 0) goto out; @@ -1481,7 +1488,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) clear_bit(FR_PENDING, &req->flags); fs = fiq->priv; - queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()]; + queue_id = fs->mq_map[raw_smp_processor_id()]; pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n", __func__, req->in.h.opcode, req->in.h.unique, @@ -1490,7 +1497,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) queue_id); fsvq = &fs->vqs[queue_id]; - ret = virtio_fs_enqueue_req(fsvq, req, false); + ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); if (ret < 0) { if (ret == -ENOSPC) { /* @@ -1691,6 +1698,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d418d8b5367f..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f7dd64856c9b..c9bb3be21d2b 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -251,6 +251,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = filemap_fdatawait(inode->i_mapping); if (error) goto out; + truncate_inode_pages(inode->i_mapping, 0); if (new_flags & GFS2_DIF_JDATA) gfs2_ordered_del_inode(ip); } @@ -1586,6 +1587,7 @@ const struct 
file_operations gfs2_file_fops = { .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, + .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { @@ -1598,6 +1600,7 @@ const struct file_operations gfs2_dir_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, + .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 269c3bc7fced..8c4c1f871a88 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -34,8 +34,8 @@ #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> -#include <linux/fdtable.h> #include <linux/file.h> +#include <linux/random.h> #include "gfs2.h" #include "incore.h" @@ -563,11 +563,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) gl->gl_tchange = jiffies; } -static void gfs2_set_demote(struct gfs2_glock *gl) +static void gfs2_set_demote(int nr, struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - set_bit(GLF_DEMOTE, &gl->gl_flags); + set_bit(nr, &gl->gl_flags); smp_mb(); wake_up(&sdp->sd_async_glock_wait); } @@ -959,20 +959,22 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) gfs2_holder_uninit(&gh); } -static bool gfs2_try_evict(struct gfs2_glock *gl) +static void gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; - bool evicted = false; /* * If there is contention on the iopen glock and we have an inode, try - * to grab and release the inode so that it can be evicted. This will - * allow the remote node to go ahead and delete the inode without us - * having to do it, which will avoid rgrp glock thrashing. + * to grab and release the inode so that it can be evicted. The + * GIF_DEFER_DELETE flag indicates to gfs2_evict_inode() that the inode + * should not be deleted locally. This will allow the remote node to + * go ahead and delete the inode without us having to do it, which will + * avoid rgrp glock thrashing. * * The remote node is likely still holding the corresponding inode * glock, so it will run before we get to verify that the delete has - * happened below. + * happened below. (Verification is triggered by the call to + * gfs2_queue_verify_delete() in gfs2_evict_inode().) 
*/ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; @@ -980,8 +982,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { - gl->gl_no_formal_ino = ip->i_no_formal_ino; - set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + wait_on_inode(&ip->i_inode); + if (is_bad_inode(&ip->i_inode)) { + iput(&ip->i_inode); + ip = NULL; + } + } + if (ip) { + set_bit(GIF_DEFER_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); @@ -989,7 +997,7 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip) { - clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + clear_bit(GIF_DEFER_DELETE, &ip->i_flags); if (!igrab(&ip->i_inode)) ip = NULL; } @@ -998,9 +1006,7 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); } - evicted = !ip; } - return evicted; } bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) @@ -1009,18 +1015,18 @@ bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) return false; - return queue_delayed_work(sdp->sd_delete_wq, - &gl->gl_delete, 0); + return !mod_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } -static bool gfs2_queue_verify_evict(struct gfs2_glock *gl) +bool gfs2_queue_verify_delete(struct gfs2_glock *gl, bool later) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + unsigned long delay; - if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) + if (test_and_set_bit(GLF_VERIFY_DELETE, &gl->gl_flags)) return false; - return queue_delayed_work(sdp->sd_delete_wq, - &gl->gl_delete, 5 * HZ); + delay = later ? HZ + get_random_long() % (HZ * 9) : 0; + return queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, delay); } static void delete_work_func(struct work_struct *work) @@ -1028,43 +1034,21 @@ static void delete_work_func(struct work_struct *work) struct delayed_work *dwork = to_delayed_work(work); struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct inode *inode; - u64 no_addr = gl->gl_name.ln_number; + bool verify_delete = test_and_clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); - if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) { - /* - * If we can evict the inode, give the remote node trying to - * delete the inode some time before verifying that the delete - * has happened. Otherwise, if we cause contention on the inode glock - * immediately, the remote node will think that we still have - * the inode in use, and so it will give up waiting. - * - * If we can't evict the inode, signal to the remote node that - * the inode is still in use. We'll later try to delete the - * inode locally in gfs2_evict_inode. - * - * FIXME: We only need to verify that the remote node has - * deleted the inode because nodes before this remote delete - * rework won't cooperate. At a later time, when we no longer - * care about compatibility with such nodes, we can skip this - * step entirely. 
- */ - if (gfs2_try_evict(gl)) { - if (test_bit(SDF_KILL, &sdp->sd_flags)) - goto out; - if (gfs2_queue_verify_evict(gl)) - return; - } - goto out; - } + if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) + gfs2_try_evict(gl); + + if (verify_delete) { + u64 no_addr = gl->gl_name.ln_number; + struct inode *inode; - if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) { inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && !test_bit(SDF_KILL, &sdp->sd_flags) && - gfs2_queue_verify_evict(gl)) + gfs2_queue_verify_delete(gl, true)) return; } else { d_prune_aliases(inode); @@ -1072,7 +1056,6 @@ static void delete_work_func(struct work_struct *work) } } -out: gfs2_glock_put(gl); } @@ -1101,7 +1084,7 @@ static void glock_work_func(struct work_struct *work) if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); - gfs2_set_demote(gl); + gfs2_set_demote(GLF_DEMOTE, gl); } } run_queue(gl, 0); @@ -1443,10 +1426,7 @@ out: static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { - if (delay) - set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); - else - gfs2_set_demote(gl); + gfs2_set_demote(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; @@ -1636,12 +1616,6 @@ int gfs2_glock_poll(struct gfs2_holder *gh) return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } -static inline bool needs_demote(struct gfs2_glock *gl) -{ - return (test_bit(GLF_DEMOTE, &gl->gl_flags) || - test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); -} - static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; @@ -1650,8 +1624,8 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh) /* * This holder should not be cached, so mark it for demote. - * Note: this should be done before the check for needs_demote - * below. + * Note: this should be done before the glock_needs_demote + * check below. */ if (gh->gh_flags & GL_NOCACHE) request_demote(gl, LM_ST_UNLOCKED, 0, false); @@ -1664,7 +1638,7 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh) * If there hasn't been a demote request we are done. * (Let the remaining holders, if any, keep holding it.) 
*/ - if (!needs_demote(gl)) { + if (!glock_needs_demote(gl)) { if (list_empty(&gl->gl_holders)) fast_path = 1; } @@ -2118,7 +2092,7 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) void gfs2_cancel_delete_work(struct gfs2_glock *gl) { clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); - clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags); + clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); } @@ -2371,7 +2345,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'N'; if (test_bit(GLF_TRY_TO_EVICT, gflags)) *p++ = 'e'; - if (test_bit(GLF_VERIFY_EVICT, gflags)) + if (test_bit(GLF_VERIFY_DELETE, gflags)) *p++ = 'E'; *p = 0; return buf; @@ -2768,25 +2742,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) i->file = NULL; } - rcu_read_lock(); for(;; i->fd++) { - struct inode *inode; - - i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); + i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } - inode = file_inode(i->file); - if (inode->i_sb == i->sb) + if (file_inode(i->file)->i_sb == i->sb) break; - rcu_read_unlock(); fput(i->file); - rcu_read_lock(); } - rcu_read_unlock(); return i->file; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index adf0091cc98f..c171f745650f 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -245,6 +245,7 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); void gfs2_glock_complete(struct gfs2_glock *gl, int ret); bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); +bool gfs2_queue_verify_delete(struct gfs2_glock *gl, bool later); void gfs2_cancel_delete_work(struct gfs2_glock *gl); void gfs2_flush_delete_work(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); @@ -284,4 +285,10 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); +static inline bool glock_needs_demote(struct gfs2_glock *gl) +{ + return (test_bit(GLF_DEMOTE, &gl->gl_flags) || + test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 95d8081681dc..eb4714f299ef 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -470,7 +470,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) * Returns: errno */ -int gfs2_inode_refresh(struct gfs2_inode *ip) +static int gfs2_inode_refresh(struct gfs2_inode *ip) { struct buffer_head *dibh; int error; @@ -494,11 +494,18 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) static int inode_go_instantiate(struct gfs2_glock *gl) { struct gfs2_inode *ip = gl->gl_object; + struct gfs2_glock *io_gl; + int error; if (!ip) /* no inode to populate - read it in later */ return 0; - return gfs2_inode_refresh(ip); + error = gfs2_inode_refresh(ip); + if (error) + return error; + io_gl = ip->i_iopen_gh.gh_gl; + io_gl->gl_no_formal_ino = ip->i_no_formal_ino; + return 0; } static int inode_go_held(struct gfs2_holder *gh) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index aa4ef67a34e0..4e19cce3d906 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -329,7 +329,7 @@ enum { GLF_BLOCKING = 15, GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ - GLF_VERIFY_EVICT = 18, /* iopen glocks only */ + GLF_VERIFY_DELETE = 18, /* iopen glocks only 
*/ }; struct gfs2_glock { @@ -376,7 +376,7 @@ enum { GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, - GIF_DEFERRED_DELETE = 7, + GIF_DEFER_DELETE = 7, }; struct gfs2_inode { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1b95db2c3aac..6fbbaaad1cd0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -750,6 +750,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; gfs2_cancel_delete_work(io_gl); + io_gl->gl_no_formal_ino = ip->i_no_formal_ino; retry: error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index fd15d1c6b6fb..9e5e1622d50a 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -93,8 +93,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 no_formal_ino, unsigned int blktype); -int gfs2_inode_refresh(struct gfs2_inode *ip); - struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); int gfs2_permission(struct mnt_idmap *idmap, diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index fa5134df985f..58aeeae7ed8c 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -224,8 +224,21 @@ static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate) return -1; } +/* Taken from fs/dlm/lock.c. */ + +static bool middle_conversion(int cur, int req) +{ + return (cur == DLM_LOCK_PR && req == DLM_LOCK_CW) || + (cur == DLM_LOCK_CW && req == DLM_LOCK_PR); +} + +static bool down_conversion(int cur, int req) +{ + return !middle_conversion(cur, req) && req < cur; +} + static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, - const int req) + const int cur, const int req) { u32 lkf = 0; @@ -251,7 +264,14 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, if (!test_bit(GLF_INITIAL, &gl->gl_flags)) { lkf |= DLM_LKF_CONVERT; - if (test_bit(GLF_BLOCKING, &gl->gl_flags)) + + /* + * The DLM_LKF_QUECVT flag needs to be set for "first come, + * first served" semantics, but it must only be set for + * "upward" lock conversions or else DLM will reject the + * request as invalid. 
+ */ + if (!down_conversion(cur, req)) lkf |= DLM_LKF_QUECVT; } @@ -271,13 +291,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; - int req; + int cur, req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; int error; + cur = make_mode(gl->gl_name.ln_sbd, gl->gl_state); req = make_mode(gl->gl_name.ln_sbd, req_state); - lkf = make_flags(gl, flags, req); + lkf = make_flags(gl, flags, cur, req); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (test_bit(GLF_INITIAL, &gl->gl_flags)) { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 2e6bc77f4f81..58bc5013ca49 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -149,7 +149,7 @@ static void gfs2_qd_list_dispose(struct list_head *list) static enum lru_status gfs2_qd_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = @@ -236,8 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 0; - spin_lock_init(&qd->qd_lockref.lock); + lockref_init(&qd->qd_lockref, 0); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index f462d9cb3087..988f38dc5b2c 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -44,8 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, int ret; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ - if (capable(CAP_SYS_RESOURCE) || - sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || + capable(CAP_SYS_RESOURCE)) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 29c772816765..b14e54b38ee8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1879,7 +1879,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip */ ip = gl->gl_object; - if (ip || !gfs2_queue_try_to_evict(gl)) + if (ip || !gfs2_queue_verify_delete(gl, false)) gfs2_glock_put(gl); else found++; @@ -1987,10 +1987,8 @@ static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - u32 skip; - get_random_bytes(&skip, sizeof(skip)); - return skip % sdp->sd_rgrps; + return get_random_u32() % sdp->sd_rgrps; } static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6678060ed4d2..92a3b6ddafdc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,10 +44,10 @@ #include "xattr.h" #include "lops.h" -enum dinode_demise { - SHOULD_DELETE_DINODE, - SHOULD_NOT_DELETE_DINODE, - SHOULD_DEFER_EVICTION, +enum evict_behavior { + EVICT_SHOULD_DELETE, + EVICT_SHOULD_SKIP_DELETE, + EVICT_SHOULD_DEFER_DELETE, }; /** @@ -1030,7 +1030,7 @@ static int gfs2_drop_inode(struct inode *inode) if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + if (glock_needs_demote(gl)) clear_nlink(inode); } @@ -1045,7 +1045,7 @@ static int gfs2_drop_inode(struct inode *inode) struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); - if (!gfs2_queue_try_to_evict(gl)) + if 
(!gfs2_queue_verify_delete(gl, true)) gfs2_glock_put_async(gl); return 0; } @@ -1257,7 +1257,7 @@ static void gfs2_glock_put_eventually(struct gfs2_glock *gl) gfs2_glock_put(gl); } -static bool gfs2_upgrade_iopen_glock(struct inode *inode) +static enum evict_behavior gfs2_upgrade_iopen_glock(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1272,9 +1272,9 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) * exclusive access to the iopen glock here. * * Otherwise, the other nodes holding the lock will be notified about - * our locking request. If they do not have the inode open, they are - * expected to evict the cached inode and release the lock, allowing us - * to proceed. + * our locking request (see iopen_go_callback()). If they do not have + * the inode open, they are expected to evict the cached inode and + * release the lock, allowing us to proceed. * * Otherwise, if they cannot evict the inode, they are expected to poke * the inode glock (note: not the iopen glock). We will notice that @@ -1290,17 +1290,22 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); error = gfs2_glock_nq(gh); if (error) - return false; + return EVICT_SHOULD_SKIP_DELETE; wait_event_interruptible_timeout(sdp->sd_async_glock_wait, !test_bit(HIF_WAIT, &gh->gh_iflags) || - test_bit(GLF_DEMOTE, &ip->i_gl->gl_flags), + glock_needs_demote(ip->i_gl), 5 * HZ); if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) { gfs2_glock_dq(gh); - return false; + if (glock_needs_demote(ip->i_gl)) + return EVICT_SHOULD_SKIP_DELETE; + return EVICT_SHOULD_DEFER_DELETE; } - return gfs2_glock_holder_ready(gh) == 0; + error = gfs2_glock_holder_ready(gh); + if (error) + return EVICT_SHOULD_SKIP_DELETE; + return EVICT_SHOULD_DELETE; } /** @@ -1313,8 +1318,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) * * Returns: the fate of the dinode */ -static enum dinode_demise evict_should_delete(struct inode *inode, - struct gfs2_holder *gh) +static enum evict_behavior evict_should_delete(struct inode *inode, + struct gfs2_holder *gh) { struct gfs2_inode *ip = GFS2_I(inode); struct super_block *sb = inode->i_sb; @@ -1324,12 +1329,12 @@ static enum dinode_demise evict_should_delete(struct inode *inode, if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) goto should_delete; - if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) - return SHOULD_DEFER_EVICTION; + if (test_bit(GIF_DEFER_DELETE, &ip->i_flags)) + return EVICT_SHOULD_DEFER_DELETE; /* Deletes should never happen under memory pressure anymore. 
*/ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - return SHOULD_DEFER_EVICTION; + return EVICT_SHOULD_DEFER_DELETE; /* Must not read inode block until block type has been verified */ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); @@ -1337,34 +1342,37 @@ static enum dinode_demise evict_should_delete(struct inode *inode, glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); - return SHOULD_DEFER_EVICTION; + return EVICT_SHOULD_DEFER_DELETE; } if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) - return SHOULD_NOT_DELETE_DINODE; + return EVICT_SHOULD_SKIP_DELETE; ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); if (ret) - return SHOULD_NOT_DELETE_DINODE; + return EVICT_SHOULD_SKIP_DELETE; ret = gfs2_instantiate(gh); if (ret) - return SHOULD_NOT_DELETE_DINODE; + return EVICT_SHOULD_SKIP_DELETE; /* * The inode may have been recreated in the meantime. */ if (inode->i_nlink) - return SHOULD_NOT_DELETE_DINODE; + return EVICT_SHOULD_SKIP_DELETE; should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - if (!gfs2_upgrade_iopen_glock(inode)) { + enum evict_behavior behavior = + gfs2_upgrade_iopen_glock(inode); + + if (behavior != EVICT_SHOULD_DELETE) { gfs2_holder_uninit(&ip->i_iopen_gh); - return SHOULD_NOT_DELETE_DINODE; + return behavior; } } - return SHOULD_DELETE_DINODE; + return EVICT_SHOULD_DELETE; } /** @@ -1475,8 +1483,10 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + enum evict_behavior behavior; int ret; + gfs2_holder_mark_uninitialized(&gh); if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr) goto out; @@ -1488,11 +1498,20 @@ static void gfs2_evict_inode(struct inode *inode) if (!sdp->sd_jdesc) goto out; - gfs2_holder_mark_uninitialized(&gh); - ret = evict_should_delete(inode, &gh); - if (ret == SHOULD_DEFER_EVICTION) - goto out; - if (ret == SHOULD_DELETE_DINODE) + behavior = evict_should_delete(inode, &gh); + if (behavior == EVICT_SHOULD_DEFER_DELETE && + !test_bit(SDF_KILL, &sdp->sd_flags)) { + struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl; + + if (io_gl) { + gfs2_glock_hold(io_gl); + if (!gfs2_queue_verify_delete(io_gl, true)) + gfs2_glock_put(io_gl); + goto out; + } + behavior = EVICT_SHOULD_DELETE; + } + if (behavior == EVICT_SHOULD_DELETE) ret = evict_unlinked_inode(inode); else ret = evict_linked_inode(inode); @@ -1500,11 +1519,11 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (gfs2_holder_initialized(&gh)) - gfs2_glock_dq_uninit(&gh); if (ret && ret != GLR_TRYFAILED && ret != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); @@ -1537,11 +1556,13 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (!ip) return NULL; ip->i_no_addr = 0; + ip->i_no_formal_ino = 0; ip->i_flags = 0; ip->i_gl = NULL; gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); + ip->i_diskflags = 0; ip->i_rahead = 0; return &ip->i_inode; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eeac99765f0d..fe09c2093a93 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -15,10 +15,11 @@ 
#include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> @@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfs_remount(struct super_block *sb, int *flags, char *data) +static int hfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - *flags |= SB_NODIRATIME; - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + fc->sb_flags |= SB_NODIRATIME; + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { + + if (!(fc->sb_flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = { .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, - .remount_fs = hfs_remount, .show_options = hfs_show_options, }; @@ -188,181 +191,112 @@ enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, - opt_err }; -static const match_table_t tokens = { - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_umask, "umask=%o" }, - { opt_file_umask, "file_umask=%o" }, - { opt_dir_umask, "dir_umask=%o" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_type, "type=%s" }, - { opt_creator, "creator=%s" }, - { opt_quiet, "quiet" }, - { opt_codepage, "codepage=%s" }, - { opt_iocharset, "iocharset=%s" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32oct ("file_umask", opt_file_umask), + fsparam_u32oct ("dir_umask", opt_dir_umask), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + fsparam_string ("type", opt_type), + fsparam_string ("creator", opt_creator), + fsparam_flag ("quiet", opt_quiet), + fsparam_string ("codepage", opt_codepage), + fsparam_string ("iocharset", opt_iocharset), + {} }; -static inline int match_fourchar(substring_t *arg, u32 *result) -{ - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - /* - * parse_options() + * hfs_parse_param() * - * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger - * This function is called by hfs_read_super() to parse the mount options. + * This function is called by the vfs to parse the mount options. 
*/ -static int parse_options(char *options, struct hfs_sb_info *hsb) +static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - /* initialize the sb with defaults */ - hsb->s_uid = current_uid(); - hsb->s_gid = current_gid(); - hsb->s_file_umask = 0133; - hsb->s_dir_umask = 0022; - hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ - hsb->s_quiet = 0; - hsb->part = -1; - hsb->session = -1; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(hsb->s_uid)) { - pr_err("invalid uid %d\n", tmp); - return 0; - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(hsb->s_gid)) { - pr_err("invalid gid %d\n", tmp); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_file_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("file_umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - break; - case opt_dir_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("dir_umask requires a value\n"); - return 0; - } - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_part: - if (match_int(&args[0], &hsb->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: - if (match_int(&args[0], &hsb->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &hsb->s_type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_creator: - if (match_fourchar(&args[0], &hsb->s_creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_quiet: - hsb->s_quiet = 1; - break; - case opt_codepage: - if (hsb->nls_disk) { - pr_err("unable to change codepage\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_disk = load_nls(p); - if (!hsb->nls_disk) { - pr_err("unable to load codepage \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - case opt_iocharset: - if (hsb->nls_io) { - pr_err("unable to change iocharset\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_io = load_nls(p); - if (!hsb->nls_io) { - pr_err("unable to load iocharset \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - default: - return 0; - } - } + struct hfs_sb_info *hsb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* hfs does not honor any fs-specific options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; - if (hsb->nls_disk && !hsb->nls_io) { - hsb->nls_io = load_nls_default(); + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_uid: + hsb->s_uid = result.uid; + break; + case opt_gid: + hsb->s_gid = result.gid; + break; + case opt_umask: + hsb->s_file_umask = (umode_t)result.uint_32; + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_file_umask: + hsb->s_file_umask 
= (umode_t)result.uint_32; + break; + case opt_dir_umask: + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_part: + hsb->part = result.uint_32; + break; + case opt_session: + hsb->session = result.uint_32; + break; + case opt_type: + if (strlen(param->string) != 4) { + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_type, param->string, 4); + break; + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_creator, param->string, 4); + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + pr_err("unable to change codepage\n"); + return -EINVAL; + } + hsb->nls_disk = load_nls(param->string); + if (!hsb->nls_disk) { + pr_err("unable to load codepage \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_iocharset: + if (hsb->nls_io) { + pr_err("unable to change iocharset\n"); + return -EINVAL; + } + hsb->nls_io = load_nls(param->string); if (!hsb->nls_io) { - pr_err("unable to load default iocharset\n"); - return 0; + pr_err("unable to load iocharset \"%s\"\n", + param->string); + return -EINVAL; } + break; + default: + return -EINVAL; } - hsb->s_dir_umask &= 0777; - hsb->s_file_umask &= 0577; - return 1; + return 0; } /* @@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. */ -static int hfs_fill_super(struct super_block *sb, void *data, int silent) +static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct hfs_sb_info *sbi; + struct hfs_sb_info *sbi = HFS_SB(sb); struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; + int silent = fc->sb_flags & SB_SILENT; int res; - sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; + /* load_nls_default does not fail */ + if (sbi->nls_disk && !sbi->nls_io) + sbi->nls_io = load_nls_default(); + sbi->s_dir_umask &= 0777; + sbi->s_file_umask &= 0577; - sbi->sb = sb; - sb->s_fs_info = sbi; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); - res = -EINVAL; - if (!parse_options((char *)data, sbi)) { - pr_err("unable to parse mount options\n"); - goto bail; - } - + sbi->sb = sb; sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; @@ -419,11 +349,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { - if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { + if (fd.entrylength != sizeof(rec.dir)) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); + if (rec.type != HFS_CDR_DIR) + res = -EIO; } if (res) goto bail_hfs_find; @@ -451,18 +383,56 @@ bail: return res; } -static struct dentry *hfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfs_fill_super); +} + +static void hfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfs_context_ops = { + .parse_param = hfs_parse_param, + .get_tree = hfs_get_tree, + .reconfigure = hfs_reconfigure, + .free = hfs_free_fc, +}; + +static int hfs_init_fs_context(struct fs_context *fc) { - 
return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); + struct hfs_sb_info *hsb; + + hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!hsb) + return -ENOMEM; + + fc->s_fs_info = hsb; + fc->ops = &hfs_context_ops; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* initialize options with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + } + + return 0; } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", - .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfs_init_fs_context, }; MODULE_ALIAS_FS("hfs"); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 59ce81dca73f..2f089bff0095 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -21,6 +21,7 @@ #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <linux/fs_context.h> #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -156,6 +157,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -307,7 +309,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } @@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); -int hfsplus_parse_options_remount(char *input, int *force); -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param); int hfsplus_show_options(struct seq_file *seq, struct dentry *root); /* part_tbl.c */ diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c94a58762ad6..a66a09a56bf7 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,7 +12,8 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> @@ -23,26 +24,23 @@ enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, - opt_nodecompose, opt_decompose, - opt_barrier, opt_nobarrier, - opt_force, opt_err + opt_decompose, opt_barrier, + opt_force, }; -static const match_table_t tokens = { - { opt_creator, "creator=%s" }, - { opt_type, "type=%s" }, - { opt_umask, "umask=%o" }, - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_nls, "nls=%s" }, - { opt_decompose, "decompose" }, - { opt_nodecompose, "nodecompose" }, - { opt_barrier, "barrier" }, - { opt_nobarrier, "nobarrier" }, - { opt_force, "force" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_string ("creator", opt_creator), + fsparam_string ("type", opt_type), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + 
fsparam_string ("nls", opt_nls), + fsparam_flag_no ("decompose", opt_decompose), + fsparam_flag_no ("barrier", opt_barrier), + fsparam_flag ("force", opt_force), + {} }; /* Initialize an options object to reasonable defaults */ @@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->session = -1; } -/* convert a "four byte character" to a 32 bit int with error checks */ -static inline int match_fourchar(substring_t *arg, u32 *result) +/* Parse options from mount. Returns nonzero errno on failure */ +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - -int hfsplus_parse_options_remount(char *input, int *force) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!input) - return 1; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_force: - *force = 1; - break; - default: - break; + struct hfsplus_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* + * Only the force option is examined during remount, all others + * are ignored. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + strncmp(param->key, "force", 5)) + return 0; + + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; } - } - - return 1; -} - -/* Parse options from mount. Returns 0 on failure */ -/* input is the options passed to mount() as a string */ -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - if (!input) - goto done; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_creator: - if (match_fourchar(&args[0], &sbi->creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &sbi->type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - sbi->umask = (umode_t)tmp; - break; - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(sbi->uid)) { - pr_err("invalid uid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_UID, &sbi->flags); - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(sbi->gid)) { - pr_err("invalid gid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_GID, &sbi->flags); - } - break; - case opt_part: - if (match_int(&args[0], &sbi->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: - if (match_int(&args[0], &sbi->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_nls: - if (sbi->nls) { - pr_err("unable to change nls mapping\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - sbi->nls = load_nls(p); - if (!sbi->nls) { - pr_err("unable to load nls mapping \"%s\"\n", - p); - kfree(p); - return 0; - } 
- kfree(p); - break; - case opt_decompose: - clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_nodecompose: + memcpy(&sbi->creator, param->string, 4); + break; + case opt_type: + if (strlen(param->string) != 4) { + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&sbi->type, param->string, 4); + break; + case opt_umask: + sbi->umask = (umode_t)result.uint_32; + break; + case opt_uid: + sbi->uid = result.uid; + set_bit(HFSPLUS_SB_UID, &sbi->flags); + break; + case opt_gid: + sbi->gid = result.gid; + set_bit(HFSPLUS_SB_GID, &sbi->flags); + break; + case opt_part: + sbi->part = result.uint_32; + break; + case opt_session: + sbi->session = result.uint_32; + break; + case opt_nls: + if (sbi->nls) { + pr_err("unable to change nls mapping\n"); + return -EINVAL; + } + sbi->nls = load_nls(param->string); + if (!sbi->nls) { + pr_err("unable to load nls mapping \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_decompose: + if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_barrier: - clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_nobarrier: + else + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_force: - set_bit(HFSPLUS_SB_FORCE, &sbi->flags); - break; - default: - return 0; - } - } - -done: - if (!sbi->nls) { - /* try utf8 first, as this is the old default behaviour */ - sbi->nls = load_nls("utf8"); - if (!sbi->nls) - sbi->nls = load_nls_default(); - if (!sbi->nls) - return 0; + else + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return -EINVAL; } - return 1; + return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 97920202790f..948b8aaee33e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -14,6 +14,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +static int hfsplus_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; - int force = 0; - - if (!hfsplus_parse_options_remount(data, &force)) - return -EINVAL; + if (!(fc->sb_flags & SB_RDONLY)) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. 
leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; - } else if (force) { + fc->sb_flags |= SB_RDONLY; + } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = { .put_super = hfsplus_put_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, - .remount_fs = hfsplus_remount, .show_options = hfsplus_show_options, }; -static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; - struct hfsplus_sb_info *sbi; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfsplus_cat_entry entry; struct hfs_find_data fd; struct inode *root, *inode; struct qstr str; - struct nls_table *nls = NULL; + struct nls_table *nls; u64 last_fs_block, last_fs_page; + int silent = fc->sb_flags & SB_SILENT; int err; - err = -ENOMEM; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out; - - sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); - hfsplus_fill_defaults(sbi); err = -EINVAL; - if (!hfsplus_parse_options(data, sbi)) { - pr_err("unable to parse mount options\n"); - goto out_unload_nls; + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); } /* temporarily use utf8 to correctly find the hidden dir below */ @@ -616,7 +611,6 @@ out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); kfree(sbi); -out: return err; } @@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static struct dentry *hfsplus_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfsplus_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfsplus_fill_super); +} + +static void hfsplus_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfsplus_context_ops = { + .parse_param = hfsplus_parse_param, + .get_tree = hfsplus_get_tree, + .reconfigure = hfsplus_reconfigure, + .free = hfsplus_free_fc, +}; + +static int hfsplus_init_fs_context(struct fs_context *fc) +{ + struct hfsplus_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + hfsplus_fill_defaults(sbi); + + fc->s_fs_info = sbi; + fc->ops = &hfsplus_context_ops; + + return 0; } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", - .mount = hfsplus_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfsplus_init_fs_context, }; MODULE_ALIAS_FS("hfsplus"); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 9592ffcb44e5..74801911bc1c 100644 --- a/fs/hfsplus/wrapper.c +++ 
b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 6d1cf2436ead..7e51d2cec64b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -57,6 +57,7 @@ static int __init hostfs_args(char *options, int *add) { char *ptr; + *add = 0; ptr = strchr(options, ','); if (ptr != NULL) *ptr++ = '\0'; @@ -471,8 +472,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!*foliop) - return -ENOMEM; + if (IS_ERR(*foliop)) + return PTR_ERR(*foliop); return 0; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index e73717daa5f9..27567920abe4 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,7 +9,8 @@ #include "hpfs_fn.h" #include <linux/module.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> @@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) hpfs_sb(s)->sb_was_error = 1; } -/* +/* * A little trick to detect cycles in many hpfs structures and don't let the * kernel crash on corrupted filesystem. When first called, set c2 to 0. * @@ -272,146 +273,70 @@ static void destroy_inodecache(void) kmem_cache_destroy(hpfs_inode_cachep); } -/* - * A tiny parser for option strings, stolen from dosfs. - * Stolen again from read-only hpfs. - * And updated for table-driven option parsing. - */ - enum { - Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, - Opt_check_none, Opt_check_normal, Opt_check_strict, - Opt_err_cont, Opt_err_ro, Opt_err_panic, - Opt_eas_no, Opt_eas_ro, Opt_eas_rw, - Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, - Opt_timeshift, Opt_err, + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, + Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; -static const match_table_t tokens = { - {Opt_help, "help"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%o"}, - {Opt_case_lower, "case=lower"}, - {Opt_case_asis, "case=asis"}, - {Opt_check_none, "check=none"}, - {Opt_check_normal, "check=normal"}, - {Opt_check_strict, "check=strict"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_err_panic, "errors=panic"}, - {Opt_eas_no, "eas=no"}, - {Opt_eas_ro, "eas=ro"}, - {Opt_eas_rw, "eas=rw"}, - {Opt_chkdsk_no, "chkdsk=no"}, - {Opt_chkdsk_errors, "chkdsk=errors"}, - {Opt_chkdsk_always, "chkdsk=always"}, - {Opt_timeshift, "timeshift=%d"}, - {Opt_err, NULL}, +static const struct constant_table hpfs_param_case[] = { + {"asis", 0}, + {"lower", 1}, + {} }; -static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, - int *lowercase, int *eas, int *chk, int *errs, - int *chkdsk, int *timeshift) -{ - char *p; - int option; +static const struct constant_table hpfs_param_check[] = { + {"none", 0}, + {"normal", 1}, + {"strict", 2}, + {} +}; - if (!opts) - return 1; +static const struct constant_table hpfs_param_err[] = { + {"continue", 0}, + {"remount-ro", 1}, + {"panic", 2}, + {} +}; - /*pr_info("Parsing opts: '%s'\n",opts);*/ - - while ((p = strsep(&opts, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case 
Opt_help: - return 2; - case Opt_uid: - if (match_int(args, &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - break; - case Opt_gid: - if (match_int(args, &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - break; - case Opt_umask: - if (match_octal(args, &option)) - return 0; - *umask = option; - break; - case Opt_case_lower: - *lowercase = 1; - break; - case Opt_case_asis: - *lowercase = 0; - break; - case Opt_check_none: - *chk = 0; - break; - case Opt_check_normal: - *chk = 1; - break; - case Opt_check_strict: - *chk = 2; - break; - case Opt_err_cont: - *errs = 0; - break; - case Opt_err_ro: - *errs = 1; - break; - case Opt_err_panic: - *errs = 2; - break; - case Opt_eas_no: - *eas = 0; - break; - case Opt_eas_ro: - *eas = 1; - break; - case Opt_eas_rw: - *eas = 2; - break; - case Opt_chkdsk_no: - *chkdsk = 0; - break; - case Opt_chkdsk_errors: - *chkdsk = 1; - break; - case Opt_chkdsk_always: - *chkdsk = 2; - break; - case Opt_timeshift: - { - int m = 1; - char *rhs = args[0].from; - if (!rhs || !*rhs) - return 0; - if (*rhs == '-') m = -1; - if (*rhs == '+' || *rhs == '-') rhs++; - *timeshift = simple_strtoul(rhs, &rhs, 0) * m; - if (*rhs) - return 0; - break; - } - default: - return 0; - } - } - return 1; -} +static const struct constant_table hpfs_param_eas[] = { + {"no", 0}, + {"ro", 1}, + {"rw", 2}, + {} +}; + +static const struct constant_table hpfs_param_chkdsk[] = { + {"no", 0}, + {"errors", 1}, + {"always", 2}, + {} +}; + +static const struct fs_parameter_spec hpfs_param_spec[] = { + fsparam_flag ("help", Opt_help), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_enum ("case", Opt_case, hpfs_param_case), + fsparam_enum ("check", Opt_check, hpfs_param_check), + fsparam_enum ("errors", Opt_err, hpfs_param_err), + fsparam_enum ("eas", Opt_eas, hpfs_param_eas), + fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), + fsparam_s32 ("timeshift", Opt_timeshift), + {} +}; + +struct hpfs_fc_context { + kuid_t uid; + kgid_t gid; + umode_t umask; + int lowercase; + int eas; + int chk; + int errs; + int chkdsk; + int timeshift; +}; static inline void hpfs_help(void) { @@ -439,49 +364,92 @@ HPFS filesystem options:\n\ \n"); } -static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; - int o; + struct hpfs_fc_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, hpfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_help: + hpfs_help(); + return -EINVAL; + case Opt_uid: + ctx->uid = result.uid; + break; + case Opt_gid: + ctx->gid = result.gid; + break; + case Opt_umask: + ctx->umask = result.uint_32; + break; + case Opt_case: + ctx->lowercase = result.uint_32; + break; + case Opt_check: + ctx->chk = result.uint_32; + break; + case Opt_err: + ctx->errs = result.uint_32; + break; + case Opt_eas: + ctx->eas = result.uint_32; + break; + case Opt_chkdsk: + ctx->chkdsk = result.uint_32; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = param->string; + int timeshift; + + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + timeshift = simple_strtoul(rhs, &rhs, 0) * m; + if (*rhs) + return -EINVAL; + ctx->timeshift = 
timeshift; + break; + } + default: + return -EINVAL; + } + + return 0; +} + +static int hpfs_reconfigure(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx = fc->fs_private; + struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); - *flags |= SB_NOATIME; + fc->sb_flags |= SB_NOATIME; hpfs_lock(s); - uid = sbi->sb_uid; gid = sbi->sb_gid; - umask = 0777 & ~sbi->sb_mode; - lowercase = sbi->sb_lowercase; - eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; - errs = sbi->sb_err; timeshift = sbi->sb_timeshift; - - if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, &timeshift))) { - pr_err("bad mount options.\n"); - goto out_err; - } - if (o == 2) { - hpfs_help(); - goto out_err; - } - if (timeshift != sbi->sb_timeshift) { + + if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); - sbi->sb_uid = uid; sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; - if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); + if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops = .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, - .remount_fs = hpfs_remount_fs, .show_options = hpfs_show_options, }; -static int hpfs_fill_super(struct super_block *s, void *options, int silent) +static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) { + struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; - - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; + int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; - int o; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; @@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); - uid = current_uid(); - gid = current_gid(); - umask = current_umask(); - lowercase = 0; - eas = 2; - chk = 1; - errs = 1; - chkdsk = 1; - timeshift = 0; - - if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, &timeshift))) { - pr_err("bad mount options.\n"); - goto bail0; - } - if (o==2) { - hpfs_help(); - goto bail0; - } - /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; @@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); - sbi->sb_uid = uid; - sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; + sbi->sb_uid = ctx->uid; + sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; 
sbi->sb_n_free_dnodes = -1; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; - sbi->sb_chk = chk; - sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; - sbi->sb_timeshift = timeshift; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; + sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; + sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; @@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { - if (errs == 2) { + if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } @@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { - if (errs >= 2) { + if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); - if (errs == 0) + if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } - if (chk) { + if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { @@ -755,18 +697,70 @@ bail0: return -EINVAL; } -static struct dentry *hpfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hpfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hpfs_fill_super); +} + +static void hpfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations hpfs_fc_context_ops = { + .parse_param = hpfs_parse_param, + .get_tree = hpfs_get_tree, + .reconfigure = hpfs_reconfigure, + .free = hpfs_free_fc, +}; + +static int hpfs_init_fs_context(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx; + + ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(sb); + + ctx->uid = sbi->sb_uid; + ctx->gid = sbi->sb_gid; + ctx->umask = 0777 & ~sbi->sb_mode; + ctx->lowercase = sbi->sb_lowercase; + ctx->eas = sbi->sb_eas; + ctx->chk = sbi->sb_chk; + ctx->chkdsk = sbi->sb_chkdsk; + ctx->errs = sbi->sb_err; + ctx->timeshift = sbi->sb_timeshift; + + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->umask = current_umask(); + ctx->lowercase = 0; + ctx->eas = 2; + ctx->chk = 1; + ctx->errs = 1; + ctx->chkdsk = 1; + ctx->timeshift = 0; + } + + fc->fs_private = ctx; + fc->ops = &hpfs_fc_context_ops; + + return 0; +}; + static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", - .mount = hpfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hpfs_init_fs_context, + .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..fc1ae5132127 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,6 +39,9 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> 
+#define CREATE_TRACE_POINTS +#include <trace/events/hugetlbfs.h> + static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -171,102 +174,28 @@ out: * Called under mmap_write_lock(mm). */ -static unsigned long -hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (unlikely(offset_in_page(addr))) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - addr = vm_unmapped_area(&info); - } - - return addr; -} - unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; + unsigned long addr0 = 0; struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > mmap_end - mmap_min_addr) - return -ENOMEM; - if (flags & MAP_FIXED) { + if (addr & ~huge_page_mask(h)) + return -EINVAL; if (prepare_hugepage_range(file, addr, len)) return -EINVAL; - return addr; } + if (addr) + addr0 = ALIGN(addr, huge_page_size(h)); - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma_prev(mm, addr, &prev); - if (mmap_end - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma)) && - (!prev || addr >= vm_end_gap(prev))) - return addr; - } - - /* - * Use MMF_TOPDOWN flag as a hint to use topdown routine. - * If architectures have special needs, they should define their own - * version of hugetlb_get_unmapped_area. 
- */ - if (test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); -} - -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, + flags, 0); } -#endif /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. @@ -687,6 +616,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -814,8 +744,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -893,7 +825,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); + folio_zero_user(folio, addr); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { @@ -919,6 +851,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -935,6 +870,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1033,6 +970,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1272,6 +1210,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } diff --git a/fs/inode.c b/fs/inode.c index 471ae4a31549..6b4c77268fc0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,7 +21,12 @@ #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <trace/events/writeback.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timestamp.h> + #include "internal.h" /* @@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void) return nr_dirty > 0 ? 
nr_dirty : 0; } +#ifdef CONFIG_DEBUG_FS +static DEFINE_PER_CPU(long, mg_ctime_updates); +static DEFINE_PER_CPU(long, mg_fine_stamps); +static DEFINE_PER_CPU(long, mg_ctime_swaps); + +static unsigned long get_mg_ctime_updates(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_updates, i)); + return sum; +} + +static unsigned long get_mg_fine_stamps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_fine_stamps, i)); + return sum; +} + +static unsigned long get_mg_ctime_swaps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_swaps, i)); + return sum; +} + +#define mgtime_counter_inc(__var) this_cpu_inc(__var) + +static int mgts_show(struct seq_file *s, void *p) +{ + unsigned long ctime_updates = get_mg_ctime_updates(); + unsigned long ctime_swaps = get_mg_ctime_swaps(); + unsigned long fine_stamps = get_mg_fine_stamps(); + unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); + + seq_printf(s, "%lu %lu %lu %lu\n", + ctime_updates, ctime_swaps, fine_stamps, floor_swaps); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mgts); + +static int __init mg_debugfs_init(void) +{ + debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); + return 0; +} +late_initcall(mg_debugfs_init); + +#else /* ! CONFIG_DEBUG_FS */ + +#define mgtime_counter_inc(__var) do { } while (0) + +#endif /* CONFIG_DEBUG_FS */ + /* * Handle nr_inode sysctl */ @@ -146,14 +215,16 @@ static int no_open(struct inode *inode, struct file *file) } /** - * inode_init_always - perform inode structure initialisation + * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise + * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. + * If there are additional allocations required @gfp is used. */ -int inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; @@ -172,6 +243,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; + if (sb->s_type->fs_flags & FS_MGTIME) + inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); @@ -230,14 +303,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #endif inode->i_flctx = NULL; - if (unlikely(security_inode_alloc(inode))) + if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } -EXPORT_SYMBOL(inode_init_always); +EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { @@ -746,7 +819,7 @@ static void evict(struct inode *inode) * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ - smp_mb(); + smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -879,7 +952,7 @@ again: * with this flag set because they are the inodes that are out of order. 
*/ static enum lru_status inode_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -921,7 +994,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); - spin_unlock(lru_lock); + spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -932,7 +1005,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item, mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); - spin_lock(lru_lock); return LRU_RETRY; } @@ -1239,16 +1311,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a variant of iget5_locked() for callers that don't want to fail on memory - * allocation of inode. + * and if present return it with an increased reference count. This is a + * variant of iget5_locked() that doesn't allocate an inode. * - * If the inode is not in cache, insert the pre-allocated inode to cache and + * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1312,16 +1383,16 @@ EXPORT_SYMBOL(inode_insert5); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a generalized version of iget_locked() for file systems where the inode + * and if present return it with an increased reference count. This is a + * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * - * If the inode is not in cache, allocate a new inode and return it locked, - * hashed, and with the I_NEW flag set. The file system gets to fill it in - * before unlocking it via unlock_new_inode(). + * If the inode is not present in the cache, allocate and insert a new inode + * and return it locked, hashed, and with the I_NEW flag set. The file system + * gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -2209,19 +2280,58 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); +/** + * current_time - Return FS time (possibly fine-grained) + * @inode: inode. 
+ * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp, but don't update + * the floor. + * + * For a multigrain inode, this is effectively an estimate of the timestamp + * that a file would receive. An actual update must go through + * inode_set_ctime_current(). + */ +struct timespec64 current_time(struct inode *inode) +{ + struct timespec64 now; + u32 cns; + + ktime_get_coarse_real_ts64_mg(&now); + + if (!is_mgtime(inode)) + goto out; + + /* If nothing has queried it, then coarse time is fine */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + /* + * If there is no apparent change, then get a fine-grained + * timestamp. + */ + if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) + ktime_get_real_ts64(&now); + } +out: + return timestamp_truncate(now, inode); +} +EXPORT_SYMBOL(current_time); + static int inode_needs_update_time(struct inode *inode) { + struct timespec64 now, ts; int sync_it = 0; - struct timespec64 now = current_time(inode); - struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + now = current_time(inode); + ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it = S_MTIME; + sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) @@ -2598,6 +2708,16 @@ void inode_nohighmem(struct inode *inode) } EXPORT_SYMBOL(inode_nohighmem); +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) +{ + trace_inode_set_ctime_to_ts(inode, &ts); + set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; + return ts; +} +EXPORT_SYMBOL(inode_set_ctime_to_ts); + /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec @@ -2630,39 +2750,159 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) EXPORT_SYMBOL(timestamp_truncate); /** - * current_time - Return FS time - * @inode: inode. + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode * - * Return the current time truncated to the time granularity supported by - * the fs. + * Set the inode's ctime to the current value for the inode. Returns the + * current value that was assigned. If this is not a multigrain inode, then we + * set it to the later of the coarse time and floor value. + * + * If it is multigrain, then we first see if the coarse-grained timestamp is + * distinct from what is already there. If so, then use that. Otherwise, get a + * fine-grained timestamp. * - * Note that inode and inode->sb cannot be NULL. - * Otherwise, the function warns and returns time without truncation. + * After that, try to swap the new value into i_ctime_nsec. Accept the + * resulting ctime, regardless of the outcome of the swap. If it has + * already been replaced, then that timestamp is later than the earlier + * unacceptable one, and is thus acceptable. 
*/ -struct timespec64 current_time(struct inode *inode) +struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; + u32 cns, cur; - ktime_get_coarse_real_ts64(&now); - return timestamp_truncate(now, inode); + ktime_get_coarse_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + + /* Just return that if this is not a multigrain fs */ + if (!is_mgtime(inode)) { + inode_set_ctime_to_ts(inode, now); + goto out; + } + + /* + * A fine-grained time is only needed if someone has queried + * for timestamps, and the current coarse grained time isn't + * later than what's already there. + */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, + .tv_nsec = cns & ~I_CTIME_QUERIED }; + + if (timespec64_compare(&now, &ctime) <= 0) { + ktime_get_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + mgtime_counter_inc(mg_fine_stamps); + } + } + mgtime_counter_inc(mg_ctime_updates); + + /* No need to cmpxchg if it's exactly the same */ + if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { + trace_ctime_xchg_skip(inode, &now); + goto out; + } + cur = cns; +retry: + /* Try to swap the nsec value into place. */ + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { + /* If swap occurred, then we're (mostly) done */ + inode->i_ctime_sec = now.tv_sec; + trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); + mgtime_counter_inc(mg_ctime_swaps); + } else { + /* + * Was the change due to someone marking the old ctime QUERIED? + * If so then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { + cns = cur; + goto retry; + } + /* Otherwise, keep the existing ctime */ + now.tv_sec = inode->i_ctime_sec; + now.tv_nsec = cur & ~I_CTIME_QUERIED; + } +out: + return now; } -EXPORT_SYMBOL(current_time); +EXPORT_SYMBOL(inode_set_ctime_current); /** - * inode_set_ctime_current - set the ctime to current_time - * @inode: inode + * inode_set_ctime_deleg - try to update the ctime on a delegated inode + * @inode: inode to update + * @update: timespec64 to set the ctime * - * Set the inode->i_ctime to the current value for the inode. Returns - * the current value that was assigned to i_ctime. + * Attempt to atomically update the ctime on behalf of a delegation holder. + * + * The nfs server can call back the holder of a delegation to get updated + * inode attributes, including the mtime. When updating the mtime, update + * the ctime to a value at least equal to that. + * + * This can race with concurrent updates to the inode, in which + * case the update is skipped. + * + * Note that this works even when multigrain timestamps are not enabled, + * so it is used in either case. */ -struct timespec64 inode_set_ctime_current(struct inode *inode) +struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { - struct timespec64 now = current_time(inode); + struct timespec64 now, cur_ts; + u32 cur, old; - inode_set_ctime_to_ts(inode, now); - return now; + /* pairs with try_cmpxchg below */ + cur = smp_load_acquire(&inode->i_ctime_nsec); + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + cur_ts.tv_sec = inode->i_ctime_sec; + + /* If the update is older than the existing value, skip it. 
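The retry rule spelled out in the comments above (accept whatever timestamp wins the cmpxchg, and retry only when the sole intervening change was another task setting the QUERIED bit on the value we read) can be modelled in a few lines of user-space C11. The flag bit position and the names below are illustrative, not the kernel's actual i_ctime_nsec layout:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define CTIME_QUERIED 0x80000000u          /* illustrative flag bit */

static _Atomic uint32_t ctime_nsec;        /* nsec value plus QUERIED flag */

/* Install @new_nsec unless a different timestamp already won the race. */
static uint32_t ctime_update(uint32_t new_nsec)
{
	uint32_t cur = atomic_load_explicit(&ctime_nsec, memory_order_acquire);
	uint32_t old = cur;

	for (;;) {
		if (atomic_compare_exchange_strong(&ctime_nsec, &cur, new_nsec))
			return new_nsec;            /* swap worked */
		/*
		 * Retry only if the sole change was someone marking the value
		 * we read as QUERIED; any other change is a newer timestamp
		 * and we keep that one instead.
		 */
		if (!(old & CTIME_QUERIED) && cur == (old | CTIME_QUERIED)) {
			old = cur;
			continue;
		}
		return cur & ~CTIME_QUERIED;        /* keep the racing stamp */
	}
}

int main(void)
{
	atomic_store(&ctime_nsec, 1000 | CTIME_QUERIED);
	printf("installed nsec: %u\n", ctime_update(2000));
	return 0;
}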
*/ + if (timespec64_compare(&update, &cur_ts) <= 0) + return cur_ts; + + ktime_get_coarse_real_ts64_mg(&now); + + /* Clamp the update to "now" if it's in the future */ + if (timespec64_compare(&update, &now) > 0) + update = now; + + update = timestamp_truncate(update, inode); + + /* No need to update if the values are already the same */ + if (timespec64_equal(&update, &cur_ts)) + return cur_ts; + + /* + * Try to swap the nsec value into place. If it fails, that means + * it raced with an update due to a write or similar activity. That + * stamp takes precedence, so just skip the update. + */ +retry: + old = cur; + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { + inode->i_ctime_sec = update.tv_sec; + mgtime_counter_inc(mg_ctime_swaps); + return update; + } + + /* + * Was the change due to another task marking the old ctime QUERIED? + * + * If so, then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) + goto retry; + + /* Otherwise, it was a new timestamp. */ + cur_ts.tv_sec = inode->i_ctime_sec; + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + return cur_ts; } -EXPORT_SYMBOL(inode_set_ctime_current); +EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged @@ -2670,7 +2910,7 @@ EXPORT_SYMBOL(inode_set_ctime_current); * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * - * Check wether @vfsgid is in the caller's group list or if the caller is + * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. 
* diff --git a/fs/internal.h b/fs/internal.h index 8c1b7acbbe8f..e7f02ae1e098 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -246,7 +246,6 @@ int open_namespace(struct ns_common *ns); * fs/stat.c: */ -int getname_statx_lookup_flags(int flags); int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, @@ -267,7 +266,7 @@ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; -struct xattr_ctx { +struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; @@ -280,14 +279,15 @@ struct xattr_ctx { unsigned int flags; }; +ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); +ssize_t filename_getxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); +int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); +int filename_setxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); +int import_xattr_name(struct xattr_name *kname, const char __user *name); -ssize_t do_getxattr(struct mnt_idmap *idmap, - struct dentry *d, - struct xattr_ctx *ctx); - -int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); -int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL diff --git a/fs/ioctl.c b/fs/ioctl.c index 6e0c954388d4..638a36be31c1 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { - struct fd src_file = fdget(srcfd); + CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; - if (!fd_file(src_file)) + if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); @@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EINVAL; else ret = 0; - fdput(src_file); return ret; } @@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) - goto out; + return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); -out: - fdput(f); return error; } @@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) - goto out; + return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ @@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, error = -ENOTTY; break; } - - out: - fdput(f); - return error; } #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 78ebd265f425..d303e6c8900c 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1138,17 +1138,43 @@ 
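The ioctl(2) hunks above swap open-coded fdget()/fdput() pairs for CLASS(fd, ...), so the file reference is released automatically on every return path. The kernel macro is built on the compiler's cleanup attribute; a user-space sketch of the same scope-based release idea follows (auto_fd and cleanup_fd are made-up names for illustration, not kernel or libc interfaces):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Close an fd automatically when the owning variable goes out of scope. */
static void cleanup_fd(int *fdp)
{
	if (*fdp >= 0)
		close(*fdp);
}
#define auto_fd __attribute__((cleanup(cleanup_fd))) int

int main(void)
{
	auto_fd fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;          /* no explicit close needed on any path */
	}

	char buf[64];
	ssize_t n = read(fd, buf, sizeof(buf) - 1);

	if (n > 0) {
		buf[n] = '\0';
		printf("%s", buf);
	}
	return 0;                  /* cleanup_fd() runs here */
}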
static void iomap_write_delalloc_scan(struct inode *inode, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ - start_byte = folio_next_index(folio) << PAGE_SHIFT; + start_byte = folio_pos(folio) + folio_size(folio); folio_unlock(folio); folio_put(folio); } } /* + * When a short write occurs, the filesystem might need to use ->iomap_end + * to remove space reservations created in ->iomap_begin. + * + * For filesystems that use delayed allocation, there can be dirty pages over + * the delalloc extent outside the range of a short write but still within the + * delalloc extent allocated for this iomap if the write raced with page + * faults. + * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. This @@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ -static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch) { @@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t scan_end_byte = min(i_size_read(inode), end_byte); /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite whilst we walk the - * cache and perform delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. + * The caller must hold invalidate_lock to avoid races with page faults + * re-instantiating folios and dirtying them via ->page_mkwrite whilst + * we walk the cache and perform delalloc extent removal. Failing to do + * this can leave dirty pages with no space reservation in the cache. 
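The comment above notes that the delalloc scan does not reimplement data/hole discovery and instead relies on mapping_seek_hole_data() over half-open [start_byte, end_byte) intervals. The same walk can be reproduced from user space against a sparse file with lseek(SEEK_DATA)/lseek(SEEK_HOLE); a small stand-alone sketch (the default file name is just an example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Walk a file and print its data extents as [start, end) byte ranges. */
int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "sparse.img";
	int fd = open(path, O_RDONLY);
	off_t end, pos = 0;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	end = lseek(fd, 0, SEEK_END);

	while (pos < end) {
		off_t data = lseek(fd, pos, SEEK_DATA);
		off_t hole;

		if (data < 0)
			break;                  /* no more data (ENXIO) */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			hole = end;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}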
*/ - filemap_invalidate_lock(inode->i_mapping); + lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); + while (start_byte < scan_end_byte) { loff_t data_end; @@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (start_byte == -ENXIO || start_byte == scan_end_byte) break; if (WARN_ON_ONCE(start_byte < 0)) - goto out_unlock; + return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); @@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); if (WARN_ON_ONCE(data_end < 0)) - goto out_unlock; + return; /* * If we race with post-direct I/O invalidation of the page cache, @@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (punch_start_byte < end_byte) punch(inode, punch_start_byte, end_byte - punch_start_byte, iomap); -out_unlock: - filemap_invalidate_unlock(inode->i_mapping); } - -/* - * When a short write occurs, the filesystem may need to remove reserved space - * that was allocated in ->iomap_begin from it's ->iomap_end method. For - * filesystems that use delayed allocation, we need to punch out delalloc - * extents from the range that are not dirty in the page cache. As the write can - * race with page faults, there can be dirty pages over the delalloc extent - * outside the range of a short write but still within the delalloc extent - * allocated for this iomap. - * - * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations. - * - * The punch() callback *must* only punch delalloc extents in the range passed - * to it. It must skip over all other types of extents in the range and leave - * them completely unchanged. It must do this punch atomically with respect to - * other extent modifications. - * - * The punch() callback may be called with a folio locked to prevent writeback - * extent allocation racing at the edge of the range we are currently punching. - * The locked folio may or may not cover the range being punched, so it is not - * safe for the punch() callback to lock folios itself. - * - * Lock order is: - * - * inode->i_rwsem (shared or exclusive) - * inode->i_mapping->invalidate_lock (exclusive) - * folio_lock() - * ->punch - * internal filesystem allocation lock - */ -void iomap_file_buffered_write_punch_delalloc(struct inode *inode, - loff_t pos, loff_t length, ssize_t written, unsigned flags, - struct iomap *iomap, iomap_punch_t punch) -{ - loff_t start_byte; - loff_t end_byte; - unsigned int blocksize = i_blocksize(inode); - - if (iomap->type != IOMAP_DELALLOC) - return; - - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return; - - /* - * start_byte refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. 
- */ - if (unlikely(!written)) - start_byte = round_down(pos, blocksize); - else - start_byte = round_up(pos + written, blocksize); - end_byte = round_up(pos + length, blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return; - - iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap, - punch); -} -EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); +EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { @@ -1316,22 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) loff_t length = iomap_length(iter); loff_t written = 0; - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - - /* - * Don't bother with delalloc reservations, holes or unwritten extents. - * - * Note that we use srcmap directly instead of iomap_iter_srcmap as - * unsharing requires providing a separate source map, and the presence - * of one is a good indicator that unsharing is needed, unlike - * IOMAP_F_SHARED which can be set for any data that goes into the COW - * fork for XFS. - */ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_DELALLOC || - iter->srcmap.type == IOMAP_UNWRITTEN) + if (!iomap_want_unshare_iter(iter)) return length; do { @@ -1404,40 +1350,12 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, - bool *range_dirty) +static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); loff_t written = 0; - /* - * We must zero subranges of unwritten mappings that might be dirty in - * pagecache from previous writes. We only know whether the entire range - * was clean or not, however, and dirty folios may have been written - * back or reclaimed at any point after mapping lookup. - * - * The easiest way to deal with this is to flush pagecache to trigger - * any pending unwritten conversions and then grab the updated extents - * from the fs. The flush may change the current mapping, so mark it - * stale for the iterator to remap it for the next pass to handle - * properly. - * - * Note that holes are treated the same as unwritten because zero range - * is (ab)used for partial folio zeroing in some cases. Hole backed - * post-eof ranges can be dirtied via mapped write and the flush - * triggers writeback time post-eof zeroing. 
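The wrapper removed above derived the punch range from the short-write result: the first unused byte is rounded up to a block boundary (or down to the start of the block when nothing was written at all) and the end of the reservation is rounded up likewise. A stand-alone rendering of that arithmetic, with round_up/round_down re-implemented locally rather than taken from kernel headers:

#include <stdio.h>

#define ROUND_UP(x, a)    ((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a)  (((x) / (a)) * (a))

/* Compute the [start, end) byte range left to punch after a short write. */
static void punch_range(long long pos, long long length, long long written,
			long long blocksize)
{
	long long start, end;

	/* nothing written: the whole first block may still hold a reservation */
	if (!written)
		start = ROUND_DOWN(pos, blocksize);
	else
		start = ROUND_UP(pos + written, blocksize);
	end = ROUND_UP(pos + length, blocksize);

	if (start >= end)
		printf("entire reservation written, nothing to punch\n");
	else
		printf("punch [%lld, %lld)\n", start, end);
}

int main(void)
{
	punch_range(4096, 8192, 1000, 4096);   /* short write inside block 1 */
	punch_range(4096, 8192, 8192, 4096);   /* full write, nothing left   */
	return 0;
}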
- */ - if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - if (*range_dirty) { - *range_dirty = false; - return iomap_zero_iter_flush_and_stale(iter); - } - /* range is clean and already zeroed, nothing to do */ - return length; - } - do { struct folio *folio; int status; @@ -1451,6 +1369,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (iter->iomap.flags & IOMAP_F_STALE) break; + /* warn about zeroing folios beyond eof that won't write back */ + WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; @@ -1483,28 +1403,58 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, .len = len, .flags = IOMAP_ZERO, }; + struct address_space *mapping = inode->i_mapping; + unsigned int blocksize = i_blocksize(inode); + unsigned int off = pos & (blocksize - 1); + loff_t plen = min_t(loff_t, len, blocksize - off); int ret; bool range_dirty; /* - * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but - * pagecache must be flushed to ensure stale data from previous - * buffered writes is not exposed. A flush is only required for certain - * types of mappings, but checking pagecache after mapping lookup is - * racy with writeback and reclaim. + * Zero range can skip mappings that are zero on disk so long as + * pagecache is clean. If pagecache was dirty prior to zero range, the + * mapping converts on writeback completion and so must be zeroed. * - * Therefore, check the entire range first and pass along whether any - * part of it is dirty. If so and an underlying mapping warrants it, - * flush the cache at that point. This trades off the occasional false - * positive (and spurious flush, if the dirty data and mapping don't - * happen to overlap) for simplicity in handling a relatively uncommon - * situation. + * The simplest way to deal with this across a range is to flush + * pagecache and process the updated mappings. To avoid excessive + * flushing on partial eof zeroing, special case it to zero the + * unaligned start portion if already dirty in pagecache. + */ + if (off && + filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { + iter.len = plen; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_zero_iter(&iter, did_zero); + + iter.len = len - (iter.pos - pos); + if (ret || !iter.len) + return ret; + } + + /* + * To avoid an unconditional flush, check pagecache state and only flush + * if dirty and the fs returns a mapping that might convert on + * writeback. 
*/ range_dirty = filemap_range_needs_writeback(inode->i_mapping, - pos, pos + len - 1); + iter.pos, iter.pos + iter.len - 1); + while ((ret = iomap_iter(&iter, ops)) > 0) { + const struct iomap *srcmap = iomap_iter_srcmap(&iter); - while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty); + if (srcmap->type == IOMAP_HOLE || + srcmap->type == IOMAP_UNWRITTEN) { + loff_t proc = iomap_length(&iter); + + if (range_dirty) { + range_dirty = false; + proc = iomap_zero_iter_flush_and_stale(&iter); + } + iter.processed = proc; + continue; + } + + iter.processed = iomap_zero_iter(&iter, did_zero); + } return ret; } EXPORT_SYMBOL_GPL(iomap_zero_range); @@ -1655,6 +1605,8 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; + if (next->io_flags & IOMAP_F_BOUNDARY) + return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) return false; @@ -1774,6 +1726,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; + if (pos > wpc->iomap.offset) + wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = pos; @@ -1785,6 +1739,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { + if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + return false; if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; @@ -1818,7 +1774,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) */ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, unsigned len) + struct inode *inode, loff_t pos, loff_t end_pos, + unsigned len) { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); @@ -1837,15 +1794,60 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. + * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * <I/O completes, updates disk size to min(A+B+C, A+BS)> + * <power failure> + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. 
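The race diagram above reduces to one rule: the size recorded in an ioend must be clamped to the in-core EOF captured at submission time, otherwise a crash can leave the zeroed tail of the block visible as file data. A tiny numeric rendering of that clamp (the values are arbitrary):

#include <stdio.h>

int main(void)
{
	long long io_offset = 0;        /* A: start of the ioend             */
	long long io_size   = 65536;    /* one block-sized bio (BS)          */
	long long end_pos   = 12000;    /* A+B: in-core EOF at submit time   */

	/* clamp so the completion handler never exposes the zeroed tail */
	if (io_offset + io_size > end_pos)
		io_size = end_pos - io_offset;

	printf("on-disk size advances to %lld\n", io_offset + io_size);
	return 0;
}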
+ */ wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + + wbc_account_cgroup_owner(wbc, folio, len); return 0; } static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, unsigned dirty_len, - unsigned *count) + struct inode *inode, u64 pos, u64 end_pos, + unsigned dirty_len, unsigned *count) { int error; @@ -1870,7 +1872,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, break; default: error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - map_len); + end_pos, map_len); if (!error) (*count)++; break; @@ -1941,11 +1943,11 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * remaining memory is zeroed when mapped, and writes to that * region are not written out to the file. * - * Also adjust the writeback range to skip all blocks entirely - * beyond i_size. + * Also adjust the end_pos to the end of file and skip writeback + * for all blocks entirely beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = round_up(isize, i_blocksize(inode)); + *end_pos = isize; } return true; @@ -1958,6 +1960,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct inode *inode = folio->mapping->host; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); + u64 end_aligned = 0; unsigned count = 0; int error = 0; u32 rlen; @@ -1999,9 +2002,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, /* * Walk through the folio to find dirty areas to write back. */ - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + end_aligned = round_up(end_pos, i_blocksize(inode)); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, rlen, &count); + pos, end_pos, rlen, &count); if (error) break; pos += rlen; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..b521eb15759e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. 
*/ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - /* - * Set the operation flags early so that bio_iov_iter_get_pages - * can set up the page vector appropriately for a ZONE_APPEND - * operation. - */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. 
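The IOCB_ATOMIC/IOMAP_ATOMIC plumbing above insists that an atomic direct write cover exactly one filesystem block and tags the bio REQ_ATOMIC so the device writes it untorn. From user space this surfaces as pwritev2() with RWF_ATOMIC on an O_DIRECT descriptor; a sketch, assuming uapi headers new enough to define RWF_ATOMIC and a filesystem and block device that support untorn writes (the fixed 4096-byte unit is only an example; real code should size the write from the atomic write limits the kernel advertises, e.g. via statx):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_CREAT | O_WRONLY | O_DIRECT, 0644);
	struct iovec iov;
	void *buf;
	ssize_t ret;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* O_DIRECT needs an aligned buffer; one FS block, written untorn */
	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return 1;
	}
	memset(buf, 0xab, 4096);
	iov.iov_base = buf;
	iov.iov_len = 4096;

	ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
	if (ret < 0)
		perror("pwritev2(RWF_ATOMIC)");  /* e.g. EINVAL if unsupported */
	else
		printf("wrote %zd bytes atomically\n", ret);

	free(buf);
	close(fd);
	return 0;
}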
+ */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 79a0614eaab7..3790918646af 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -22,26 +22,25 @@ static inline int iomap_iter_advance(struct iomap_iter *iter) { bool stale = iter->iomap.flags & IOMAP_F_STALE; + int ret = 1; /* handle the previous iteration (if any) */ if (iter->iomap.length) { if (iter->processed < 0) return iter->processed; - if (!iter->processed && !stale) - return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; iter->len -= iter->processed; - if (!iter->len) - return 0; + if (!iter->len || (!iter->processed && !stale)) + ret = 0; } - /* clear the state for the next iteration */ + /* clear the per iteration state */ iter->processed = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return 1; + return ret; } static inline void iomap_iter_done(struct iomap_iter *iter) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index f50311a6b429..47038e660812 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -948,8 +948,6 @@ root_found: goto out_no_inode; } - kfree(opt->iocharset); - return 0; /* @@ -987,7 +985,6 @@ out_freebh: brelse(bh); brelse(pri_bh); out_freesbi: - kfree(opt->iocharset); kfree(sbi); s->s_fs_info = NULL; return error; @@ -1528,7 +1525,10 @@ static int isofs_get_tree(struct fs_context *fc) static void isofs_free_fc(struct fs_context *fc) { - kfree(fc->fs_private); + struct isofs_options *opt = fc->fs_private; + + kfree(opt->iocharset); + kfree(opt); } static const struct fs_context_operations isofs_context_ops = { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4305a1ac808a..e8e80761ac73 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); escape = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &wbuf[bufs], blocknr); - if (escape < 0) { - jbd2_journal_abort(journal, escape); - continue; - } jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor @@ -776,9 +772,9 @@ start_journal_io: /* * If the journal is not located on the file system device, * then we must flush the file system device before we issue - * the commit record + * the commit record and update the journal tail sequence. 
*/ - if (commit_transaction->t_need_data_flush && + if ((commit_transaction->t_need_data_flush || update_tail) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(journal->j_fs_dev); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 97f487c3d8fc..7e49d912b091 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -318,7 +318,6 @@ static inline void jbd2_data_do_escape(char *data) * * * Return value: - * <0: Error * =0: Finished OK without escape * =1: Finished OK with escape */ @@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, goto escape_done; spin_unlock(&jh_in->b_state_lock); - tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); - if (!tmp) { - brelse(new_bh); - free_buffer_head(new_bh); - return -ENOMEM; - } + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); @@ -1518,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal) * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ +/* The journal_init_common() function creates and fills a journal_t object + * in memory. It calls journal_load_superblock() to load the on-disk journal + * superblock and initialize the journal_t object. + */ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 667f67342c52..9192be7c19d8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -485,6 +485,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return tag->t_checksum == cpu_to_be16(csum32); } +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID) +{ + char *tagp; + int flags; + int ret = 0; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + int err; + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + ret = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. 
*/ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + ret = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return ret; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -493,13 +591,10 @@ static int do_one_pass(journal_t *journal, int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; - struct buffer_head * bh; + struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; - int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - int descr_csum_size = 0; - int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -528,12 +623,6 @@ static int do_one_pass(journal_t *journal, */ while (1) { - int flags; - char * tagp; - journal_block_tag_t tag; - struct buffer_head * obh; - struct buffer_head * nbh; - cond_resched(); /* If we already know where to stop the log traversal, @@ -552,6 +641,8 @@ static int do_one_pass(journal_t *journal, * record. */ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -567,20 +658,16 @@ static int do_one_pass(journal_t *journal, tmp = (journal_header_t *)bh->b_data; - if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { - brelse(bh); + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) break; - } blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); - if (sequence != next_commit_ID) { - brelse(bh); + if (sequence != next_commit_ID) break; - } /* OK, we have a valid descriptor block which matches * all of the sequence number checks. 
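The replay loop above undoes JBD2's escape trick: a data block whose first four bytes happen to equal the journal magic has that word zeroed before being logged, so log scanning cannot mistake it for a descriptor, and JBD2_FLAG_ESCAPE in the tag tells recovery to put the magic back. A compact user-space model of that round trip (the magic value is the JBD2 on-disk constant; the helper names are illustrative):

#include <arpa/inet.h>   /* htonl/ntohl stand in for cpu_to_be32/be32_to_cpu */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JBD2_MAGIC_NUMBER 0xc03b3998u
#define BLOCK_SIZE 4096

/* Before logging: zero a leading magic so the scanner cannot be fooled. */
static bool escape_block(unsigned char *blk)
{
	uint32_t first;

	memcpy(&first, blk, sizeof(first));
	if (ntohl(first) != JBD2_MAGIC_NUMBER)
		return false;              /* no escape needed */
	memset(blk, 0, sizeof(first));
	return true;                       /* caller sets JBD2_FLAG_ESCAPE */
}

/* On replay: restore the magic if the tag said the block was escaped. */
static void unescape_block(unsigned char *blk, bool escaped)
{
	if (escaped) {
		uint32_t magic = htonl(JBD2_MAGIC_NUMBER);

		memcpy(blk, &magic, sizeof(magic));
	}
}

int main(void)
{
	unsigned char blk[BLOCK_SIZE] = { 0 };
	uint32_t magic = htonl(JBD2_MAGIC_NUMBER);
	bool escaped;

	memcpy(blk, &magic, sizeof(magic));    /* block starts with the magic */
	escaped = escape_block(blk);
	unescape_block(blk, escaped);
	printf("escaped=%d, magic restored=%d\n", escaped,
	       memcmp(blk, &magic, sizeof(magic)) == 0);
	return 0;
}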
What are we going @@ -589,11 +676,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (jbd2_journal_has_csum_v2or3(journal)) - descr_csum_size = - sizeof(struct jbd2_journal_block_tail); - if (descr_csum_size > 0 && - !jbd2_descriptor_block_csum_verify(journal, + if (!jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { /* * PASS_SCAN can see stale blocks due to lazy @@ -603,7 +686,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum recovering block %lu in log\n", next_log_block); err = -EFSBADCRC; - brelse(bh); goto failed; } need_check_commit_time = true; @@ -619,125 +701,39 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && - !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, - &crc32_sum)) { - put_bh(bh); + &crc32_sum)) break; - } - put_bh(bh); continue; } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - put_bh(bh); continue; } - /* A descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize - descr_csum_size) { - unsigned long io_block; - - memcpy(&tag, tagp, sizeof(tag)); - flags = be16_to_cpu(tag.t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk(KERN_ERR - "JBD2: IO error %d recovering " - "block %lu in log\n", - err, io_block); - } else { - unsigned long long blocknr; - - J_ASSERT(obh != NULL); - blocknr = read_tag_block(journal, - &tag); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (jbd2_journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Look for block corruption */ - if (!jbd2_block_tag_csum_verify( - journal, &tag, (journal_block_tag3_t *)tagp, - obh->b_data, be32_to_cpu(tmp->h_sequence))) { - brelse(obh); - success = -EFSBADCRC; - printk(KERN_ERR "JBD2: Invalid " - "checksum recovering " - "data block %llu in " - "journal block %lu\n", - blocknr, io_block); - block_error = 1; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD2: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(bh); - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JBD2_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JBD2_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += tag_bytes; - if (!(flags & JBD2_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JBD2_FLAG_LAST_TAG) - break; + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! 
+ */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; } - brelse(bh); continue; case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + /* How to differentiate between interrupted commit * and journal corruption ? * @@ -782,7 +778,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum found in transaction %u\n", next_commit_ID); err = -EFSBADCRC; - brelse(bh); goto failed; } ignore_crc_mismatch: @@ -792,7 +787,6 @@ static int do_one_pass(journal_t *journal, */ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - brelse(bh); goto done; } @@ -802,8 +796,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ - if (pass == PASS_SCAN && - jbd2_has_feature_checksum(journal)) { + if (jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = @@ -812,7 +805,6 @@ static int do_one_pass(journal_t *journal, if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; - brelse(bh); break; } @@ -828,36 +820,33 @@ static int do_one_pass(journal_t *journal, goto chksum_error; crc32_sum = ~0; + goto chksum_ok; } - if (pass == PASS_SCAN && - !jbd2_commit_block_csum_verify(journal, - bh->b_data)) { - if (jbd2_commit_block_csum_verify_partial( - journal, - bh->b_data)) { - pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", - next_commit_ID, next_log_block); - goto chksum_ok; - } - chksum_error: - if (commit_time < last_trans_commit_time) - goto ignore_crc_mismatch; - info->end_transaction = next_commit_ID; - info->head_block = head_block; - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - brelse(bh); - break; - } + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; } - if (pass == PASS_SCAN) { - chksum_ok: - last_trans_commit_time = commit_time; - head_block = next_log_block; + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; } - brelse(bh); + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; next_commit_ID++; continue; @@ -876,14 +865,11 @@ static int do_one_pass(journal_t *journal, /* If we aren't in the REVOKE pass, then we can * just skip over this block. 
*/ - if (pass != PASS_REVOKE) { - brelse(bh); + if (pass != PASS_REVOKE) continue; - } err = scan_revoke_records(journal, bh, next_commit_ID, info); - brelse(bh); if (err) goto failed; continue; @@ -891,12 +877,12 @@ static int do_one_pass(journal_t *journal, default: jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); - brelse(bh); goto done; } } done: + brelse(bh); /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the @@ -927,11 +913,10 @@ static int do_one_pass(journal_t *journal, success = err; } - if (block_error && success == 0) - success = -EIO; return success; failed: + brelse(bh); return err; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 4556e4689024..ce63d5fde9c3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -654,7 +654,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, REQ_SYNC); + write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 79e771ab624f..3bd9d2f3bece 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -95,6 +95,9 @@ static int jffs2_rtime_decompress(unsigned char *data_in, positions[value]=outpos; if (repeat) { + if ((outpos + repeat) > destlen) { + return 1; + } if (backoffs + repeat >= outpos) { while(repeat) { cpage_out[outpos++] = cpage_out[backoffs++]; diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index 556de100ebd5..9854253d0108 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -276,11 +276,6 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, end_rubin(&rs); - if (outpos > pos) { - /* We failed */ - return -1; - } - /* Tell the caller how much we managed to compress, * and how much space it took */ diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index acd32f05b519..ef3a1e1b6cb0 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -338,10 +338,9 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl } while(--retlen); mtd_unpoint(c->mtd, jeb->offset, c->sector_size); if (retlen) { - pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08tx\n", - *wordebuf, - jeb->offset + - c->sector_size-retlen * sizeof(*wordebuf)); + *bad_offset = jeb->offset + c->sector_size - retlen * sizeof(*wordebuf); + pr_warn("Newly-erased block contained word 0x%lx at offset 0x%08x\n", + *wordebuf, *bad_offset); return -EIO; } return 0; diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 822949d0eb00..1b833bbffcf5 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -82,7 +82,7 @@ again: nextlist = &c->erasable_list; } else if (!list_empty(&c->erasable_pending_wbuf_list)) { - /* There are blocks are wating for the wbuf sync */ + /* There are blocks are waiting for the wbuf sync */ jffs2_dbg(1, "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n"); spin_unlock(&c->erase_completion_lock); jffs2_flush_wbuf_pad(c); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index bbab2bdc71b6..3fb9f9807b66 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -15,6 +15,7 @@ #include <linux/mtd/mtd.h> #include <linux/compiler.h> #include <linux/sched/signal.h> +#include <linux/string_choices.h> #include "nodelist.h" #include "debug.h" @@ -317,9 +318,9 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) And there's no space left. At all. 
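The compr_rtime.c hunk above adds the missing output-bounds check: a corrupted repeat count could previously run the back-reference copy loop past destlen. A generic illustration of guarding such a copy the same way (the surrounding decompressor is simplified away; only the bounds test mirrors the fix):

#include <stdio.h>
#include <string.h>

/*
 * Copy @repeat bytes starting at @backoffs to the end of @out, refusing to
 * grow past @destlen (the check added by the jffs2 fix above).
 */
static int copy_run(unsigned char *out, size_t destlen, size_t outpos,
		    size_t backoffs, size_t repeat)
{
	if (outpos + repeat > destlen)
		return 1;                      /* corrupt input: would overflow */

	while (repeat--)
		out[outpos++] = out[backoffs++];   /* may self-overlap on purpose */
	return 0;
}

int main(void)
{
	unsigned char out[16];

	memcpy(out, "abcd", 4);
	printf("ok run: %d\n", copy_run(out, sizeof(out), 4, 0, 4));
	printf("bad run rejected: %d\n", copy_run(out, sizeof(out), 8, 0, 100));
	return 0;
}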
*/ pr_crit("Argh. No free space left for GC. nr_erasing_blocks is %d. nr_free_blocks is %d. (erasableempty: %s, erasingempty: %s, erasependingempty: %s)\n", c->nr_erasing_blocks, c->nr_free_blocks, - list_empty(&c->erasable_list) ? "yes" : "no", - list_empty(&c->erasing_list) ? "yes" : "no", - list_empty(&c->erase_pending_list) ? "yes" : "no"); + str_yes_no(list_empty(&c->erasable_list)), + str_yes_no(list_empty(&c->erasing_list)), + str_yes_no(list_empty(&c->erase_pending_list))); return -ENOSPC; } @@ -630,8 +631,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref ref->flash_offset, jeb->used_size); BUG(); }) - jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n", - ref_offset(ref), freed_len); + jffs2_dbg(1, "Obsoleting previously unchecked node at 0x%08x of len %x\n", + ref_offset(ref), freed_len); jeb->unchecked_size -= freed_len; c->unchecked_size -= freed_len; } else { @@ -641,8 +642,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref ref->flash_offset, jeb->used_size); BUG(); }) - jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ", - ref_offset(ref), freed_len); + jffs2_dbg(1, "Obsoleting node at 0x%08x of len %#x: ", + ref_offset(ref), freed_len); jeb->used_size -= freed_len; c->used_size -= freed_len; } @@ -883,7 +884,7 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c) jffs2_dbg(1, "%s(): nr_free_blocks %d, nr_erasing_blocks %d, dirty_size 0x%x, vdirty_blocks %d: %s\n", __func__, c->nr_free_blocks, c->nr_erasing_blocks, - c->dirty_size, nr_very_dirty, ret ? "yes" : "no"); + c->dirty_size, nr_very_dirty, str_yes_no(ret)); return ret; } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 03b4f99614be..f987f78a894e 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -72,7 +72,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info if (err != -EOPNOTSUPP) JFFS2_WARNING("MTD point failed: error code %d.\n", err); } else - pointed = 1; /* succefully pointed to device */ + pointed = 1; /* successfully pointed to device */ #endif if (!pointed) { diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 974ecf5e0d95..f9009e4f9ffd 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap) } bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag || bmp->db_numag >= MAXAG) { + if (!bmp->db_numag || bmp->db_numag > MAXAG) { err = -EINVAL; goto err_release_metapage; } @@ -1820,6 +1820,9 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) return -EIO; dp = (struct dmap *) mp->data; + if (dp->tree.budmin < 0) + return -EIO; + /* try to allocate the blocks. */ rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); @@ -2888,6 +2891,9 @@ static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl) /* bubble the new value up the tree as required. */ for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + if (lp == 0) + break; + /* get the index of the first leaf of the 4 leaf * group containing the specified leaf (leafno). 
*/ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 5d3127ca68a4..8f85177f284b 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2891,6 +2891,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { + if (stbl[i] < 0 || stbl[i] > 127) { + jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld", + i, stbl[i], (long)ip->i_ino, (long long)bn); + free_page(dirent_buf); + DT_PUTPAGE(mp); + return -EIO; + } + d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > @@ -3086,6 +3094,13 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) /* get the leftmost entry */ stbl = DT_GETSTBL(p); + + if (stbl[0] < 0 || stbl[0] > 127) { + DT_PUTPAGE(mp); + jfs_error(ip->i_sb, "stbl[0] out of bound\n"); + return -EIO; + } + xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 33ef13a0b110..8794281f8ffd 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -24,6 +24,7 @@ #define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ #define JFS_ERR_CONTINUE 0x00000004 /* continue */ #define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC) /* Quota support */ #define JFS_USRQUOTA 0x00000010 diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e1be21ca5d6e..223d9ac59839 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -6,11 +6,11 @@ #include <linux/fs.h> #include <linux/module.h> -#include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> -#include <linux/mount.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/posix_acl.h> @@ -210,240 +210,195 @@ enum { Opt_discard, Opt_nodiscard, Opt_discard_minblk }; -static const match_table_t tokens = { - {Opt_integrity, "integrity"}, - {Opt_nointegrity, "nointegrity"}, - {Opt_iocharset, "iocharset=%s"}, - {Opt_resize, "resize=%u"}, - {Opt_resize_nosize, "resize"}, - {Opt_errors, "errors=%s"}, - {Opt_ignore, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%u"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_discard_minblk, "discard=%u"}, - {Opt_err, NULL} +static const struct constant_table jfs_param_errors[] = { + {"continue", JFS_ERR_CONTINUE}, + {"remount-ro", JFS_ERR_REMOUNT_RO}, + {"panic", JFS_ERR_PANIC}, + {} }; -static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, - int *flag) -{ - void *nls_map = (void *)-1; /* -1: no change; NULL: none */ - char *p; - struct jfs_sb_info *sbi = JFS_SBI(sb); +static const struct fs_parameter_spec jfs_param_spec[] = { + fsparam_flag_no ("integrity", Opt_integrity), + fsparam_string ("iocharset", Opt_iocharset), + fsparam_u64 ("resize", Opt_resize), + fsparam_flag ("resize", Opt_resize_nosize), + fsparam_enum ("errors", Opt_errors, jfs_param_errors), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_flag ("discard", Opt_discard), + fsparam_u32 ("discard", Opt_discard_minblk), + fsparam_flag ("nodiscard", 
Opt_nodiscard), + {} +}; - *newLVSize = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_integrity: - *flag &= ~JFS_NOINTEGRITY; - break; - case Opt_nointegrity: - *flag |= JFS_NOINTEGRITY; - break; - case Opt_ignore: - /* Silently ignore the quota options */ - /* Don't do anything ;-) */ - break; - case Opt_iocharset: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); - if (!strcmp(args[0].from, "none")) - nls_map = NULL; - else { - nls_map = load_nls(args[0].from); - if (!nls_map) { - pr_err("JFS: charset not found\n"); - goto cleanup; - } - } - break; - case Opt_resize: - { - char *resize = args[0].from; - int rc = kstrtoll(resize, 0, newLVSize); +struct jfs_context { + int flag; + kuid_t uid; + kgid_t gid; + uint umask; + uint minblks_trim; + void *nls_map; + bool resize; + s64 newLVSize; +}; - if (rc) - goto cleanup; - break; - } - case Opt_resize_nosize: - { - *newLVSize = sb_bdev_nr_blocks(sb); - if (*newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); - break; +static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct jfs_context *ctx = fc->fs_private; + int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE); + struct fs_parse_result result; + struct nls_table *nls_map; + int opt; + + opt = fs_parse(fc, jfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_integrity: + if (result.negated) + ctx->flag |= JFS_NOINTEGRITY; + else + ctx->flag &= ~JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (ctx->nls_map && ctx->nls_map != (void *) -1) { + unload_nls(ctx->nls_map); + ctx->nls_map = NULL; } - case Opt_errors: - { - char *errors = args[0].from; - if (!errors || !*errors) - goto cleanup; - if (!strcmp(errors, "continue")) { - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_CONTINUE; - } else if (!strcmp(errors, "remount-ro")) { - *flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_REMOUNT_RO; - } else if (!strcmp(errors, "panic")) { - *flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag |= JFS_ERR_PANIC; - } else { - pr_err("JFS: %s is an invalid error handler\n", - errors); - goto cleanup; + if (!strcmp(param->string, "none")) + ctx->nls_map = NULL; + else { + nls_map = load_nls(param->string); + if (!nls_map) { + pr_err("JFS: charset not found\n"); + return -EINVAL; } - break; + ctx->nls_map = nls_map; } + break; + case Opt_resize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + ctx->newLVSize = result.uint_64; + break; + case Opt_resize_nosize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + break; + case Opt_errors: + ctx->flag &= ~JFS_ERR_MASK; + ctx->flag |= result.uint_32; + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - *flag |= JFS_USRQUOTA; - break; - case Opt_grpquota: - *flag |= JFS_GRPQUOTA; - break; + case Opt_quota: + case Opt_usrquota: + ctx->flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + ctx->flag |= JFS_GRPQUOTA; + break; #else - case Opt_usrquota: - case Opt_grpquota: - case Opt_quota: - pr_err("JFS: quota operations not supported\n"); - break; + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + pr_err("JFS: quota operations not supported\n"); + break; #endif - 
case Opt_uid: - { - char *uid = args[0].from; - uid_t val; - int rc = kstrtouint(uid, 0, &val); - - if (rc) - goto cleanup; - sbi->uid = make_kuid(current_user_ns(), val); - if (!uid_valid(sbi->uid)) - goto cleanup; - break; - } - - case Opt_gid: - { - char *gid = args[0].from; - gid_t val; - int rc = kstrtouint(gid, 0, &val); - - if (rc) - goto cleanup; - sbi->gid = make_kgid(current_user_ns(), val); - if (!gid_valid(sbi->gid)) - goto cleanup; - break; + case Opt_uid: + ctx->uid = result.uid; + break; + + case Opt_gid: + ctx->gid = result.gid; + break; + + case Opt_umask: + if (result.uint_32 & ~0777) { + pr_err("JFS: Invalid value of umask\n"); + return -EINVAL; } + ctx->umask = result.uint_32; + break; - case Opt_umask: - { - char *umask = args[0].from; - int rc = kstrtouint(umask, 8, &sbi->umask); + case Opt_discard: + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + ctx->minblks_trim = 64; + ctx->flag |= JFS_DISCARD; + break; - if (rc) - goto cleanup; - if (sbi->umask & ~0777) { - pr_err("JFS: Invalid value of umask\n"); - goto cleanup; - } - break; - } + case Opt_nodiscard: + ctx->flag &= ~JFS_DISCARD; + break; - case Opt_discard: - /* if set to 1, even copying files will cause - * trimming :O - * -> user has more control over the online trimming - */ - sbi->minblks_trim = 64; - if (bdev_max_discard_sectors(sb->s_bdev)) - *flag |= JFS_DISCARD; - else - pr_err("JFS: discard option not supported on device\n"); - break; - - case Opt_nodiscard: - *flag &= ~JFS_DISCARD; - break; - - case Opt_discard_minblk: - { - char *minblks_trim = args[0].from; - int rc; - if (bdev_max_discard_sectors(sb->s_bdev)) { - *flag |= JFS_DISCARD; - rc = kstrtouint(minblks_trim, 0, - &sbi->minblks_trim); - if (rc) - goto cleanup; - } else - pr_err("JFS: discard option not supported on device\n"); - break; - } + case Opt_discard_minblk: + ctx->minblks_trim = result.uint_32; + ctx->flag |= JFS_DISCARD; + break; - default: - printk("jfs: Unrecognized mount option \"%s\" or missing value\n", - p); - goto cleanup; - } - } - - if (nls_map != (void *) -1) { - /* Discard old (if remount) */ - unload_nls(sbi->nls_tab); - sbi->nls_tab = nls_map; + default: + return -EINVAL; } - return 1; -cleanup: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); return 0; } -static int jfs_remount(struct super_block *sb, int *flags, char *data) +static int jfs_reconfigure(struct fs_context *fc) { - s64 newLVSize = 0; + struct jfs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + int readonly = fc->sb_flags & SB_RDONLY; int rc = 0; - int flag = JFS_SBI(sb)->flag; + int flag = ctx->flag; int ret; sync_filesystem(sb); - if (!parse_options(data, sb, &newLVSize, &flag)) - return -EINVAL; - if (newLVSize) { + /* Transfer results of parsing to the sbi */ + JFS_SBI(sb)->flag = ctx->flag; + JFS_SBI(sb)->uid = ctx->uid; + JFS_SBI(sb)->gid = ctx->gid; + JFS_SBI(sb)->umask = ctx->umask; + JFS_SBI(sb)->minblks_trim = ctx->minblks_trim; + if (ctx->nls_map != (void *) -1) { + unload_nls(JFS_SBI(sb)->nls_tab); + JFS_SBI(sb)->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (ctx->resize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } - rc = jfs_extendfs(sb, newLVSize, 0); + + if (!ctx->newLVSize) { + ctx->newLVSize = sb_bdev_nr_blocks(sb); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); + } + + rc = jfs_extendfs(sb, ctx->newLVSize, 0); if (rc) 
return rc; } - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + if (sb_rdonly(sb) && !readonly) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + if (!sb_rdonly(sb) && readonly) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) JFS_SBI(sb)->flag = flag; return rc; } - if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) { if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) @@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); return ret; } + } JFS_SBI(sb)->flag = flag; return 0; } -static int jfs_fill_super(struct super_block *sb, void *data, int silent) +static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct jfs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct jfs_sb_info *sbi; struct inode *inode; int rc; - s64 newLVSize = 0; - int flag, ret = -EINVAL; + int ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); @@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; - sbi->uid = INVALID_UID; - sbi->gid = INVALID_GID; - sbi->umask = -1; - - /* initialize the mount flag and determine the default error handler */ - flag = JFS_ERR_REMOUNT_RO; - if (!parse_options((char *) data, sb, &newLVSize, &flag)) - goto out_kfree; - sbi->flag = flag; + /* Transfer results of parsing to the sbi */ + sbi->flag = ctx->flag; + sbi->uid = ctx->uid; + sbi->gid = ctx->gid; + sbi->umask = ctx->umask; + if (ctx->nls_map != (void *) -1) { + unload_nls(sbi->nls_tab); + sbi->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (sbi->flag & JFS_DISCARD) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { + pr_err("JFS: discard option not supported on device\n"); + sbi->flag &= ~JFS_DISCARD; + } else { + sbi->minblks_trim = ctx->minblks_trim; + } + } #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - if (newLVSize) { + if (ctx->resize) { pr_err("resize option for remount only\n"); - goto out_kfree; + goto out_unload; } /* @@ -608,7 +575,6 @@ out_mount_failed: sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); -out_kfree: kfree(sbi); return ret; } @@ -664,10 +630,9 @@ out: return rc; } -static struct dentry *jfs_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jfs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); + return get_tree_bdev(fc, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) @@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = { .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, - .remount_fs = jfs_remount, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, @@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = { .get_parent = jfs_get_parent, }; +static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx) +{ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + 
struct super_block *sb = fc->root->d_sb; + + /* Copy over current option values and mount flags */ + ctx->uid = JFS_SBI(sb)->uid; + ctx->gid = JFS_SBI(sb)->gid; + ctx->umask = JFS_SBI(sb)->umask; + ctx->nls_map = (void *)-1; + ctx->minblks_trim = JFS_SBI(sb)->minblks_trim; + ctx->flag = JFS_SBI(sb)->flag; + + } else { + /* + * Initialize the mount flag and determine the default + * error handler + */ + ctx->flag = JFS_ERR_REMOUNT_RO; + ctx->uid = INVALID_UID; + ctx->gid = INVALID_GID; + ctx->umask = -1; + ctx->nls_map = (void *)-1; + } +} + +static void jfs_free_fc(struct fs_context *fc) +{ + struct jfs_context *ctx = fc->fs_private; + + if (ctx->nls_map != (void *) -1) + unload_nls(ctx->nls_map); + kfree(ctx); +} + +static const struct fs_context_operations jfs_context_ops = { + .parse_param = jfs_parse_param, + .get_tree = jfs_get_tree, + .reconfigure = jfs_reconfigure, + .free = jfs_free_fc, +}; + +static int jfs_init_fs_context(struct fs_context *fc) +{ + struct jfs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + jfs_init_options(fc, ctx); + + fc->fs_private = ctx; + fc->ops = &jfs_context_ops; + + return 0; +} + static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", - .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = jfs_init_fs_context, + .parameters = jfs_param_spec, }; MODULE_ALIAS_FS("jfs"); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 0fb05e314edf..24afbae87225 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -559,7 +559,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { - int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size); + int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); printk(KERN_ERR "ea_get: invalid extended attribute\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 9ff37ae650ea..de32c95d823d 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -175,15 +175,11 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { - struct fd f = fdget(fd); - ssize_t ret = -EBADF; + CLASS(fd, f)(fd); - if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ)) - goto out; + if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + return -EBADF; - ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); -out: - fdput(f); - return ret; + return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); diff --git a/fs/libfs.c b/fs/libfs.c index 46966fd8bcf9..5b6120b19e99 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); + + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + return NULL; + d_add(dentry, NULL); return NULL; } @@ -241,9 +245,16 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); -/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +/* simple_offset_add() never assigns these to a dentry */ enum { - DIR_OFFSET_MIN = 2, + DIR_OFFSET_FIRST = 2, /* Find first real entry */ + DIR_OFFSET_EOD = S32_MAX, +}; + +/* simple_offset_add() 
allocation range */ +enum { + DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, + DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) @@ -287,9 +298,10 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, - LONG_MAX, &octx->next_offset, GFP_KERNEL); - if (ret < 0) - return ret; + DIR_OFFSET_MAX, &octx->next_offset, + GFP_KERNEL); + if (unlikely(ret < 0)) + return ret == -EBUSY ? -ENOSPC : ret; offset_set(dentry, offset); return 0; @@ -326,38 +338,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) } /** - * simple_offset_empty - Check if a dentry can be unlinked - * @dentry: dentry to be tested - * - * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. - */ -int simple_offset_empty(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct offset_ctx *octx; - struct dentry *child; - unsigned long index; - int ret = 1; - - if (!inode || !S_ISDIR(inode->i_mode)) - return ret; - - index = DIR_OFFSET_MIN; - octx = inode->i_op->get_offset_ctx(inode); - mt_for_each(&octx->mt, child, index, LONG_MAX) { - spin_lock(&child->d_lock); - if (simple_positive(child)) { - spin_unlock(&child->d_lock); - ret = 0; - break; - } - spin_unlock(&child->d_lock); - } - - return ret; -} - -/** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry @@ -450,14 +430,6 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } -static int offset_dir_open(struct inode *inode, struct file *file) -{ - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - - file->private_data = (void *)ctx->next_offset; - return 0; -} - /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -471,9 +443,6 @@ static int offset_dir_open(struct inode *inode, struct file *file) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_inode; - struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); - switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -486,62 +455,89 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } - /* In this case, ->private_data is protected by f_pos_lock */ - if (!offset) - file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } -static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) +static struct dentry *find_positive_dentry(struct dentry *parent, + struct dentry *dentry, + bool next) { - MA_STATE(mas, &octx->mt, offset, offset); + struct dentry *found = NULL; + + spin_lock(&parent->d_lock); + if (next) + dentry = d_next_sibling(dentry); + else if (!dentry) + dentry = d_first_child(parent); + hlist_for_each_entry_from(dentry, d_sib) { + if (!simple_positive(dentry)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(dentry)) + found = dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + if (likely(found)) + break; + } + spin_unlock(&parent->d_lock); + return found; +} + +static noinline_for_stack struct dentry * +offset_dir_lookup(struct dentry *parent, loff_t offset) +{ + struct inode *inode = d_inode(parent); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found 
= NULL; - rcu_read_lock(); - child = mas_find(&mas, LONG_MAX); - if (!child) - goto out; - spin_lock(&child->d_lock); - if (simple_positive(child)) - found = dget_dlock(child); - spin_unlock(&child->d_lock); -out: - rcu_read_unlock(); + MA_STATE(mas, &octx->mt, offset, offset); + + if (offset == DIR_OFFSET_FIRST) + found = find_positive_dentry(parent, NULL, false); + else { + rcu_read_lock(); + child = mas_find(&mas, DIR_OFFSET_MAX); + found = find_positive_dentry(parent, child, false); + rcu_read_unlock(); + } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - long offset = dentry2offset(dentry); - return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, - inode->i_ino, fs_umode_to_dtype(inode->i_mode)); + return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) +static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { - struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); + struct dentry *dir = file->f_path.dentry; struct dentry *dentry; + dentry = offset_dir_lookup(dir, ctx->pos); + if (!dentry) + goto out_eod; while (true) { - dentry = offset_find_next(octx, ctx->pos); - if (!dentry) - return; + struct dentry *next; - if (dentry2offset(dentry) >= last_index) { - dput(dentry); - return; - } - - if (!offset_dir_emit(ctx, dentry)) { - dput(dentry); - return; - } + ctx->pos = dentry2offset(dentry); + if (!offset_dir_emit(ctx, dentry)) + break; - ctx->pos = dentry2offset(dentry) + 1; + next = find_positive_dentry(dir, dentry, true); dput(dentry); + + if (!next) + goto out_eod; + dentry = next; } + dput(dentry); + return; + +out_eod: + ctx->pos = DIR_OFFSET_EOD; } /** @@ -561,6 +557,8 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. + * Caller places this value in the d_off field of the last entry in the + * user's buffer. 
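
[Editorial aside, not part of the patch] The simple_offset changes above bound directory offsets to a fixed window (DIR_OFFSET_MIN..DIR_OFFSET_MAX), map the maple tree's -EBUSY on exhaustion to -ENOSPC, and reserve DIR_OFFSET_FIRST/DIR_OFFSET_EOD as values that simple_offset_add() never assigns to a dentry. A rough userspace illustration of that cyclic allocation policy follows; the names and the tiny range are hypothetical, and this is a sketch of the policy, not the kernel's mtree_alloc_cyclic().

	/*
	 * Illustrative userspace analogue of cyclic offset allocation in a
	 * bounded range: offsets come from [OFFSET_MIN, OFFSET_MAX], the
	 * cursor wraps after OFFSET_MAX, and exhaustion reports -ENOSPC.
	 */
	#include <stdio.h>
	#include <errno.h>
	#include <stdbool.h>

	#define OFFSET_MIN 3          /* 2 is reserved for "first real entry" */
	#define OFFSET_MAX 10         /* tiny range so the example can exhaust it */

	static bool in_use[OFFSET_MAX + 1];
	static long next_offset = OFFSET_MIN;

	/* Return an unused offset, wrapping around once, or -ENOSPC. */
	static long alloc_cyclic(void)
	{
		long start = next_offset;

		for (long n = 0; n <= OFFSET_MAX - OFFSET_MIN; n++) {
			long cand = OFFSET_MIN + (start - OFFSET_MIN + n) %
						 (OFFSET_MAX - OFFSET_MIN + 1);
			if (!in_use[cand]) {
				in_use[cand] = true;
				next_offset = (cand == OFFSET_MAX) ? OFFSET_MIN : cand + 1;
				return cand;
			}
		}
		return -ENOSPC;
	}

	int main(void)
	{
		/* The last two requests overflow the 8-slot range and print -28. */
		for (int i = 0; i < 10; i++)
			printf("entry %d -> offset %ld\n", i, alloc_cyclic());
		return 0;
	}
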
* * Return values: * %0 - Complete @@ -568,19 +566,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; - long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - - offset_iterate_dir(d_inode(dir), ctx, last_index); + if (ctx->pos != DIR_OFFSET_EOD) + offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { - .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, @@ -669,6 +665,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; + s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); @@ -1711,15 +1708,6 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(struct mnt_idmap *idmap, - const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) -{ - struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - return 0; -} - static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { @@ -1733,9 +1721,7 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, - .permission = generic_permission, .setattr = empty_dir_setattr, - .getattr = empty_dir_getattr, .listxattr = empty_dir_listxattr, }; @@ -1791,8 +1777,8 @@ bool is_empty_dir_inode(struct inode *inode) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; @@ -1835,6 +1821,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } +EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1843,7 +1830,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1858,6 +1845,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } +EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index a3e97278b997..6ea3448d2d31 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -2,8 +2,9 @@ /* * linux/fs/lockd/clntxdr.c * - * XDR functions to encode/decode NLM version 3 RPC arguments and results. 
- * NLM version 3 is backwards compatible with NLM versions 1 and 2. + * XDR functions to encode/decode NLM version 1 and 3 RPC + * arguments and results. NLM version 2 is not specified + * by a standard, thus it is not implemented. * * NLM client-side only. * diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 8a72c418cdcc..109e5caae8c7 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -46,14 +46,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, if (filp != NULL) { int mode = lock_to_openmode(&lock->fl); + lock->fl.c.flc_flags = FL_POSIX; + error = nlm_lookup_file(rqstp, &file, lock); if (error) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.c.flc_flags = FL_POSIX; - lock->fl.c.flc_file = file->f_file[mode]; + lock->fl.c.flc_file = file->f_file[mode]; lock->fl.c.flc_pid = current->tgid; lock->fl.fl_start = (loff_t)lock->lock_start; lock->fl.fl_end = lock->lock_len ? @@ -108,7 +109,8 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ - resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); + resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, + &resp->lock); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -142,18 +144,6 @@ __nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; -#if 0 - /* If supplied state doesn't match current state, we assume it's - * an old request that time-warped somehow. Any error return would - * do in this case because it's irrelevant anyway. - * - * NB: We don't retrieve the remote host's state yet. - */ - if (host->h_nsmstate && host->h_nsmstate != argp->state) { - resp->status = nlm_lck_denied_nolocks; - } else -#endif - /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1f2149db10f2..c1315df4b350 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct inode *inode = nlmsvc_file_inode(file); + struct inode *inode __maybe_unused = nlmsvc_file_inode(file); struct nlm_block *block = NULL; int error; int mode; @@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; } @@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, * requests on the underlaying ->lock() implementation but * only one nlm_block to being granted by lm_grant(). 
*/ - if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) && !list_empty(&block->b_list)) { spin_unlock(&nlm_blocked_lock); ret = nlm_lck_blocked; @@ -609,7 +608,7 @@ out: __be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, - struct nlm_lock *conflock, struct nlm_cookie *cookie) + struct nlm_lock *conflock) { int error; int mode; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index a03220e66ce0..f53d5177f267 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -130,7 +130,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) test_owner = argp->lock.fl.c.flc_owner; /* Now check for conflicting locks */ - resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); + resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, + &argp->lock, &resp->lock)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -165,18 +166,6 @@ __nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; -#if 0 - /* If supplied state doesn't match current state, we assume it's - * an old request that time-warped somehow. Any error return would - * do in this case because it's irrelevant anyway. - * - * NB: We don't retrieve the remote host's state yet. - */ - if (host->h_nsmstate && host->h_nsmstate != argp->state) { - resp->status = nlm_lck_denied_nolocks; - } else -#endif - /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, argp->block, &argp->cookie, diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 3d28b9c3ed15..e343c820301f 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -89,7 +89,6 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) return false; locks_init_lock(fl); - fl->c.flc_flags = FL_POSIX; fl->c.flc_type = F_RDLCK; nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len); return true; @@ -268,7 +267,6 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32)0; diff --git a/fs/locks.c b/fs/locks.c index 204847628f3e..25afc8d9c9d1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2136,7 +2136,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { int can_sleep, error, type; struct file_lock fl; - struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2155,19 +2154,18 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (type < 0) return type; - error = -EBADF; - f = fdget(fd); - if (!fd_file(f)) - return error; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE))) - goto out_putf; + return -EBADF; flock_make_lock(fd_file(f), &fl, type); error = security_file_lock(fd_file(f), fl.c.flc_type); if (error) - goto out_putf; + return error; can_sleep = !(cmd & LOCK_NB); if (can_sleep) @@ -2181,9 +2179,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) error = locks_lock_file_wait(fd_file(f), &fl); locks_release_private(&fl); - out_putf: - fdput(f); - return error; } diff --git a/fs/mount.h b/fs/mount.h index 185fc56afc13..ffb613cdfeee 100644 --- a/fs/mount.h 
+++ b/fs/mount.h @@ -8,15 +8,23 @@ struct mnt_namespace { struct ns_common ns; struct mount * root; - struct rb_root mounts; /* Protected by namespace_sem */ + struct { + struct rb_root mounts; /* Protected by namespace_sem */ + struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */ + struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */ + }; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ - wait_queue_head_t poll; + union { + wait_queue_head_t poll; + struct rcu_head mnt_ns_rcu; + }; u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -38,6 +46,7 @@ struct mount { struct dentry *mnt_mountpoint; struct vfsmount mnt; union { + struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; @@ -51,10 +60,7 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - union { - struct rb_node mnt_node; /* Under ns->mounts */ - struct list_head mnt_list; - }; + struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -145,24 +151,28 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool mnt_ns_attached(const struct mount *mnt) +{ + return !RB_EMPTY_NODE(&mnt->mnt_node); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { - WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); - mnt->mnt.mnt_flags &= ~MNT_ONRB; - rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + struct mnt_namespace *ns = mnt->mnt_ns; + WARN_ON(!mnt_ns_attached(mnt)); + if (ns->mnt_last_node == &mnt->mnt_node) + ns->mnt_last_node = rb_prev(&mnt->mnt_node); + if (ns->mnt_first_node == &mnt->mnt_node) + ns->mnt_first_node = rb_next(&mnt->mnt_node); + rb_erase(&mnt->mnt_node, &ns->mounts); + RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); -static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, false); -} -static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, true); -} +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, + bool previous); + static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..e56c29a22d26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags) return result; } -struct filename * -getname_uflags(const char __user *filename, int uflags) +struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } -struct filename * -getname(const char __user * filename) +struct filename *getname(const char __user * filename) { return getname_flags(filename, 0); } -struct filename * -getname_kernel(const char * filename) +struct filename *__getname_maybe_null(const char __user *pathname) +{ + struct filename *name; + char c; + + /* try to save on allocations; loss on um, though */ + if (get_user(c, pathname)) + return ERR_PTR(-EFAULT); + if (!c) + return NULL; + + name = getname_flags(pathname, LOOKUP_EMPTY); + if (!IS_ERR(name) && !(name->name[0])) { + putname(name); + name = NULL; + } + return name; +} + +struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; @@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { - if (IS_ERR(name)) + if (IS_ERR_OR_NULL(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) @@ -326,6 +342,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +383,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? 
If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { @@ -588,6 +645,7 @@ struct nameidata { unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; + const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; @@ -606,6 +664,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->depth = 0; p->dfd = dfd; p->name = name; + p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; @@ -2439,7 +2498,7 @@ OK: static const char *path_init(struct nameidata *nd, unsigned flags) { int error; - const char *s = nd->name->name; + const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) @@ -2503,26 +2562,22 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(nd->dfd); + CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && - !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) { - fdput(f); + !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); - } } dentry = fd_file(f)->f_path.dentry; - if (*s && unlikely(!d_can_lookup(dentry))) { - fdput(f); + if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); - } nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { @@ -2532,7 +2587,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - fdput(f); } /* For scoped-lookups we need to set the root to the dirfd as well. 
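
[Editorial aside, not part of the patch] The acl_permission_check() fast path added above hinges on one line of arithmetic: !((mask & 7) * 0111 & ~mode). Below is a minimal userspace sketch of just that test, using the conventional MAY_READ/MAY_WRITE/MAY_EXEC bit values; it deliberately ignores the cached-ACL and IS_POSIXACL() checks that the real function still performs before returning 0.

	/*
	 * Worked example of "everybody already has the requested rights":
	 *   - mask & 7 keeps the rwx bits of the request
	 *   - multiplying by 0111 replicates them into the u/g/o positions
	 *   - AND-ing with ~mode exposes any requested bit some class lacks
	 *   - '!' means "no missing permissions"
	 */
	#include <stdio.h>

	#define MAY_EXEC  0x1
	#define MAY_WRITE 0x2
	#define MAY_READ  0x4

	static int everyone_has(unsigned int mask, unsigned int mode)
	{
		return !((mask & 7) * 0111 & ~mode);
	}

	int main(void)
	{
		/* 0644: everyone may read, but only the owner may write */
		printf("0644, MAY_READ          -> %d\n", everyone_has(MAY_READ, 0644));            /* 1 */
		printf("0644, MAY_WRITE         -> %d\n", everyone_has(MAY_WRITE, 0644));           /* 0 */
		/* 0777: every class has every bit, so any request passes */
		printf("0777, MAY_READ|MAY_EXEC -> %d\n", everyone_has(MAY_READ | MAY_EXEC, 0777)); /* 1 */
		return 0;
	}
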
*/ @@ -5218,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -5250,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -5268,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -5337,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } diff --git a/fs/namespace.c b/fs/namespace.c index 93c377816d75..4013fbac354a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,7 +32,7 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> -#include <linux/nospec.h> +#include <linux/pidfs.h> #include "pnode.h" #include "internal.h" @@ -66,12 +66,12 @@ static int __init set_mphash_entries(char *str) __setup("mphash_entries=", set_mphash_entries); static u64 event; -static DEFINE_IDA(mnt_id_ida); +static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); +static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; @@ -79,8 +79,10 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -static DEFINE_RWLOCK(mnt_ns_tree_lock); +static DEFINE_SEQLOCK(mnt_ns_tree_lock); + static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ +static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -106,17 +108,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) -{ - u64 seq_b = ns->seq; - - if (seq < seq_b) - return -1; - if (seq > seq_b) - return 1; - return 0; -} - static 
inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { if (!node) @@ -124,25 +115,52 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); } -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) { struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_b = node_to_mnt_ns(b); u64 seq_a = ns_a->seq; + u64 seq_b = ns_b->seq; + + if (seq_a < seq_b) + return -1; + if (seq_a > seq_b) + return 1; + return 0; +} + +static inline void mnt_ns_tree_write_lock(void) +{ + write_seqlock(&mnt_ns_tree_lock); +} - return mnt_ns_cmp(seq_a, ns_b) < 0; +static inline void mnt_ns_tree_write_unlock(void) +{ + write_sequnlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) { - guard(write_lock)(&mnt_ns_tree_lock); - rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); + struct rb_node *node, *prev; + + mnt_ns_tree_write_lock(); + node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->mnt_ns_tree_node); + if (!prev) + list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); + else + list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); + mnt_ns_tree_write_unlock(); + + WARN_ON_ONCE(node); } static void mnt_ns_release(struct mnt_namespace *ns) { - lockdep_assert_not_held(&mnt_ns_tree_lock); - /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { put_user_ns(ns->user_ns); @@ -151,41 +169,34 @@ static void mnt_ns_release(struct mnt_namespace *ns) } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +static void mnt_ns_release_rcu(struct rcu_head *rcu) +{ + mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); +} + static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (!is_anon_ns(ns)) { - guard(write_lock)(&mnt_ns_tree_lock); + mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + list_bidir_del_rcu(&ns->mnt_ns_list); + mnt_ns_tree_write_unlock(); } - mnt_ns_release(ns); + call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); } -/* - * Returns the mount namespace which either has the specified id, or has the - * next smallest id afer the specified one. - */ -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +static int mnt_ns_find(const void *key, const struct rb_node *node) { - struct rb_node *node = mnt_ns_tree.rb_node; - struct mnt_namespace *ret = NULL; + const u64 mnt_ns_id = *(u64 *)key; + const struct mnt_namespace *ns = node_to_mnt_ns(node); - lockdep_assert_held(&mnt_ns_tree_lock); - - while (node) { - struct mnt_namespace *n = node_to_mnt_ns(node); - - if (mnt_ns_id <= n->seq) { - ret = node_to_mnt_ns(node); - if (mnt_ns_id == n->seq) - break; - node = node->rb_left; - } else { - node = node->rb_right; - } - } - return ret; + if (mnt_ns_id < ns->seq) + return -1; + if (mnt_ns_id > ns->seq) + return 1; + return 0; } /* @@ -195,18 +206,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. 
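
[Editorial aside, not part of the patch] The lookup_mnt_ns() comment above describes a lockless, sequence-counter-protected search in which hits can be trusted and only misses need to be retried. The following is a minimal userspace sketch of that reader-side idiom only, with a hand-rolled C11 seqcount stand-in; it is not the kernel's seqlock/RCU machinery, and the write side is omitted.

	/*
	 * Reader-side retry-on-miss pattern: a hit is always valid, a miss is
	 * retried only if the write-side sequence count changed underneath us.
	 */
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_uint seq;                 /* even = stable, odd = write in progress */
	static int table[4] = { 10, 20, 30, 40 };

	static unsigned read_begin(void)
	{
		return atomic_load_explicit(&seq, memory_order_acquire);
	}

	static int read_retry(unsigned s)
	{
		return (s & 1) || atomic_load_explicit(&seq, memory_order_acquire) != s;
	}

	/* Returns the index holding @key, or -1 if it is really absent. */
	static int lookup(int key)
	{
		unsigned s;
		int idx;

		do {
			s = read_begin();
			idx = -1;
			for (int i = 0; i < 4; i++)
				if (table[i] == key) {
					idx = i;
					break;
				}
			if (idx >= 0)
				break;          /* hits never need a retry */
		} while (read_retry(s));        /* misses retry only if a writer interfered */
		return idx;
	}

	int main(void)
	{
		printf("lookup(30) -> %d\n", lookup(30));   /* 2 */
		printf("lookup(99) -> %d\n", lookup(99));   /* -1 */
		return 0;
	}
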
+ * + * Note the lookup is lockless protected by a sequence counter. We only + * need to guard against false negatives as false positives aren't + * possible. So if we didn't find a mount namespace and the sequence + * counter has changed we need to retry. If the sequence counter is + * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; + struct mnt_namespace *ns; + struct rb_node *node; + unsigned int seq; + + guard(rcu)(); + do { + seq = read_seqbegin(&mnt_ns_tree_lock); + node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); + if (node) + break; + } while (read_seqretry(&mnt_ns_tree_lock, seq)); - guard(read_lock)(&mnt_ns_tree_lock); - ns = mnt_ns_find_id_at(mnt_ns_id); - if (!ns || ns->seq != mnt_ns_id) - return NULL; + if (!node) + return NULL; - refcount_inc(&ns->passive); - return ns; + /* + * The last reference count is put with RCU delay so we can + * unconditonally acquire a reference here. + */ + ns = node_to_mnt_ns(node); + refcount_inc(&ns->passive); + return ns; } static inline void lock_mount_hash(void) @@ -236,18 +266,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) static int mnt_alloc_id(struct mount *mnt) { - int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); + int res; - if (res < 0) - return res; - mnt->mnt_id = res; - mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); - return 0; + xa_lock(&mnt_id_xa); + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); + if (!res) + mnt->mnt_id_unique = ++mnt_id_ctr; + xa_unlock(&mnt_id_xa); + return res; } static void mnt_free_id(struct mount *mnt) { - ida_free(&mnt_id_ida, mnt->mnt_id); + xa_erase(&mnt_id_xa, mnt->mnt_id); } /* @@ -344,6 +375,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -1123,19 +1155,27 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; + bool mnt_first_node = true, mnt_last_node = true; - WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; - if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; - else + mnt_last_node = false; + } else { link = &parent->rb_right; + mnt_first_node = false; + } } + + if (mnt_last_node) + ns->mnt_last_node = &mnt->mnt_node; + if (mnt_first_node) + ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - mnt->mnt.mnt_flags |= MNT_ONRB; } /* @@ -1305,7 +1345,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1763,7 +1803,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - if (p->mnt.mnt_flags & MNT_ONRB) + if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else 
list_move(&p->mnt_list, &tmp_list); @@ -1912,16 +1952,14 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } @@ -2055,9 +2093,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { + struct ns_common *ns; + /* Is this a proxy for a mount namespace? */ - return dentry->d_op == &ns_dentry_operations && - dentry->d_fsdata == &mntns_operations; + if (dentry->d_op != &ns_dentry_operations) + return false; + + ns = d_inode(dentry)->i_private; + + return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) @@ -2065,30 +2109,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) return &mnt->ns; } -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { - guard(read_lock)(&mnt_ns_tree_lock); + guard(rcu)(); + for (;;) { - struct rb_node *node; + struct list_head *list; if (previous) - node = rb_prev(&mntns->mnt_ns_tree_node); + list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); else - node = rb_next(&mntns->mnt_ns_tree_node); - if (!node) + list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); + if (list_is_head(list, &mnt_ns_list)) return ERR_PTR(-ENOENT); - mntns = node_to_mnt_ns(node); - node = &mntns->mnt_ns_tree_node; + mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + /* + * The last passive reference count is put with RCU + * delay so accessing the mount namespace is not just + * safe but all relevant members are still valid. + */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* - * Holding mnt_ns_tree_lock prevents the mount namespace from - * being freed but it may well be on it's deathbed. We want an - * active reference, not just a passive one here as we're - * persisting the mount namespace. + * We need an active reference count as we're persisting + * the mount namespace and it might already be on its + * deathbed. 
*/ if (!refcount_inc_not_zero(&mntns->ns.count)) continue; @@ -2732,8 +2780,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) - return mnt; + if (!check_mnt(old)) { + const struct dentry_operations *d_op = old_path->dentry->d_op; + + if (d_op != &ns_dentry_operations && + d_op != &pidfs_dentry_operations) + return mnt; + } if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; @@ -3835,7 +3888,7 @@ int path_mount(const char *dev_name, struct path *path, data_page); } -long do_mount(const char *dev_name, const char __user *dir_name, +int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; @@ -3901,10 +3954,11 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); @@ -3944,7 +3998,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - free_mnt_ns(new_ns); + ns_free_inum(&new_ns->ns); + dec_mnt_namespaces(new_ns->ucounts); + mnt_ns_release(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { @@ -3982,7 +4038,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -3990,6 +4045,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); + mnt_ns_tree_add(new_ns); return new_ns; } @@ -4105,7 +4161,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, struct file *file; struct path newmount; struct mount *mnt; - struct fd f; unsigned int mnt_flags = 0; long ret; @@ -4133,19 +4188,18 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, return -EINVAL; } - f = fdget(fs_fd); - if (!fd_file(f)) + CLASS(fd, f)(fs_fd); + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (fd_file(f)->f_op != &fscontext_fops) - goto err_fsfd; + return -EINVAL; fc = fd_file(f)->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) - goto err_fsfd; + return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; @@ -4212,8 +4266,6 @@ err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); -err_fsfd: - fdput(f); return ret; } @@ -4668,10 +4720,8 @@ out: static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { - int err = 0; struct ns_common *ns; struct user_namespace *mnt_userns; - struct fd f; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; @@ -4687,20 +4737,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (attr->userns_fd > INT_MAX) return -EINVAL; - f = fdget(attr->userns_fd); - if (!fd_file(f)) + CLASS(fd, f)(attr->userns_fd); + if (fd_empty(f)) return -EBADF; - if 
(!proc_ns_file(fd_file(f))) { - err = -EINVAL; - goto out_fput; - } + if (!proc_ns_file(fd_file(f))) + return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) { - err = -EINVAL; - goto out_fput; - } + if (ns->ops->type != CLONE_NEWUSER) + return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped @@ -4711,22 +4757,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); - if (mnt_userns == &init_user_ns) { - err = -EPERM; - goto out_fput; - } + if (mnt_userns == &init_user_ns) + return -EPERM; /* We're not controlling the target namespace. */ - if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { - err = -EPERM; - goto out_fput; - } + if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) + return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); - -out_fput: - fdput(f); - return err; + return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, @@ -5004,6 +5043,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5019,6 +5092,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) if (sb->s_op->show_options) { size_t start = seq->count; + err = security_sb_show_options(seq, sb); + if (err) + return err; + err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; @@ -5038,35 +5115,128 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_opt_process(struct seq_file *seq, size_t start) +{ + char *buf_end, *opt_end, *src, *dst; + int count = 0; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + buf_end = seq->buf + seq->count; + dst = seq->buf + start; + src = dst + 1; /* skip initial comma */ + + if (src >= buf_end) { + seq->count = start; + return 0; + } + + *buf_end = '\0'; + for (; src < buf_end; src = opt_end + 1) { + opt_end = strchrnul(src, ','); + *opt_end = '\0'; + dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == INT_MAX)) + return -EOVERFLOW; + } + seq->count = dst - 1 - seq->buf; + return count; +} + +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + int err; + + if (!sb->s_op->show_options) + return 0; + + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + err = statmount_opt_process(seq, start); + if (err < 0) + return err; + + s->sm.opt_num = err; + return 
0; +} + +static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + int err; + + err = security_sb_show_options(seq, sb); + if (err) + return err; + + err = statmount_opt_process(seq, start); + if (err < 0) + return err; + + s->sm.opt_sec_num = err; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; + case STATMOUNT_OPT_SEC_ARRAY: + sm->opt_sec_array = start; + ret = statmount_opt_sec_array(s, seq); + break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer. + */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) @@ -5201,6 +5371,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + + if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); + + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5222,7 +5404,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, @@ -5399,9 +5583,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, if (!last_mnt_id) { if (reverse) - first = node_to_mount(rb_last(&ns->mounts)); + first = node_to_mount(ns->mnt_last_node); else - first = node_to_mount(rb_first(&ns->mounts)); + first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index d08b0bfb6756..b43188d64bd8 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -13,8 +13,11 @@ netfs-y := \ read_collect.o \ read_pgpriv2.o \ read_retry.o \ + read_single.o \ 
+ rolling_buffer.o \ write_collect.o \ - write_issue.o + write_issue.o \ + write_retry.o netfs-$(CONFIG_NETFS_STATS) += stats.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index c40e226053cc..f761d44b3436 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -64,33 +64,6 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in } /* - * Decant the list of folios to read into a rolling buffer. - */ -static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, - struct folio_queue *folioq) -{ - unsigned int order, nr; - size_t size = 0; - - nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios, - ARRAY_SIZE(folioq->vec.folios)); - folioq->vec.nr = nr; - for (int i = 0; i < nr; i++) { - struct folio *folio = folioq_folio(folioq, i); - - trace_netfs_folio(folio, netfs_folio_trace_read); - order = folio_order(folio); - folioq->orders[i] = order; - size += PAGE_SIZE << order; - } - - for (int i = nr; i < folioq_nr_slots(folioq); i++) - folioq_clear(folioq, i); - - return size; -} - -/* * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O * @subreq: The subrequest to be set up * @@ -120,27 +93,24 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) * that we will need to release later - but we don't want to do * that until after we've started the I/O. */ + struct folio_batch put_batch; + + folio_batch_init(&put_batch); while (rreq->submitted < subreq->start + rsize) { - struct folio_queue *tail = rreq->buffer_tail, *new; - size_t added; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(new); - new->prev = tail; - tail->next = new; - rreq->buffer_tail = new; - added = netfs_load_buffer_from_ra(rreq, new); - rreq->iter.count += added; + ssize_t added; + + added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl, + &put_batch); + if (added < 0) + return added; rreq->submitted += added; } + folio_batch_release(&put_batch); } subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { - size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { @@ -149,20 +119,10 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) } } - subreq->io_iter = rreq->iter; - - if (iov_iter_is_folioq(&subreq->io_iter)) { - if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) { - subreq->io_iter.folioq = subreq->io_iter.folioq->next; - subreq->io_iter.folioq_slot = 0; - } - subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; - subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; - subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; - } + subreq->io_iter = rreq->buffer.iter; iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(&rreq->iter, subreq->len); + rolling_buffer_advance(&rreq->buffer, subreq->len); return subreq->len; } @@ -171,25 +131,14 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rr loff_t i_size) { struct netfs_cache_resources *cres = &rreq->cache_resources; + enum netfs_io_source source; if (!cres->ops) return NETFS_DOWNLOAD_FROM_SERVER; - return cres->ops->prepare_read(subreq, i_size); -} - -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_subrequest *subreq = 
priv; + source = cres->ops->prepare_read(subreq, i_size); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + return source; - if (transferred_or_error < 0) { - netfs_read_subreq_terminated(subreq, transferred_or_error, was_async); - return; - } - - if (transferred_or_error > 0) - subreq->transferred += transferred_or_error; - netfs_read_subreq_terminated(subreq, 0, was_async); } /* @@ -206,6 +155,47 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_io_stream *stream = &rreq->io_streams[0]; + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + + /* We add to the end of the list whilst the collector may be walking + * the list. The collector only goes nextwards and uses the lock to + * remove entries off of the front. + */ + spin_lock(&rreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; + if (!stream->active) { + stream->collected_to = stream->front->start; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + } + } + + spin_unlock(&rreq->lock); + + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); + break; + case NETFS_READ_FROM_CACHE: + netfs_read_cache_to_pagecache(rreq, subreq); + break; + default: + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + subreq->error = 0; + iov_iter_zero(subreq->len, &subreq->io_iter); + subreq->transferred = subreq->len; + netfs_read_subreq_terminated(subreq); + break; + } +} + /* * Perform a read to the pagecache from a series of sources of different types, * slicing up the region to be read according to available cache blocks and @@ -218,11 +208,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) ssize_t size = rreq->len; int ret = 0; - atomic_inc(&rreq->nr_outstanding); - do { struct netfs_io_subrequest *subreq; - enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + enum netfs_io_source source = NETFS_SOURCE_UNKNOWN; ssize_t slice; subreq = netfs_alloc_subrequest(rreq); @@ -234,20 +222,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) subreq->start = start; subreq->len = size; - atomic_inc(&rreq->nr_outstanding); - spin_lock_bh(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated = rreq->prev_donated; - rreq->prev_donated = 0; - trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock_bh(&rreq->lock); - source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); subreq->source = source; if (source == NETFS_DOWNLOAD_FROM_SERVER) { unsigned long long zp = umin(ictx->zero_point, rreq->i_size); size_t len = subreq->len; + if (unlikely(rreq->origin == NETFS_READ_SINGLE)) + zp = rreq->i_size; if (subreq->start >= zp) { subreq->source = source = NETFS_FILL_WITH_ZEROES; goto fill_with_zeroes; @@ -268,24 +250,17 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); + subreq->error = ret; + /* Not queued - release both refs. 
*/ + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_cancel); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); break; } trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); } - - slice = netfs_prepare_read_iterator(subreq); - if (slice < 0) { - atomic_dec(&rreq->nr_outstanding); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - ret = slice; - break; - } - - rreq->netfs_ops->issue_read(subreq); - goto done; + goto issue; } fill_with_zeroes: @@ -293,106 +268,50 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) subreq->source = NETFS_FILL_WITH_ZEROES; trace_netfs_sreq(subreq, netfs_sreq_trace_submit); netfs_stat(&netfs_n_rh_zero); - slice = netfs_prepare_read_iterator(subreq); - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_read_subreq_terminated(subreq, 0, false); - goto done; + goto issue; } if (source == NETFS_READ_FROM_CACHE) { trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - slice = netfs_prepare_read_iterator(subreq); - netfs_read_cache_to_pagecache(rreq, subreq); - goto done; + goto issue; } pr_err("Unexpected read source %u\n", source); WARN_ON_ONCE(1); break; - done: + issue: + slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) { + ret = slice; + subreq->error = ret; + trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); + /* Not queued - release both refs. */ + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + } size -= slice; start += slice; + if (size <= 0) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + + netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); + if (unlikely(size > 0)) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + netfs_wake_read_collector(rreq); + } /* Defer error return as we may need to wait for outstanding I/O. */ cmpxchg(&rreq->error, 0, ret); } -/* - * Wait for the read operation to complete, successfully or otherwise. - */ -static int netfs_wait_for_read(struct netfs_io_request *rreq) -{ - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - - return ret; -} - -/* - * Set up the initial folioq of buffer folios in the rolling buffer and set the - * iterator to refer to it. - */ -static int netfs_prime_buffer(struct netfs_io_request *rreq) -{ - struct folio_queue *folioq; - size_t added; - - folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); - if (!folioq) - return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(folioq); - rreq->buffer = folioq; - rreq->buffer_tail = folioq; - rreq->submitted = rreq->start; - iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); - - added = netfs_load_buffer_from_ra(rreq, folioq); - rreq->iter.count += added; - rreq->submitted += added; - return 0; -} - -/* - * Drop the ref on each folio that we inherited from the VM readahead code. We - * still have the folio locks to pin the page until we complete the I/O. - * - * Note that we can't just release the batch in each queue struct as we use the - * occupancy count in other places. 
- */ -static void netfs_put_ra_refs(struct folio_queue *folioq) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (folioq) { - for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { - struct folio *folio = folioq_folio(folioq, slot); - if (!folio) - continue; - trace_netfs_folio(folio, netfs_folio_trace_read_put); - if (!folio_batch_add(&fbatch, folio)) - folio_batch_release(&fbatch); - } - folioq = folioq->next; - } - - folio_batch_release(&fbatch); -} - /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -421,6 +340,8 @@ void netfs_readahead(struct readahead_control *ractl) if (IS_ERR(rreq)) return; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); + ret = netfs_begin_cache_read(rreq, ictx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) goto cleanup_free; @@ -432,13 +353,11 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); rreq->ractl = ractl; - if (netfs_prime_buffer(rreq) < 0) + rreq->submitted = rreq->start; + if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) goto cleanup_free; netfs_read_to_pagecache(rreq); - /* Release the folio refs whilst we're waiting for the I/O. */ - netfs_put_ra_refs(rreq->buffer); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return; @@ -451,23 +370,18 @@ EXPORT_SYMBOL(netfs_readahead); /* * Create a rolling buffer with a single occupying folio. */ -static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) +static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio, + unsigned int rollbuf_flags) { - struct folio_queue *folioq; + ssize_t added; - folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); - if (!folioq) + if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) return -ENOMEM; - netfs_stat(&netfs_n_folioq); - folioq_init(folioq); - folioq_append(folioq, folio); - BUG_ON(folioq_folio(folioq, 0) != folio); - BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); - rreq->buffer = folioq; - rreq->buffer_tail = folioq; - rreq->submitted = rreq->start + rreq->len; - iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len); + added = rolling_buffer_append(&rreq->buffer, folio, rollbuf_flags); + if (added < 0) + return added; + rreq->submitted = rreq->start + added; rreq->ractl = (struct readahead_control *)1UL; return 0; } @@ -535,7 +449,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) } if (to < flen) bvec_set_folio(&bvec[i++], folio, flen - to, to); - iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len); rreq->submitted = rreq->start + flen; netfs_read_to_pagecache(rreq); @@ -544,7 +458,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) folio_put(sink); ret = netfs_wait_for_read(rreq); - if (ret == 0) { + if (ret >= 0) { flush_dcache_folio(folio); folio_mark_uptodate(folio); } @@ -603,7 +517,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, 0); if (ret < 0) goto discard; @@ -646,7 +560,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, if (unlikely(always_fill)) { if (pos - offset + len <= i_size) return false; 
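
/*
 * [Editor's illustrative aside -- not part of the patch above or below.]
 * The surrounding netfs_skip_folio_read() hunk replaces the page-based
 * zero_user_segment()/zero_user_segments() helpers with their folio-based
 * equivalents.  A minimal sketch of that pattern, assuming a locked folio
 * into which a byte range (offset/len) has already been copied; the
 * function name and parameters here are hypothetical:
 */
#include <linux/highmem.h>
#include <linux/pagemap.h>

static void sketch_zero_outside_copied_range(struct folio *folio,
					     size_t offset, size_t len)
{
	size_t fsize = folio_size(folio);

	/* Zero [0, offset) and [offset + len, fsize) in a single call. */
	folio_zero_segments(folio, 0, offset, offset + len, fsize);

	/* The whole folio now holds valid data, so it can be marked
	 * up to date before the caller unlocks it.
	 */
	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);
}
/* [End of aside; the patch text continues.] */
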
/* Page entirely before EOF */ - zero_user_segment(&folio->page, 0, plen); + folio_zero_segment(folio, 0, plen); folio_mark_uptodate(folio); return true; } @@ -665,7 +579,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, return false; zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, plen); + folio_zero_segments(folio, 0, offset, offset + len, plen); return true; } @@ -732,7 +646,7 @@ retry: if (folio_test_uptodate(folio)) goto have_folio; - /* If the page is beyond the EOF, we want to clear it - unless it's + /* If the folio is beyond the EOF, we want to clear it - unless it's * within the cache granule containing the EOF, in which case we need * to preload the granule. */ @@ -760,7 +674,7 @@ retry: trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, 0); if (ret < 0) goto error_put; @@ -792,7 +706,7 @@ error: EXPORT_SYMBOL(netfs_write_begin); /* - * Preload the data into a page we're proposing to write into. + * Preload the data into a folio we're proposing to write into. */ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) @@ -825,15 +739,14 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); /* Set up the output buffer */ - ret = netfs_create_singular_buffer(rreq, folio); + ret = netfs_create_singular_buffer(rreq, folio, NETFS_ROLLBUF_PAGECACHE_MARK); if (ret < 0) goto error_put; - folioq_mark2(rreq->buffer, 0); netfs_read_to_pagecache(rreq); ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); - return ret; + return ret < 0 ? ret : 0; error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b3910dfcb56d..b4826360a411 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, * netfs_perform_write - Copy data into the pagecache. * @iocb: The operation parameters * @iter: The source buffer - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * - * Copy data into pagecache pages attached to the inode specified by @iocb. + * Copy data into pagecache folios attached to the inode specified by @iocb. * The caller must hold appropriate inode locks. * - * Dirty pages are tagged with a netfs_folio struct if they're not up to date - * to indicate the range modified. Dirty pages may also be tagged with a + * Dirty folios are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty folios may also be tagged with a * netfs-specific grouping such that data from an old group gets flushed before * a new one is started. */ @@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * we try to read it. 
*/ if (fpos >= ctx->zero_point) { - zero_user_segment(&folio->page, 0, offset); + folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - zero_user_segment(&folio->page, offset + copied, flen); + folio_zero_segment(folio, offset + copied, flen); __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); trace_netfs_folio(folio, netfs_modify_and_clear); @@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write); * netfs_buffered_write_iter_locked - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter); /* * Notification that a previously read-only page is about to become writable. - * Note that the caller indicates a single page of a multipage folio. + * The caller indicates the precise page that needs to be written to, but + * we only track group on a per-folio basis, so we block more often than + * we might otherwise. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) { @@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); - vm_fault_t ret = VM_FAULT_RETRY; + vm_fault_t ret = VM_FAULT_NOPAGE; int err; _enter("%lx", folio->index); @@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; - if (folio->mapping != mapping) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (folio_wait_writeback_killable(folio)) { - ret = VM_FAULT_LOCKED; - goto out; - } + if (folio->mapping != mapping) + goto unlock; + if (folio_wait_writeback_killable(folio) < 0) + goto unlock; /* Can we see a streaming write here? 
*/ if (WARN_ON(!folio_test_uptodate(folio))) { - ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; - goto out; + ret = VM_FAULT_SIGBUS; + goto unlock; } group = netfs_folio_group(folio); @@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr out: sb_end_pagefault(inode->i_sb); return ret; +unlock: + folio_unlock(folio); + goto out; } EXPORT_SYMBOL(netfs_page_mkwrite); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index b1a66a6e6bc2..0bf3c2f5a710 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -25,7 +25,7 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) subreq->len = rsize; if (unlikely(rreq->io_streams[0].sreq_max_segs)) { - size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize, rreq->io_streams[0].sreq_max_segs); if (limit < rsize) { @@ -36,9 +36,9 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - subreq->io_iter = rreq->iter; + subreq->io_iter = rreq->buffer.iter; iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(&rreq->iter, subreq->len); + iov_iter_advance(&rreq->buffer.iter, subreq->len); } /* @@ -47,12 +47,11 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) */ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) { + struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned long long start = rreq->start; ssize_t size = rreq->len; int ret = 0; - atomic_set(&rreq->nr_outstanding, 1); - do { struct netfs_io_subrequest *subreq; ssize_t slice; @@ -67,19 +66,25 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) subreq->start = start; subreq->len = size; - atomic_inc(&rreq->nr_outstanding); - spin_lock_bh(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated = rreq->prev_donated; - rreq->prev_donated = 0; + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + + spin_lock(&rreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { + stream->front = subreq; + if (!stream->active) { + stream->collected_to = stream->front->start; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + } + } trace_netfs_sreq(subreq, netfs_sreq_trace_added); - spin_unlock_bh(&rreq->lock); + spin_unlock(&rreq->lock); netfs_stat(&netfs_n_rh_download); if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - atomic_dec(&rreq->nr_outstanding); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); break; } @@ -87,20 +92,32 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) netfs_prepare_dio_read_iterator(subreq); slice = subreq->len; - rreq->netfs_ops->issue_read(subreq); - size -= slice; start += slice; rreq->submitted += slice; + if (size <= 0) { + smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + + rreq->netfs_ops->issue_read(subreq); + if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + netfs_wait_for_pause(rreq); + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) + break; if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) break; cond_resched(); } while (size > 0); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); + if (unlikely(size > 0)) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + netfs_wake_read_collector(rreq); + } + return ret; } @@ -133,21 +150,10 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) goto out; } - if (sync) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - } else { + if (sync) + ret = netfs_wait_for_read(rreq); + else ret = -EIOCBQUEUED; - } - out: _leave(" = %d", ret); return ret; @@ -199,15 +205,15 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i * the request. */ if (user_backed_iter(iter)) { - ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); + ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0); if (ret < 0) goto out; - rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; + rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec; rreq->direct_bv_count = ret; rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); - rreq->len = iov_iter_count(&rreq->iter); + rreq->len = iov_iter_count(&rreq->buffer.iter); } else { - rreq->iter = *iter; + rreq->buffer.iter = *iter; rreq->len = orig_count; rreq->direct_bv_unpin = false; iov_iter_advance(iter, orig_count); @@ -215,8 +221,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i // TODO: Set up bounce buffer if needed - if (!sync) + if (!sync) { rreq->iocb = iocb; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags); + } ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 88f2adfab75e..42ce53cc216e 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -67,20 +67,23 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * * allocate a sufficiently large bvec array and may shorten the * request. */ - if (async || user_backed_iter(iter)) { - n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); + if (user_backed_iter(iter)) { + n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0); if (n < 0) { ret = n; goto out; } - wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; + wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec; wreq->direct_bv_count = n; wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); } else { - wreq->iter = *iter; + /* If this is a kernel-generated async DIO request, + * assume that any resources the iterator points to + * (eg. a bio_vec array) will persist till the end of + * the op. 
+ */ + wreq->buffer.iter = *iter; } - - wreq->io_iter = wreq->iter; } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); @@ -92,7 +95,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); if (async) wreq->iocb = iocb; - wreq->len = iov_iter_count(&wreq->io_iter); + wreq->len = iov_iter_count(&wreq->buffer.iter); wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { @@ -104,7 +107,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - smp_rmb(); /* Read error/transferred after RIP flag */ ret = wreq->error; if (ret == 0) { ret = wreq->transferred; diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 38637e5c9b57..b1722a82c03d 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -9,7 +9,6 @@ #include <linux/uio.h> #include <linux/bvec.h> #include <linux/slab.h> -#include <linux/uio.h> #include "internal.h" /** diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cb75c07b5281..ced14ac78cc1 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -322,8 +322,7 @@ maybe_wait: } return; no_wait: - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); } /* diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index c562aec3b483..eb76f98c894b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,6 +23,7 @@ /* * buffered_read.c */ +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); @@ -58,11 +59,8 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ -struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq); -int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, - bool needs_put); -struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); -void netfs_clear_buffer(struct netfs_io_request *rreq); +struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, + enum netfs_folioq_trace trace); void netfs_reset_iter(struct netfs_io_subrequest *subreq); /* @@ -84,20 +82,27 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); } +static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, + enum netfs_sreq_ref_trace what) +{ + trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), what); +} + /* * read_collect.c */ -void netfs_read_termination_worker(struct work_struct *work); -void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); +void netfs_read_collection_worker(struct work_struct *work); +void netfs_wake_read_collector(struct netfs_io_request *rreq); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); +void netfs_wait_for_pause(struct netfs_io_request *rreq); /* * read_pgpriv2.c */ -void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, 
- struct folio_queue *folioq, - int slot); -void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); +void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio); +void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq); bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq); /* @@ -113,6 +118,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq); extern atomic_t netfs_n_rh_dio_read; extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_read_folio; +extern atomic_t netfs_n_rh_read_single; extern atomic_t netfs_n_rh_rreq; extern atomic_t netfs_n_rh_sreq; extern atomic_t netfs_n_rh_download; @@ -181,9 +187,9 @@ void netfs_reissue_write(struct netfs_io_stream *stream, struct iov_iter *source); void netfs_issue_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream); -int netfs_advance_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start, size_t len, bool to_eof); +size_t netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof); struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, @@ -193,6 +199,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* + * write_retry.c + */ +void netfs_retry_writes(struct netfs_io_request *wreq); + +/* * Miscellaneous functions. */ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 21eab56ee2f9..2249ecd09d0a 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -109,6 +109,7 @@ int netfs_start_io_write(struct inode *inode) up_write(&inode->i_rwsem); return -ERESTARTSYS; } + downgrade_write(&inode->i_rwsem); return 0; } EXPORT_SYMBOL(netfs_start_io_write); @@ -123,7 +124,7 @@ EXPORT_SYMBOL(netfs_start_io_write); void netfs_end_io_write(struct inode *inode) __releases(inode->i_rwsem) { - up_write(&inode->i_rwsem); + up_read(&inode->i_rwsem); } EXPORT_SYMBOL(netfs_end_io_write); diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 6c7be1377ee0..4e3e62040831 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -37,9 +37,11 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", [NETFS_READ_GAPS] = "RG", + [NETFS_READ_SINGLE] = "R1", [NETFS_READ_FOR_WRITE] = "RW", [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", + [NETFS_WRITEBACK_SINGLE] = "W1", [NETFS_WRITETHROUGH] = "WT", [NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_DIO_WRITE] = "DW", @@ -69,7 +71,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) refcount_read(&rreq->ref), rreq->flags, rreq->error, - atomic_read(&rreq->nr_outstanding), + 0, rreq->start, rreq->submitted, rreq->len); seq_putc(m, '\n'); return 0; @@ -116,7 +118,7 @@ static int __init netfs_init(void) goto error_reqpool; netfs_subrequest_slab = kmem_cache_create("netfs_subrequest", - sizeof(struct netfs_io_subrequest), 0, + sizeof(struct netfs_io_subrequest) + 16, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!netfs_subrequest_slab) diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 78fe5796b2b2..7099aa07737a 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,113 +8,101 @@ #include 
<linux/swap.h> #include "internal.h" -/* - * Make sure there's space in the rolling queue. +/** + * netfs_alloc_folioq_buffer - Allocate buffer space into a folio queue + * @mapping: Address space to set on the folio (or NULL). + * @_buffer: Pointer to the folio queue to add to (may point to a NULL; updated). + * @_cur_size: Current size of the buffer (updated). + * @size: Target size of the buffer. + * @gfp: The allocation constraints. */ -struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq) +int netfs_alloc_folioq_buffer(struct address_space *mapping, + struct folio_queue **_buffer, + size_t *_cur_size, ssize_t size, gfp_t gfp) { - struct folio_queue *tail = rreq->buffer_tail, *prev; - unsigned int prev_nr_slots = 0; - - if (WARN_ON_ONCE(!rreq->buffer && tail) || - WARN_ON_ONCE(rreq->buffer && !tail)) - return ERR_PTR(-EIO); - - prev = tail; - if (prev) { - if (!folioq_full(tail)) - return tail; - prev_nr_slots = folioq_nr_slots(tail); - } - - tail = kmalloc(sizeof(*tail), GFP_NOFS); - if (!tail) - return ERR_PTR(-ENOMEM); - netfs_stat(&netfs_n_folioq); - folioq_init(tail); - tail->prev = prev; - if (prev) - /* [!] NOTE: After we set prev->next, the consumer is entirely - * at liberty to delete prev. - */ - WRITE_ONCE(prev->next, tail); - - rreq->buffer_tail = tail; - if (!rreq->buffer) { - rreq->buffer = tail; - iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); - } else { - /* Make sure we don't leave the master iterator pointing to a - * block that might get immediately consumed. - */ - if (rreq->io_iter.folioq == prev && - rreq->io_iter.folioq_slot == prev_nr_slots) { - rreq->io_iter.folioq = tail; - rreq->io_iter.folioq_slot = 0; + struct folio_queue *tail = *_buffer, *p; + + size = round_up(size, PAGE_SIZE); + if (*_cur_size >= size) + return 0; + + if (tail) + while (tail->next) + tail = tail->next; + + do { + struct folio *folio; + int order = 0, slot; + + if (!tail || folioq_full(tail)) { + p = netfs_folioq_alloc(0, GFP_NOFS, netfs_trace_folioq_alloc_buffer); + if (!p) + return -ENOMEM; + if (tail) { + tail->next = p; + p->prev = tail; + } else { + *_buffer = p; + } + tail = p; } - } - rreq->buffer_tail_slot = 0; - return tail; -} -/* - * Append a folio to the rolling queue. - */ -int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, - bool needs_put) -{ - struct folio_queue *tail; - unsigned int slot, order = folio_order(folio); + if (size - *_cur_size > PAGE_SIZE) + order = umin(ilog2(size - *_cur_size) - PAGE_SHIFT, + MAX_PAGECACHE_ORDER); - tail = netfs_buffer_make_space(rreq); - if (IS_ERR(tail)) - return PTR_ERR(tail); + folio = folio_alloc(gfp, order); + if (!folio && order > 0) + folio = folio_alloc(gfp, 0); + if (!folio) + return -ENOMEM; - rreq->io_iter.count += PAGE_SIZE << order; + folio->mapping = mapping; + folio->index = *_cur_size / PAGE_SIZE; + trace_netfs_folio(folio, netfs_folio_trace_alloc_buffer); + slot = folioq_append_mark(tail, folio); + *_cur_size += folioq_folio_size(tail, slot); + } while (*_cur_size < size); - slot = folioq_append(tail, folio); - /* Store the counter after setting the slot. */ - smp_store_release(&rreq->buffer_tail_slot, slot); return 0; } +EXPORT_SYMBOL(netfs_alloc_folioq_buffer); -/* - * Delete the head of a rolling queue. +/** + * netfs_free_folioq_buffer - Free a folio queue. + * @fq: The start of the folio queue to free + * + * Free up a chain of folio_queues and, if marked, the marked folios they point + * to. 
*/ -struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) +void netfs_free_folioq_buffer(struct folio_queue *fq) { - struct folio_queue *head = wreq->buffer, *next = head->next; - - if (next) - next->prev = NULL; - netfs_stat_d(&netfs_n_folioq); - kfree(head); - wreq->buffer = next; - return next; -} + struct folio_queue *next; + struct folio_batch fbatch; -/* - * Clear out a rolling queue. - */ -void netfs_clear_buffer(struct netfs_io_request *rreq) -{ - struct folio_queue *p; + folio_batch_init(&fbatch); + + for (; fq; fq = next) { + for (int slot = 0; slot < folioq_count(fq); slot++) { + struct folio *folio = folioq_folio(fq, slot); - while ((p = rreq->buffer)) { - rreq->buffer = p->next; - for (int slot = 0; slot < folioq_count(p); slot++) { - struct folio *folio = folioq_folio(p, slot); - if (!folio) + if (!folio || + !folioq_is_marked(fq, slot)) continue; - if (folioq_is_marked(p, slot)) { - trace_netfs_folio(folio, netfs_folio_trace_put); - folio_put(folio); - } + + trace_netfs_folio(folio, netfs_folio_trace_put); + if (folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); } + netfs_stat_d(&netfs_n_folioq); - kfree(p); + next = fq->next; + kfree(fq); } + + folio_batch_release(&fbatch); } +EXPORT_SYMBOL(netfs_free_folioq_buffer); /* * Reset the subrequest iterator to refer just to the region remaining to be diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 31e388ec6e48..dc6b41ef18b0 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -48,17 +48,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, spin_lock_init(&rreq->lock); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); - INIT_LIST_HEAD(&rreq->subrequests); + init_waitqueue_head(&rreq->waitq); refcount_set(&rreq->ref, 1); if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || + origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || - origin == NETFS_DIO_READ) - INIT_WORK(&rreq->work, netfs_read_termination_worker); - else + origin == NETFS_DIO_READ) { + INIT_WORK(&rreq->work, netfs_read_collection_worker); + rreq->io_streams[0].avail = true; + } else { INIT_WORK(&rreq->work, netfs_write_collection_worker); + } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (file && file->f_flags & O_NONBLOCK) @@ -92,14 +95,6 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) struct netfs_io_stream *stream; int s; - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async, - netfs_sreq_trace_put_clear); - } - for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { stream = &rreq->io_streams[s]; while (!list_empty(&stream->subrequests)) { @@ -143,7 +138,7 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } - netfs_clear_buffer(rreq); + rolling_buffer_clear(&rreq->buffer); if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index b18c65ba5580..f65affa5a9e4 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -14,6 +14,14 @@ #include <linux/task_io_accounting_ops.h> #include "internal.h" +/* Notes made in the collector */ +#define HIT_PENDING 0x01 /* A front op was still pending */ +#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ 
+#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x10 /* A front op requests retrying */ +#define COPY_TO_CACHE 0x40 /* Need to copy subrequest to cache */ +#define ABANDON_SREQ 0x80 /* Need to abandon untransferred part of subrequest */ + /* * Clear the unread part of an I/O request. */ @@ -31,14 +39,18 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq) * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it * dirty and let writeback handle it. */ -static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, +static void netfs_unlock_read_folio(struct netfs_io_request *rreq, struct folio_queue *folioq, int slot) { struct netfs_folio *finfo; struct folio *folio = folioq_folio(folioq, slot); + if (unlikely(folio_pos(folio) < rreq->abandon_to)) { + trace_netfs_folio(folio, netfs_folio_trace_abandon); + goto just_unlock; + } + flush_dcache_folio(folio); folio_mark_uptodate(folio); @@ -53,7 +65,7 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, kfree(finfo); } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) { if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); @@ -62,12 +74,15 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, } else { trace_netfs_folio(folio, netfs_folio_trace_read_done); } + + folioq_clear(folioq, slot); } else { // TODO: Use of PG_private_2 is deprecated. - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) - netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) + netfs_pgpriv2_copy_to_cache(rreq, folio); } +just_unlock: if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { @@ -77,237 +92,249 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, folio_unlock(folio); } } + + folioq_clear(folioq, slot); } /* - * Unlock any folios that are now completely read. Returns true if the - * subrequest is removed from the list. + * Unlock any folios we've finished with. 
*/ -static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) +static void netfs_read_unlock_folios(struct netfs_io_request *rreq, + unsigned int *notes) { - struct netfs_io_subrequest *prev, *next; - struct netfs_io_request *rreq = subreq->rreq; - struct folio_queue *folioq = subreq->curr_folioq; - size_t avail, prev_donated, next_donated, fsize, part, excess; - loff_t fpos, start; - loff_t fend; - int slot = subreq->curr_folioq_slot; - - if (WARN(subreq->transferred > subreq->len, - "Subreq overread: R%x[%x] %zu > %zu", - rreq->debug_id, subreq->debug_index, - subreq->transferred, subreq->len)) - subreq->transferred = subreq->len; - -next_folio: - fsize = PAGE_SIZE << subreq->curr_folio_order; - fpos = round_down(subreq->start + subreq->consumed, fsize); - fend = fpos + fsize; - - if (WARN_ON_ONCE(!folioq) || - WARN_ON_ONCE(!folioq_folio(folioq, slot)) || - WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { - pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->start + subreq->transferred - 1, - subreq->consumed, subreq->transferred, subreq->len, - slot); - if (folioq) { - struct folio *folio = folioq_folio(folioq, slot); - - pr_err("folioq: orders=%02x%02x%02x%02x\n", - folioq->orders[0], folioq->orders[1], - folioq->orders[2], folioq->orders[3]); - if (folio) - pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", - fpos, fend - 1, folio_pos(folio), folio_order(folio), - folioq_folio_order(folioq, slot)); - } - } + struct folio_queue *folioq = rreq->buffer.tail; + unsigned long long collected_to = rreq->collected_to; + unsigned int slot = rreq->buffer.first_tail_slot; -donation_changed: - /* Try to consume the current folio if we've hit or passed the end of - * it. There's a possibility that this subreq doesn't start at the - * beginning of the folio, in which case we need to donate to/from the - * preceding subreq. - * - * We also need to include any potential donation back from the - * following subreq. - */ - prev_donated = READ_ONCE(subreq->prev_donated); - next_donated = READ_ONCE(subreq->next_donated); - if (prev_donated || next_donated) { - spin_lock_bh(&rreq->lock); - prev_donated = subreq->prev_donated; - next_donated = subreq->next_donated; - subreq->start -= prev_donated; - subreq->len += prev_donated; - subreq->transferred += prev_donated; - prev_donated = subreq->prev_donated = 0; - if (subreq->transferred == subreq->len) { - subreq->len += next_donated; - subreq->transferred += next_donated; - next_donated = subreq->next_donated = 0; + if (rreq->cleaned_to >= rreq->collected_to) + return; + + // TODO: Begin decryption + + if (slot >= folioq_nr_slots(folioq)) { + folioq = rolling_buffer_delete_spent(&rreq->buffer); + if (!folioq) { + rreq->front_folio_order = 0; + return; } - trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); - spin_unlock_bh(&rreq->lock); + slot = 0; } - avail = subreq->transferred; - if (avail == subreq->len) - avail += next_donated; - start = subreq->start; - if (subreq->consumed == 0) { - start -= prev_donated; - avail += prev_donated; - } else { - start += subreq->consumed; - avail -= subreq->consumed; - } - part = umin(avail, fsize); - - trace_netfs_progress(subreq, start, avail, part); - - if (start + avail >= fend) { - if (fpos == start) { - /* Flush, unlock and mark for caching any folio we've just read. 
*/ - subreq->consumed = fend - subreq->start; - netfs_unlock_read_folio(subreq, rreq, folioq, slot); - folioq_mark2(folioq, slot); - if (subreq->consumed >= subreq->len) - goto remove_subreq; - } else if (fpos < start) { - excess = fend - subreq->start; - - spin_lock_bh(&rreq->lock); - /* If we complete first on a folio split with the - * preceding subreq, donate to that subreq - otherwise - * we get the responsibility. - */ - if (subreq->prev_donated != prev_donated) { - spin_unlock_bh(&rreq->lock); - goto donation_changed; - } + for (;;) { + struct folio *folio; + unsigned long long fpos, fend; + unsigned int order; + size_t fsize; - if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { - spin_unlock_bh(&rreq->lock); - pr_err("Can't donate prior to front\n"); - goto bad; - } + if (*notes & COPY_TO_CACHE) + set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); - prev = list_prev_entry(subreq, rreq_link); - WRITE_ONCE(prev->next_donated, prev->next_donated + excess); - subreq->start += excess; - subreq->len -= excess; - subreq->transferred -= excess; - trace_netfs_donate(rreq, subreq, prev, excess, - netfs_trace_donate_tail_to_prev); - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); - - if (subreq->consumed >= subreq->len) - goto remove_subreq_locked; - spin_unlock_bh(&rreq->lock); - } else { - pr_err("fpos > start\n"); - goto bad; - } + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_locked(folio), + "R=%08x: folio %lx is not locked\n", + rreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_locked); + + order = folioq_folio_order(folioq, slot); + rreq->front_folio_order = order; + fsize = PAGE_SIZE << order; + fpos = folio_pos(folio); + fend = umin(fpos + fsize, rreq->i_size); - /* Advance the rolling buffer to the next folio. */ + trace_netfs_collect_folio(rreq, folio, fend, collected_to); + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + netfs_unlock_read_folio(rreq, folioq, slot); + WRITE_ONCE(rreq->cleaned_to, fpos + fsize); + *notes |= MADE_PROGRESS; + + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); + + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { + folioq = rolling_buffer_delete_spent(&rreq->buffer); + if (!folioq) + goto done; slot = 0; - folioq = folioq->next; - subreq->curr_folioq = folioq; + trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress); } - subreq->curr_folioq_slot = slot; - if (folioq && folioq_folio(folioq, slot)) - subreq->curr_folio_order = folioq->orders[slot]; - if (!was_async) - cond_resched(); - goto next_folio; + + if (fpos + fsize >= collected_to) + break; } - /* Deal with partial progress. */ - if (subreq->transferred < subreq->len) - return false; + rreq->buffer.tail = folioq; +done: + rreq->buffer.first_tail_slot = slot; +} - /* Donate the remaining downloaded data to one of the neighbouring - * subrequests. Note that we may race with them doing the same thing. +/* + * Collect and assess the results of various read subrequests. We may need to + * retry some of the results. + * + * Note that we have a sequence of subrequests, which may be drawing on + * different sources and may or may not be the same size or starting position + * and may not even correspond in boundary alignment. 
+ */ +static void netfs_collect_read_results(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *front, *remove; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + unsigned int notes; + + _enter("%llx-%llx", rreq->start, rreq->start + rreq->len); + trace_netfs_rreq(rreq, netfs_rreq_trace_collect); + trace_netfs_collect(rreq); + +reassess: + if (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE) + notes = BUFFERED; + else + notes = 0; + + /* Remove completed subrequests from the front of the stream and + * advance the completion point. We stop when we hit something that's + * in progress. The issuer thread may be adding stuff to the tail + * whilst we're doing this. */ - spin_lock_bh(&rreq->lock); + front = READ_ONCE(stream->front); + while (front) { + size_t transferred; - if (subreq->prev_donated != prev_donated || - subreq->next_donated != next_donated) { - spin_unlock_bh(&rreq->lock); - cond_resched(); - goto donation_changed; - } + trace_netfs_collect_sreq(rreq, front); + _debug("sreq [%x] %llx %zx/%zx", + front->debug_index, front->start, front->transferred, front->len); - /* Deal with the trickiest case: that this subreq is in the middle of a - * folio, not touching either edge, but finishes first. In such a - * case, we donate to the previous subreq, if there is one, so that the - * donation is only handled when that completes - and remove this - * subreq from the list. - * - * If the previous subreq finished first, we will have acquired their - * donation and should be able to unlock folios and/or donate nextwards. - */ - if (!subreq->consumed && - !prev_donated && - !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { - prev = list_prev_entry(subreq, rreq_link); - WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); - subreq->start += subreq->len; - subreq->len = 0; - subreq->transferred = 0; - trace_netfs_donate(rreq, subreq, prev, subreq->len, - netfs_trace_donate_to_prev); - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); - goto remove_subreq_locked; + if (stream->collected_to < front->start) { + trace_netfs_collect_gap(rreq, stream, front->start, 'F'); + stream->collected_to = front->start; + } + + if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) + notes |= HIT_PENDING; + smp_rmb(); /* Read counters after IN_PROGRESS flag. */ + transferred = READ_ONCE(front->transferred); + + /* If we can now collect the next folio, do so. We don't want + * to defer this as we have to decide whether we need to copy + * to the cache or not, and that may differ between adjacent + * subreqs. + */ + if (notes & BUFFERED) { + size_t fsize = PAGE_SIZE << rreq->front_folio_order; + + /* Clear the tail of a short read. 
*/ + if (!(notes & HIT_PENDING) && + front->error == 0 && + transferred < front->len && + (test_bit(NETFS_SREQ_HIT_EOF, &front->flags) || + test_bit(NETFS_SREQ_CLEAR_TAIL, &front->flags))) { + netfs_clear_unread(front); + transferred = front->transferred = front->len; + trace_netfs_sreq(front, netfs_sreq_trace_clear); + } + + stream->collected_to = front->start + transferred; + rreq->collected_to = stream->collected_to; + + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &front->flags)) + notes |= COPY_TO_CACHE; + + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + rreq->abandon_to = front->start + front->len; + front->transferred = front->len; + transferred = front->len; + trace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon); + } + if (front->start + transferred >= rreq->cleaned_to + fsize || + test_bit(NETFS_SREQ_HIT_EOF, &front->flags)) + netfs_read_unlock_folios(rreq, ¬es); + } else { + stream->collected_to = front->start + transferred; + rreq->collected_to = stream->collected_to; + } + + /* Stall if the front is still undergoing I/O. */ + if (notes & HIT_PENDING) + break; + + if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { + if (!stream->failed) { + stream->error = front->error; + rreq->error = front->error; + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + stream->failed = true; + } + notes |= MADE_PROGRESS | ABANDON_SREQ; + } else if (test_bit(NETFS_SREQ_NEED_RETRY, &front->flags)) { + stream->need_retry = true; + notes |= NEED_RETRY | MADE_PROGRESS; + break; + } else { + if (!stream->failed) + stream->transferred = stream->collected_to - rreq->start; + notes |= MADE_PROGRESS; + } + + /* Remove if completely consumed. */ + stream->source = front->source; + spin_lock(&rreq->lock); + + remove = front; + trace_netfs_sreq(front, netfs_sreq_trace_discard); + list_del_init(&front->rreq_link); + front = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + stream->front = front; + spin_unlock(&rreq->lock); + netfs_put_subrequest(remove, false, + notes & ABANDON_SREQ ? + netfs_sreq_trace_put_abandon : + netfs_sreq_trace_put_done); } - /* If we can't donate down the chain, donate up the chain instead. 
*/ - excess = subreq->len - subreq->consumed + next_donated; + trace_netfs_collect_stream(rreq, stream); + trace_netfs_collect_state(rreq, rreq->collected_to, notes); - if (!subreq->consumed) - excess += prev_donated; + if (!(notes & BUFFERED)) + rreq->cleaned_to = rreq->collected_to; - if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - rreq->prev_donated = excess; - trace_netfs_donate(rreq, subreq, NULL, excess, - netfs_trace_donate_to_deferred_next); - } else { - next = list_next_entry(subreq, rreq_link); - WRITE_ONCE(next->prev_donated, excess); - trace_netfs_donate(rreq, subreq, next, excess, - netfs_trace_donate_to_next); + if (notes & NEED_RETRY) + goto need_retry; + if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_unpause); + clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags); + smp_mb__after_atomic(); /* Set PAUSE before task state */ + wake_up(&rreq->waitq); + } + + if (notes & MADE_PROGRESS) { + //cond_resched(); + goto reassess; } - trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next); - subreq->len = subreq->consumed; - subreq->transferred = subreq->consumed; - goto remove_subreq_locked; - -remove_subreq: - spin_lock_bh(&rreq->lock); -remove_subreq_locked: - subreq->consumed = subreq->len; - list_del(&subreq->rreq_link); - spin_unlock_bh(&rreq->lock); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); - return true; - -bad: - /* Errr... prev and next both donated to us, but insufficient to finish - * the folio. + +out: + _leave(" = %x", notes); + return; + +need_retry: + /* Okay... We're going to have to retry parts of the stream. Note + * that any partially completed op will have had any wholly transferred + * folios removed from it. */ - printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->start + subreq->transferred - 1, - subreq->consumed, subreq->transferred, subreq->len); - printk("folio: %llx-%llx\n", fpos, fend - 1); - printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated); - printk("s=%llx av=%zx part=%zx\n", start, avail, part); - BUG(); + _debug("retry"); + netfs_retry_reads(rreq); + goto out; } /* @@ -316,12 +343,13 @@ bad: static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned int i; /* Collect unbuffered reads and direct reads, adding up the transfer * sizes until we find the first short or failed subrequest. */ - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { rreq->transferred += subreq->transferred; if (subreq->transferred < subreq->len || @@ -354,89 +382,135 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) } /* - * Assess the state of a read request and decide what to do next. + * Do processing after reading a monolithic single object. + */ +static void netfs_rreq_assess_single(struct netfs_io_request *rreq) +{ + struct netfs_io_stream *stream = &rreq->io_streams[0]; + + if (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER && + fscache_resources_valid(&rreq->cache_resources)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_dirty); + netfs_single_mark_inode_dirty(rreq->inode); + } + + if (rreq->iocb) { + rreq->iocb->ki_pos += rreq->transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? 
rreq->error : rreq->transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); +} + +/* + * Perform the collection of subrequests and folios. * * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -static void netfs_rreq_assess(struct netfs_io_request *rreq) +static void netfs_read_collection(struct netfs_io_request *rreq) { - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + struct netfs_io_stream *stream = &rreq->io_streams[0]; - //netfs_rreq_is_still_valid(rreq); + netfs_collect_read_results(rreq); - if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { - netfs_retry_reads(rreq); + /* We're done when the app thread has finished posting subreqs and the + * queue is empty. + */ + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) return; - } + smp_rmb(); /* Read ALL_QUEUED before subreq lists. */ + + if (!list_empty(&stream->subrequests)) + return; + + /* Okay, declare that all I/O is complete. */ + rreq->transferred = stream->transferred; + trace_netfs_rreq(rreq, netfs_rreq_trace_complete); - if (rreq->origin == NETFS_DIO_READ || - rreq->origin == NETFS_READ_GAPS) + //netfs_rreq_is_still_valid(rreq); + + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_READ_GAPS: netfs_rreq_assess_dio(rreq); + break; + case NETFS_READ_SINGLE: + netfs_rreq_assess_single(rreq); + break; + default: + break; + } task_io_account_read(rreq->transferred); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_done); netfs_clear_subrequests(rreq, false); netfs_unlock_abandoned_read_pages(rreq); - if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))) - netfs_pgpriv2_write_to_the_cache(rreq); + if (unlikely(rreq->copy_to_cache)) + netfs_pgpriv2_end_copy_to_cache(rreq); } -void netfs_read_termination_worker(struct work_struct *work) +void netfs_read_collection_worker(struct work_struct *work) { - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); + struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + netfs_see_request(rreq, netfs_rreq_trace_see_work); - netfs_rreq_assess(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + netfs_read_collection(rreq); + netfs_put_request(rreq, false, netfs_rreq_trace_put_work); } /* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. + * Wake the collection work item. 
*/ -void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) +void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (!was_async) - return netfs_rreq_assess(rreq); - if (!work_pending(&rreq->work)) { - netfs_get_request(rreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &rreq->work)) - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (!work_pending(&rreq->work)) { + netfs_get_request(rreq, netfs_rreq_trace_get_work); + if (!queue_work(system_unbound_wq, &rreq->work)) + netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq); + } + } else { + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); + wake_up(&rreq->waitq); } } /** * netfs_read_subreq_progress - Note progress of a read operation. * @subreq: The read request that has terminated. - * @was_async: True if we're in an asynchronous context. * * This tells the read side of netfs lib that a contributory I/O operation has * made some progress and that it may be possible to unlock some folios. * * Before calling, the filesystem should update subreq->transferred to track * the amount of data copied into the output buffer. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, - bool was_async) +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + size_t fsize = PAGE_SIZE << rreq->front_folio_order; trace_netfs_sreq(subreq, netfs_sreq_trace_progress); - if (subreq->transferred > subreq->consumed && + /* If we are at the head of the queue, wake up the collector, + * getting a ref to it if we were the ones to do so. + */ + if (subreq->start + subreq->transferred > rreq->cleaned_to + fsize && (rreq->origin == NETFS_READAHEAD || rreq->origin == NETFS_READPAGE || - rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + rreq->origin == NETFS_READ_FOR_WRITE) && + list_is_first(&subreq->rreq_link, &stream->subrequests) + ) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + netfs_wake_read_collector(rreq); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -444,27 +518,23 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); /** * netfs_read_subreq_terminated - Note the termination of an I/O operation. * @subreq: The I/O request that has terminated. - * @error: Error code indicating type of completion. - * @was_async: The termination was asynchronous * * This tells the read helper that a contributory I/O operation has terminated, * one way or another, and that it should integrate the results. * - * The caller indicates the outcome of the operation through @error, supplying - * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY - * is set) or a negative error code. The helper will look after reissuing I/O - * operations as appropriate and writing downloaded data to the cache. + * The caller indicates the outcome of the operation through @subreq->error, + * supplying 0 to indicate a successful or retryable transfer (if + * NETFS_SREQ_NEED_RETRY is set) or a negative error code. The helper will + * look after reissuing I/O operations as appropriate and writing downloaded + * data to the cache. 
* * Before calling, the filesystem should update subreq->transferred to track * the amount of data copied into the output buffer. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. */ -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, - int error, bool was_async) +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -477,68 +547,156 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, break; } - if (rreq->origin != NETFS_DIO_READ) { - /* Collect buffered reads. - * - * If the read completed validly short, then we can clear the - * tail before going on to unlock the folios. - */ - if (error == 0 && subreq->transferred < subreq->len && - (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || - test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - trace_netfs_sreq(subreq, netfs_sreq_trace_clear); - } - if (subreq->transferred > subreq->consumed && - (rreq->origin == NETFS_READAHEAD || - rreq->origin == NETFS_READPAGE || - rreq->origin == NETFS_READ_FOR_WRITE)) { - netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - rreq->transferred += subreq->transferred; - } - /* Deal with retry requests, short reads and errors. If we retry * but don't make progress, we abandon the attempt. */ - if (!error && subreq->transferred < subreq->len) { + if (!subreq->error && subreq->transferred < subreq->len) { if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); + } else if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_need_clear); + } else if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_need_retry); + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + trace_netfs_sreq(subreq, netfs_sreq_trace_partial_read); } else { + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + subreq->error = -ENODATA; trace_netfs_sreq(subreq, netfs_sreq_trace_short); - if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else { - __set_bit(NETFS_SREQ_FAILED, &subreq->flags); - error = -ENODATA; - } } } - subreq->error = error; - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - if (unlikely(error < 0)) { - trace_netfs_failure(rreq, subreq, error, netfs_fail_read); + if (unlikely(subreq->error < 0)) { + trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read); if (subreq->source == NETFS_READ_FROM_CACHE) { netfs_stat(&netfs_n_rh_read_failed); + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); } else { netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); } + trace_netfs_rreq(rreq, netfs_rreq_trace_set_pause); + set_bit(NETFS_RREQ_PAUSE, &rreq->flags); } - if 
(atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, was_async); + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ + + /* If we are at the head of the queue, wake up the collector. */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + netfs_wake_read_collector(rreq); - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); + +/* + * Handle termination of a read from the cache. + */ +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + + if (transferred_or_error > 0) { + subreq->error = 0; + if (transferred_or_error > 0) { + subreq->transferred += transferred_or_error; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } + } else { + subreq->error = transferred_or_error; + } + netfs_read_subreq_terminated(subreq); +} + +/* + * Wait for the read operation to complete, successfully or otherwise. + */ +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + ssize_t ret; + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + __set_current_state(TASK_RUNNING); + netfs_read_collection(rreq); + continue; + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); + + ret = rreq->error; + if (ret == 0) { + ret = rreq->transferred; + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_READ_SINGLE: + ret = rreq->transferred; + break; + default: + if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + break; + } + } + + return ret; +} + +/* + * Wait for a paused read operation to unpause or complete in some manner. 
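Illustrative sketch, not part of the patch: with the reworked collector, a filesystem's read completion path records the outcome in the subrequest itself and then calls the single-argument netfs_read_subreq_terminated(). The flag and helper names are the ones introduced above; the completion handler example_read_done() is a made-up example of the contract described in the kernel-doc.

/* Hypothetical transport completion handler following the contract above. */
static void example_read_done(struct netfs_io_subrequest *subreq,
			      ssize_t transferred_or_error, bool hit_eof)
{
	if (transferred_or_error < 0) {
		subreq->error = transferred_or_error;
	} else {
		subreq->error = 0;
		subreq->transferred += transferred_or_error;
		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
		if (hit_eof)
			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
	}

	/* No nr_outstanding accounting any more; the collector takes over. */
	netfs_read_subreq_terminated(subreq);
}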
+ */ +void netfs_wait_for_pause(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + __set_current_state(TASK_RUNNING); + netfs_read_collection(rreq); + continue; + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); +} diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index ba5af89d37fa..cf7727060215 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -14,52 +14,11 @@ #include "internal.h" /* - * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. The - * third mark in the folio queue is used to indicate that this folio needs - * writing. - */ -void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, - struct netfs_io_request *rreq, - struct folio_queue *folioq, - int slot) -{ - struct folio *folio = folioq_folio(folioq, slot); - - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folioq_mark3(folioq, slot); -} - -/* - * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an - * unrecoverable error. - */ -static void netfs_pgpriv2_cancel(struct folio_queue *folioq) -{ - struct folio *folio; - int slot; - - while (folioq) { - if (!folioq->marks3) { - folioq = folioq->next; - continue; - } - - slot = __ffs(folioq->marks3); - folio = folioq_folio(folioq, slot); - - trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); - folio_end_private_2(folio); - folioq_unmark3(folioq, slot); - } -} - -/* * [DEPRECATED] Copy a folio to the cache with PG_private_2 set. */ -static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio) +static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio *folio) { - struct netfs_io_stream *cache = &wreq->io_streams[1]; + struct netfs_io_stream *cache = &creq->io_streams[1]; size_t fsize = folio_size(folio), flen = fsize; loff_t fpos = folio_pos(folio), i_size; bool to_eof = false; @@ -70,17 +29,17 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio * of the page to beyond it, but cannot move i_size into or through the * page since we have it locked. */ - i_size = i_size_read(wreq->inode); + i_size = i_size_read(creq->inode); if (fpos >= i_size) { /* mmap beyond eof. */ _debug("beyond eof"); folio_end_private_2(folio); - return 0; + return; } - if (fpos + fsize > wreq->i_size) - wreq->i_size = i_size; + if (fpos + fsize > creq->i_size) + creq->i_size = i_size; if (flen > i_size - fpos) { flen = i_size - fpos; @@ -94,8 +53,10 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio trace_netfs_folio(folio, netfs_folio_trace_store_copy); /* Attach the folio to the rolling buffer. 
*/ - if (netfs_buffer_append_folio(wreq, folio, false) < 0) - return -ENOMEM; + if (rolling_buffer_append(&creq->buffer, folio, 0) < 0) { + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &creq->flags); + return; + } cache->submit_extendable_to = fsize; cache->submit_off = 0; @@ -109,11 +70,11 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio do { ssize_t part; - wreq->io_iter.iov_offset = cache->submit_off; + creq->buffer.iter.iov_offset = cache->submit_off; - atomic64_set(&wreq->issued_to, fpos + cache->submit_off); + atomic64_set(&creq->issued_to, fpos + cache->submit_off); cache->submit_extendable_to = fsize - cache->submit_off; - part = netfs_advance_write(wreq, cache, fpos + cache->submit_off, + part = netfs_advance_write(creq, cache, fpos + cache->submit_off, cache->submit_len, to_eof); cache->submit_off += part; if (part > cache->submit_len) @@ -122,94 +83,95 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio cache->submit_len -= part; } while (cache->submit_len > 0); - wreq->io_iter.iov_offset = 0; - iov_iter_advance(&wreq->io_iter, fsize); - atomic64_set(&wreq->issued_to, fpos + fsize); + creq->buffer.iter.iov_offset = 0; + rolling_buffer_advance(&creq->buffer, fsize); + atomic64_set(&creq->issued_to, fpos + fsize); if (flen < fsize) - netfs_issue_write(wreq, cache); - - _leave(" = 0"); - return 0; + netfs_issue_write(creq, cache); } /* - * [DEPRECATED] Go through the buffer and write any folios that are marked with - * the third mark to the cache. + * [DEPRECATED] Set up copying to the cache. */ -void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) +static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( + struct netfs_io_request *rreq, struct folio *folio) { - struct netfs_io_request *wreq; - struct folio_queue *folioq; - struct folio *folio; - int error = 0; - int slot = 0; - - _enter(""); + struct netfs_io_request *creq; if (!fscache_resources_valid(&rreq->cache_resources)) - goto couldnt_start; - - /* Need the first folio to be able to set up the op. */ - for (folioq = rreq->buffer; folioq; folioq = folioq->next) { - if (folioq->marks3) { - slot = __ffs(folioq->marks3); - break; - } - } - if (!folioq) - return; - folio = folioq_folio(folioq, slot); + goto cancel; - wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), + creq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), NETFS_PGPRIV2_COPY_TO_CACHE); - if (IS_ERR(wreq)) { - kleave(" [create %ld]", PTR_ERR(wreq)); - goto couldnt_start; - } + if (IS_ERR(creq)) + goto cancel; + + if (!creq->io_streams[1].avail) + goto cancel_put; - trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); + trace_netfs_write(creq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); + rreq->copy_to_cache = creq; + return creq; + +cancel_put: + netfs_put_request(creq, false, netfs_rreq_trace_put_return); +cancel: + rreq->copy_to_cache = ERR_PTR(-ENOBUFS); + clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); + return ERR_PTR(-ENOBUFS); +} - for (;;) { - error = netfs_pgpriv2_copy_folio(wreq, folio); - if (error < 0) - break; +/* + * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2 and add + * it to the copy write request. 
+ */ +void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio) +{ + struct netfs_io_request *creq = rreq->copy_to_cache; - folioq_unmark3(folioq, slot); - if (!folioq->marks3) { - folioq = folioq->next; - if (!folioq) - break; - } + if (!creq) + creq = netfs_pgpriv2_begin_copy_to_cache(rreq, folio); + if (IS_ERR(creq)) + return; - slot = __ffs(folioq->marks3); - folio = folioq_folio(folioq, slot); - } + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + netfs_pgpriv2_copy_folio(creq, folio); +} - netfs_issue_write(wreq, &wreq->io_streams[1]); +/* + * [DEPRECATED] End writing to the cache, flushing out any outstanding writes. + */ +void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_io_request *creq = rreq->copy_to_cache; + + if (IS_ERR_OR_NULL(creq)) + return; + + netfs_issue_write(creq, &creq->io_streams[1]); smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - _leave(" = %d", error); -couldnt_start: - netfs_pgpriv2_cancel(rreq->buffer); + netfs_put_request(creq, false, netfs_rreq_trace_put_return); + creq->copy_to_cache = NULL; } /* * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished * copying. */ -bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *creq) { - struct folio_queue *folioq = wreq->buffer; - unsigned long long collected_to = wreq->collected_to; - unsigned int slot = wreq->buffer_head_slot; + struct folio_queue *folioq = creq->buffer.tail; + unsigned long long collected_to = creq->collected_to; + unsigned int slot = creq->buffer.first_tail_slot; bool made_progress = false; if (slot >= folioq_nr_slots(folioq)) { - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&creq->buffer); slot = 0; } @@ -221,16 +183,16 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) folio = folioq_folio(folioq, slot); if (WARN_ONCE(!folio_test_private_2(folio), "R=%08x: folio %lx is not marked private_2\n", - wreq->debug_id, folio->index)) + creq->debug_id, folio->index)) trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); fpos = folio_pos(folio); fsize = folio_size(folio); flen = fsize; - fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + fend = min_t(unsigned long long, fpos + flen, creq->i_size); - trace_netfs_collect_folio(wreq, folio, fend, collected_to); + trace_netfs_collect_folio(creq, folio, fend, collected_to); /* Unlock any folio we've transferred all of. */ if (collected_to < fend) @@ -238,7 +200,7 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) trace_netfs_folio(folio, netfs_folio_trace_end_copy); folio_end_private_2(folio); - wreq->cleaned_to = fpos + fsize; + creq->cleaned_to = fpos + fsize; made_progress = true; /* Clean up the head folioq. 
If we clear an entire folioq, then @@ -248,9 +210,9 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { - if (READ_ONCE(wreq->buffer_tail) == folioq) - break; - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&creq->buffer); + if (!folioq) + goto done; slot = 0; } @@ -258,7 +220,8 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) break; } - wreq->buffer = folioq; - wreq->buffer_head_slot = slot; + creq->buffer.tail = folioq; +done: + creq->buffer.first_tail_slot = slot; return made_progress; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea804..2290af0d51ac 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -12,15 +12,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, struct netfs_io_subrequest *subreq) { - struct iov_iter *io_iter = &subreq->io_iter; - - if (iov_iter_is_folioq(io_iter)) { - subreq->curr_folioq = (struct folio_queue *)io_iter->folioq; - subreq->curr_folioq_slot = io_iter->folioq_slot; - subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; - } - - atomic_inc(&rreq->nr_outstanding); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); subreq->rreq->netfs_ops->issue_read(subreq); @@ -33,13 +25,12 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream0 = &rreq->io_streams[0]; - LIST_HEAD(sublist); - LIST_HEAD(queue); + struct netfs_io_stream *stream = &rreq->io_streams[0]; + struct list_head *next; _enter("R=%x", rreq->debug_id); - if (list_empty(&rreq->subrequests)) + if (list_empty(&stream->subrequests)) return; if (rreq->netfs_ops->retry_request) @@ -49,13 +40,13 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. */ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { - struct netfs_io_subrequest *subreq; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + !rreq->cache_resources.ops) { + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -73,48 +64,44 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * populating with smaller subrequests. In the event that the subreq * we just launched finishes before we insert the next subreq, it'll * fill in rreq->prev_donated instead. - + * * Note: Alternatively, we could split the tail subrequest right before * we reissue it and fix up the donations under lock. */ - list_splice_init(&rreq->subrequests, &queue); + next = stream->subrequests.next; do { - struct netfs_io_subrequest *from; + struct netfs_io_subrequest *from, *to, *tmp; struct iov_iter source; unsigned long long start, len; - size_t part, deferred_next_donated = 0; + size_t part; bool boundary = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the * rsize renegotiating) and reissue. 
*/ - from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); - list_move_tail(&from->rreq_link, &sublist); + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; start = from->start + from->transferred; len = from->len - from->transferred; - _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", + _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx", rreq->debug_id, from->debug_index, - from->start, from->consumed, from->transferred, from->len); + from->start, from->transferred, from->len); if (test_bit(NETFS_SREQ_FAILED, &from->flags) || !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) goto abandon; - deferred_next_donated = from->next_donated; - while ((subreq = list_first_entry_or_null( - &queue, struct netfs_io_subrequest, rreq_link))) { - if (subreq->start != start + len || - subreq->transferred > 0 || + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) break; - list_move_tail(&subreq->rreq_link, &sublist); - len += subreq->len; - deferred_next_donated = subreq->next_donated; - if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) - break; + to = subreq; + len += to->len; } _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); @@ -127,36 +114,31 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) source.count = len; /* Work through the sublist. */ - while ((subreq = list_first_entry_or_null( - &sublist, struct netfs_io_subrequest, rreq_link))) { - list_del(&subreq->rreq_link); - + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; - stream0->sreq_max_len = subreq->len; - __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; - spin_lock_bh(&rreq->lock); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - subreq->prev_donated += rreq->prev_donated; - rreq->prev_donated = 0; trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - spin_unlock_bh(&rreq->lock); - - BUG_ON(!len); /* Renegotiate max_len (rsize) */ - if (rreq->netfs_ops->prepare_read(subreq) < 0) { + stream->sreq_max_len = subreq->len; + if (rreq->netfs_ops->prepare_read && + rreq->netfs_ops->prepare_read(subreq) < 0) { trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + goto abandon; } - part = umin(len, stream0->sreq_max_len); - if (unlikely(rreq->io_streams[0].sreq_max_segs)) - part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); + part = umin(len, stream->sreq_max_len); + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); subreq->len = subreq->transferred + part; subreq->io_iter = source; iov_iter_truncate(&subreq->io_iter, part); @@ -166,58 +148,105 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (!len) { if (boundary) __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); - subreq->next_donated = deferred_next_donated; } else { __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); - subreq->next_donated = 0; } + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); 
netfs_reissue_read(rreq, subreq); - if (!len) + if (subreq == to) break; - - /* If we ran out of subrequests, allocate another. */ - if (list_empty(&sublist)) { - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - goto abandon; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->start = start; - - /* We get two refs, but need just one. */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_new); - trace_netfs_sreq(subreq, netfs_sreq_trace_split); - list_add_tail(&subreq->rreq_link, &sublist); - } } /* If we managed to use fewer subreqs, we can discard the - * excess. + * excess; if we used the same number, then we're done. */ - while ((subreq = list_first_entry_or_null( - &sublist, struct netfs_io_subrequest, rreq_link))) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; } - } while (!list_empty(&queue)); + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. + */ + do { + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + subreq = to; + goto abandon_after; + } + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + subreq->len = len; + subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); + subreq->stream_nr = stream->stream_nr; + subreq->retry_count = 1; + + trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + stream->sreq_max_len = umin(len, rreq->rsize); + stream->sreq_max_segs = 0; + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read(subreq) < 0) { + trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + goto abandon; + } + + part = umin(len, stream->sreq_max_len); + subreq->len = subreq->transferred + part; + subreq->io_iter = source; + iov_iter_truncate(&subreq->io_iter, part); + iov_iter_advance(&source, part); + + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_read(rreq, subreq); + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); return; - /* If we hit ENOMEM, fail all remaining subrequests */ + /* If we hit an error, fail all remaining incomplete subrequests */ +abandon_after: + if (list_is_last(&subreq->rreq_link, &stream->subrequests)) + return; + subreq = list_next_entry(subreq, rreq_link); abandon: - list_splice_init(&sublist, &queue); - list_for_each_entry(subreq, &queue, rreq_link) { - if (!subreq->error) - subreq->error = -ENOMEM; - __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!subreq->error && + !test_bit(NETFS_SREQ_FAILED, &subreq->flags) && + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + continue; + subreq->error = -ENOMEM; 
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } - spin_lock_bh(&rreq->lock); - list_splice_tail_init(&queue, &rreq->subrequests); - spin_unlock_bh(&rreq->lock); } /* @@ -225,14 +254,19 @@ abandon: */ void netfs_retry_reads(struct netfs_io_request *rreq) { - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[0]; - atomic_inc(&rreq->nr_outstanding); + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. + */ + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); - - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_terminated(rreq, false); } /* @@ -243,7 +277,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) { struct folio_queue *p; - for (p = rreq->buffer; p; p = p->next) { + for (p = rreq->buffer.tail; p; p = p->next) { for (int slot = 0; slot < folioq_count(p); slot++) { struct folio *folio = folioq_folio(p, slot); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c new file mode 100644 index 000000000000..fea0ecdecc53 --- /dev/null +++ b/fs/netfs/read_single.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Single, monolithic object support (e.g. AFS directory). + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_single_mark_inode_dirty - Mark a single, monolithic object inode dirty + * @inode: The inode to mark + * + * Mark an inode that contains a single, monolithic object as dirty so that its + * writepages op will get called. If set, the SINGLE_NO_UPLOAD flag indicates + * that the object will only be written to the cache and not uploaded (e.g. AFS + * directory contents). 
+ */ +void netfs_single_mark_inode_dirty(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + bool cache_only = test_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &ictx->flags); + bool caching = fscache_cookie_enabled(netfs_i_cookie(netfs_inode(inode))); + + if (cache_only && !caching) + return; + + mark_inode_dirty(inode); + + if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) { + bool need_use = false; + + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_PINNING_NETFS_WB)) { + inode->i_state |= I_PINNING_NETFS_WB; + need_use = true; + } + spin_unlock(&inode->i_lock); + + if (need_use) + fscache_use_cookie(netfs_i_cookie(ictx), true); + } + +} +EXPORT_SYMBOL(netfs_single_mark_inode_dirty); + +static int netfs_single_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx) +{ + return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); +} + +static void netfs_single_cache_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (!cres->ops) { + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + return; + } + subreq->source = cres->ops->prepare_read(subreq, rreq->i_size); + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + +} + +static void netfs_single_read_cache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + _enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index); + netfs_stat(&netfs_n_rh_read); + cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL, + netfs_cache_read_terminated, subreq); +} + +/* + * Perform a read to a buffer from the cache or the server. Only a single + * subreq is permitted as the object must be fetched in a single transaction. + */ +static int netfs_single_dispatch_read(struct netfs_io_request *rreq) +{ + struct netfs_io_stream *stream = &rreq->io_streams[0]; + struct netfs_io_subrequest *subreq; + int ret = 0; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + return -ENOMEM; + + subreq->source = NETFS_SOURCE_UNKNOWN; + subreq->start = 0; + subreq->len = rreq->len; + subreq->io_iter = rreq->buffer.iter; + + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + + spin_lock(&rreq->lock); + list_add_tail(&subreq->rreq_link, &stream->subrequests); + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + stream->front = subreq; + /* Store list pointers before active flag */ + smp_store_release(&stream->active, true); + spin_unlock(&rreq->lock); + + netfs_single_cache_prepare_read(rreq, subreq); + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) + goto cancel; + } + + rreq->netfs_ops->issue_read(subreq); + rreq->submitted += subreq->len; + break; + case NETFS_READ_FROM_CACHE: + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + netfs_single_read_cache(rreq, subreq); + rreq->submitted += subreq->len; + ret = 0; + break; + default: + pr_warn("Unexpected single-read source %u\n", subreq->source); + WARN_ON_ONCE(true); + ret = -EIO; + break; + } + + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + return ret; +cancel: + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + return ret; +} + +/** + * netfs_read_single - Synchronously read a single blob of pages. 
+ * @inode: The inode to read from. + * @file: The file we're using to read or NULL. + * @iter: The buffer we're reading into. + * + * Fulfil a read request for a single monolithic object by drawing data from + * the cache if possible, or the netfs if not. The buffer may be larger than + * the file content; unused beyond the EOF will be zero-filled. The content + * will be read with a single I/O request (though this may be retried). + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. If caching is enabled, + * the data will be stored as a single object into the cache. + */ +ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter) +{ + struct netfs_io_request *rreq; + struct netfs_inode *ictx = netfs_inode(inode); + ssize_t ret; + + rreq = netfs_alloc_request(inode->i_mapping, file, 0, iov_iter_count(iter), + NETFS_READ_SINGLE); + if (IS_ERR(rreq)) + return PTR_ERR(rreq); + + ret = netfs_single_begin_cache_read(rreq, ictx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + + netfs_stat(&netfs_n_rh_read_single); + trace_netfs_read(rreq, 0, rreq->len, netfs_read_trace_read_single); + + rreq->buffer.iter = *iter; + netfs_single_dispatch_read(rreq); + + ret = netfs_wait_for_read(rreq); + netfs_put_request(rreq, true, netfs_rreq_trace_put_return); + return ret; + +cleanup_free: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + return ret; +} +EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c new file mode 100644 index 000000000000..75d97af14b4a --- /dev/null +++ b/fs/netfs/rolling_buffer.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Rolling buffer helpers + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/bitops.h> +#include <linux/pagemap.h> +#include <linux/rolling_buffer.h> +#include <linux/slab.h> +#include "internal.h" + +static atomic_t debug_ids; + +/** + * netfs_folioq_alloc - Allocate a folio_queue struct + * @rreq_id: Associated debugging ID for tracing purposes + * @gfp: Allocation constraints + * @trace: Trace tag to indicate the purpose of the allocation + * + * Allocate, initialise and account the folio_queue struct and log a trace line + * to mark the allocation. + */ +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int /*enum netfs_folioq_trace*/ trace) +{ + struct folio_queue *fq; + + fq = kmalloc(sizeof(*fq), gfp); + if (fq) { + netfs_stat(&netfs_n_folioq); + folioq_init(fq, rreq_id); + fq->debug_id = atomic_inc_return(&debug_ids); + trace_netfs_folioq(fq, trace); + } + return fq; +} +EXPORT_SYMBOL(netfs_folioq_alloc); + +/** + * netfs_folioq_free - Free a folio_queue struct + * @folioq: The object to free + * @trace: Trace tag to indicate which free + * + * Free and unaccount the folio_queue struct. + */ +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int /*enum netfs_trace_folioq*/ trace) +{ + trace_netfs_folioq(folioq, trace); + netfs_stat_d(&netfs_n_folioq); + kfree(folioq); +} +EXPORT_SYMBOL(netfs_folioq_free); + +/* + * Initialise a rolling buffer. We allocate an empty folio queue struct to so + * that the pointers can be independently driven by the producer and the + * consumer. 
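Illustrative sketch, not part of the patch: netfs_read_single() accepts any iov_iter describing the destination buffer. A kvec over a kernel buffer, as in the hypothetical example_read_blob() below (assuming <linux/uio.h>), is one simple choice; callers with page-based buffers would build a different iterator.

/* Hypothetical helper: synchronously read a whole single-blob object. */
static ssize_t example_read_blob(struct inode *inode, struct file *file,
				 void *buf, size_t size)
{
	struct kvec kv = { .iov_base = buf, .iov_len = size };
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, size);

	/* Blocks until the read completes; space beyond EOF is zero-filled. */
	return netfs_read_single(inode, file, &iter);
}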
+ */ +int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id, + unsigned int direction) +{ + struct folio_queue *fq; + + fq = netfs_folioq_alloc(rreq_id, GFP_NOFS, netfs_trace_folioq_rollbuf_init); + if (!fq) + return -ENOMEM; + + roll->head = fq; + roll->tail = fq; + iov_iter_folio_queue(&roll->iter, direction, fq, 0, 0, 0); + return 0; +} + +/* + * Add another folio_queue to a rolling buffer if there's no space left. + */ +int rolling_buffer_make_space(struct rolling_buffer *roll) +{ + struct folio_queue *fq, *head = roll->head; + + if (!folioq_full(head)) + return 0; + + fq = netfs_folioq_alloc(head->rreq_id, GFP_NOFS, netfs_trace_folioq_make_space); + if (!fq) + return -ENOMEM; + fq->prev = head; + + roll->head = fq; + if (folioq_full(head)) { + /* Make sure we don't leave the master iterator pointing to a + * block that might get immediately consumed. + */ + if (roll->iter.folioq == head && + roll->iter.folioq_slot == folioq_nr_slots(head)) { + roll->iter.folioq = fq; + roll->iter.folioq_slot = 0; + } + } + + /* Make sure the initialisation is stored before the next pointer. + * + * [!] NOTE: After we set head->next, the consumer is at liberty to + * immediately delete the old head. + */ + smp_store_release(&head->next, fq); + return 0; +} + +/* + * Decant the list of folios to read into a rolling buffer. + */ +ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, + struct readahead_control *ractl, + struct folio_batch *put_batch) +{ + struct folio_queue *fq; + struct page **vec; + int nr, ix, to; + ssize_t size = 0; + + if (rolling_buffer_make_space(roll) < 0) + return -ENOMEM; + + fq = roll->head; + vec = (struct page **)fq->vec.folios; + nr = __readahead_batch(ractl, vec + folio_batch_count(&fq->vec), + folio_batch_space(&fq->vec)); + ix = fq->vec.nr; + to = ix + nr; + fq->vec.nr = to; + for (; ix < to; ix++) { + struct folio *folio = folioq_folio(fq, ix); + unsigned int order = folio_order(folio); + + fq->orders[ix] = order; + size += PAGE_SIZE << order; + trace_netfs_folio(folio, netfs_folio_trace_read); + if (!folio_batch_add(put_batch, folio)) + folio_batch_release(put_batch); + } + WRITE_ONCE(roll->iter.count, roll->iter.count + size); + + /* Store the counter after setting the slot. */ + smp_store_release(&roll->next_head_slot, to); + + for (; ix < folioq_nr_slots(fq); ix++) + folioq_clear(fq, ix); + + return size; +} + +/* + * Append a folio to the rolling buffer. + */ +ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio, + unsigned int flags) +{ + ssize_t size = folio_size(folio); + int slot; + + if (rolling_buffer_make_space(roll) < 0) + return -ENOMEM; + + slot = folioq_append(roll->head, folio); + if (flags & ROLLBUF_MARK_1) + folioq_mark(roll->head, slot); + if (flags & ROLLBUF_MARK_2) + folioq_mark2(roll->head, slot); + + WRITE_ONCE(roll->iter.count, roll->iter.count + size); + + /* Store the counter after setting the slot. */ + smp_store_release(&roll->next_head_slot, slot); + return size; +} + +/* + * Delete a spent buffer from a rolling queue and return the next in line. We + * don't return the last buffer to keep the pointers independent, but return + * NULL instead. + */ +struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll) +{ + struct folio_queue *spent = roll->tail, *next = READ_ONCE(spent->next); + + if (!next) + return NULL; + next->prev = NULL; + netfs_folioq_free(spent, netfs_trace_folioq_delete); + roll->tail = next; + return next; +} + +/* + * Clear out a rolling queue. 
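Illustrative sketch, not part of the patch: the rolling buffer is driven from both ends, with the producer appending folios at the head while the collector retires spent folio_queue segments from the tail via rolling_buffer_delete_spent() (which returns NULL rather than handing back the final segment). The helper example_buffer_one_folio() below is hypothetical; rolling_buffer_append() is the call added above.

/* Hypothetical producer step: queue one folio into a rolling buffer. */
static ssize_t example_buffer_one_folio(struct rolling_buffer *roll,
					struct folio *folio)
{
	/* Grows the head folio_queue if it is full, then bumps
	 * roll->iter.count by the size of the appended folio.
	 */
	return rolling_buffer_append(roll, folio, 0);
}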
Folios that have mark 1 set are put. + */ +void rolling_buffer_clear(struct rolling_buffer *roll) +{ + struct folio_batch fbatch; + struct folio_queue *p; + + folio_batch_init(&fbatch); + + while ((p = roll->tail)) { + roll->tail = p->next; + for (int slot = 0; slot < folioq_count(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + + if (!folio) + continue; + if (folioq_is_marked(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_put); + if (!folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); + } + } + + netfs_folioq_free(p, netfs_trace_folioq_clear); + } + + folio_batch_release(&fbatch); +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 8e63516b40f6..f1af344266cc 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -12,6 +12,7 @@ atomic_t netfs_n_rh_dio_read; atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_read_folio; +atomic_t netfs_n_rh_read_single; atomic_t netfs_n_rh_rreq; atomic_t netfs_n_rh_sreq; atomic_t netfs_n_rh_download; @@ -46,10 +47,11 @@ atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", + seq_printf(m, "Reads : DR=%u RA=%u RF=%u RS=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_read_folio), + atomic_read(&netfs_n_rh_read_single), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip)); seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 1d438be2e1b4..294f67795f79 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -17,10 +17,38 @@ #define HIT_PENDING 0x01 /* A front op was still pending */ #define NEED_REASSESS 0x02 /* Need to loop round and reassess */ #define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ -#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_UNLOCK 0x08 /* The pagecache needs unlocking */ #define NEED_RETRY 0x10 /* A front op requests retrying */ #define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ +static void netfs_dump_request(const struct netfs_io_request *rreq) +{ + pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n", + rreq->debug_id, refcount_read(&rreq->ref), rreq->flags, + rreq->origin, rreq->error); + pr_err(" st=%llx tsl=%zx/%llx/%llx\n", + rreq->start, rreq->transferred, rreq->submitted, rreq->len); + pr_err(" cci=%llx/%llx/%llx\n", + rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to)); + pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write); + for (int i = 0; i < NR_IO_STREAMS; i++) { + const struct netfs_io_subrequest *sreq; + const struct netfs_io_stream *s = &rreq->io_streams[i]; + + pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n", + s->stream_nr, s->source, s->error, + s->avail, s->active, s->need_retry, s->failed); + pr_err(" str[%x] ct=%llx t=%zx\n", + s->stream_nr, s->collected_to, s->transferred); + list_for_each_entry(sreq, &s->subrequests, rreq_link) { + pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n", + sreq->stream_nr, sreq->debug_index, sreq->source, + sreq->start, sreq->transferred, sreq->len, + refcount_read(&sreq->ref), sreq->flags); + } + } +} + /* * Successful completion of write of a folio to the server and/or cache. 
Note * that we are not allowed to lock the folio here on pain of deadlocking with @@ -83,9 +111,15 @@ end_wb: static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, unsigned int *notes) { - struct folio_queue *folioq = wreq->buffer; + struct folio_queue *folioq = wreq->buffer.tail; unsigned long long collected_to = wreq->collected_to; - unsigned int slot = wreq->buffer_head_slot; + unsigned int slot = wreq->buffer.first_tail_slot; + + if (WARN_ON_ONCE(!folioq)) { + pr_err("[!] Writeback unlock found empty rolling buffer!\n"); + netfs_dump_request(wreq); + return; + } if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { if (netfs_pgpriv2_unlock_copied_folios(wreq)) @@ -94,7 +128,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, } if (slot >= folioq_nr_slots(folioq)) { - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); + if (!folioq) + return; slot = 0; } @@ -134,9 +170,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, folioq_clear(folioq, slot); slot++; if (slot >= folioq_nr_slots(folioq)) { - if (READ_ONCE(wreq->buffer_tail) == folioq) - break; - folioq = netfs_delete_buffer_head(wreq); + folioq = rolling_buffer_delete_spent(&wreq->buffer); + if (!folioq) + goto done; slot = 0; } @@ -144,223 +180,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, break; } - wreq->buffer = folioq; - wreq->buffer_head_slot = slot; -} - -/* - * Perform retries on the streams that need it. - */ -static void netfs_retry_write_stream(struct netfs_io_request *wreq, - struct netfs_io_stream *stream) -{ - struct list_head *next; - - _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); - - if (list_empty(&stream->subrequests)) - return; - - if (stream->source == NETFS_UPLOAD_TO_SERVER && - wreq->netfs_ops->retry_request) - wreq->netfs_ops->retry_request(wreq, stream); - - if (unlikely(stream->failed)) - return; - - /* If there's no renegotiation to do, just resend each failed subreq. */ - if (!stream->prepare_write) { - struct netfs_io_subrequest *subreq; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - break; - if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { - struct iov_iter source = subreq->io_iter; - - iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq, &source); - } - } - return; - } - - next = stream->subrequests.next; - - do { - struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; - struct iov_iter source; - unsigned long long start, len; - size_t part; - bool boundary = false; - - /* Go through the stream and find the next span of contiguous - * data that we then rejig (cifs, for example, needs the wsize - * renegotiating) and reissue. 
- */ - from = list_entry(next, struct netfs_io_subrequest, rreq_link); - to = from; - start = from->start + from->transferred; - len = from->len - from->transferred; - - if (test_bit(NETFS_SREQ_FAILED, &from->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) - return; - - list_for_each_continue(next, &stream->subrequests) { - subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); - if (subreq->start + subreq->transferred != start + len || - test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || - !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) - break; - to = subreq; - len += to->len; - } - - /* Determine the set of buffers we're going to use. Each - * subreq gets a subset of a single overall contiguous buffer. - */ - netfs_reset_iter(from); - source = from->io_iter; - source.count = len; - - /* Work through the sublist. */ - subreq = from; - list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) - break; - /* Renegotiate max_len (wsize) */ - trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); - stream->prepare_write(subreq); - - part = min(len, stream->sreq_max_len); - subreq->len = part; - subreq->start = start; - subreq->transferred = 0; - len -= part; - start += part; - if (len && subreq == to && - __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) - boundary = true; - - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq, &source); - if (subreq == to) - break; - } - - /* If we managed to use fewer subreqs, we can discard the - * excess; if we used the same number, then we're done. - */ - if (!len) { - if (subreq == to) - continue; - list_for_each_entry_safe_from(subreq, tmp, - &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); - if (subreq == to) - break; - } - continue; - } - - /* We ran out of subrequests, so we need to allocate some more - * and insert them after. - */ - do { - subreq = netfs_alloc_subrequest(wreq); - subreq->source = to->source; - subreq->start = start; - subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); - subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); - - trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, - refcount_read(&subreq->ref), - netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - - list_add(&subreq->rreq_link, &to->rreq_link); - to = list_next_entry(to, rreq_link); - trace_netfs_sreq(subreq, netfs_sreq_trace_retry); - - stream->sreq_max_len = len; - stream->sreq_max_segs = INT_MAX; - switch (stream->source) { - case NETFS_UPLOAD_TO_SERVER: - netfs_stat(&netfs_n_wh_upload); - stream->sreq_max_len = umin(len, wreq->wsize); - break; - case NETFS_WRITE_TO_CACHE: - netfs_stat(&netfs_n_wh_write); - break; - default: - WARN_ON_ONCE(1); - } - - stream->prepare_write(subreq); - - part = umin(len, stream->sreq_max_len); - subreq->len = subreq->transferred + part; - len -= part; - start += part; - if (!len && boundary) { - __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); - boundary = false; - } - - netfs_reissue_write(stream, subreq, &source); - if (!len) - break; - - } while (len); - - } while (!list_is_head(next, &stream->subrequests)); -} - -/* - * Perform retries on the streams that need it. 
If we're doing content - * encryption and the server copy changed due to a third-party write, we may - * need to do an RMW cycle and also rewrite the data to the cache. - */ -static void netfs_retry_writes(struct netfs_io_request *wreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream; - int s; - - /* Wait for all outstanding I/O to quiesce before performing retries as - * we may need to renegotiate the I/O sizes. - */ - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (!stream->active) - continue; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - } - } - - // TODO: Enc: Fetch changed partial pages - // TODO: Enc: Reencrypt content if needed. - // TODO: Enc: Wind back transferred point. - // TODO: Enc: Mark cache pages for retry. - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->need_retry) { - stream->need_retry = false; - netfs_retry_write_stream(wreq, stream); - } - } + wreq->buffer.tail = folioq; +done: + wreq->buffer.first_tail_slot = slot; } /* @@ -391,7 +213,7 @@ reassess_streams: if (wreq->origin == NETFS_WRITEBACK || wreq->origin == NETFS_WRITETHROUGH || wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) - notes = BUFFERED; + notes = NEED_UNLOCK; else notes = 0; @@ -450,14 +272,14 @@ reassess_streams: cancel: /* Remove if completely consumed. */ - spin_lock_bh(&wreq->lock); + spin_lock(&wreq->lock); remove = front; list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); stream->front = front; - spin_unlock_bh(&wreq->lock); + spin_unlock(&wreq->lock); netfs_put_subrequest(remove, false, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : @@ -488,7 +310,7 @@ reassess_streams: trace_netfs_collect_state(wreq, wreq->collected_to, notes); /* Unlock any folios that we have now finished with. */ - if (notes & BUFFERED) { + if (notes & NEED_UNLOCK) { if (wreq->cleaned_to < wreq->collected_to) netfs_writeback_unlock_folios(wreq, ¬es); } else { @@ -502,7 +324,8 @@ reassess_streams: if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE); + smp_mb__after_atomic(); /* Set PAUSE before task state */ + wake_up(&wreq->waitq); } if (notes & NEED_REASSESS) { @@ -605,8 +428,7 @@ void netfs_write_collection_worker(struct work_struct *work) _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); - wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -714,8 +536,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS); + clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); /* If we are at the head of the queue, wake up the collector, * transferring a ref to it if we were the ones to do so. 
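[Editor's note] The write_collect.c hunks above replace the wait_on_bit()/wake_up_bit() handshake on NETFS_RREQ_PAUSE with a clear-bit + barrier + waitqueue wakeup, and the matching issuer side (in the write_issue.c hunks that follow) now blocks with wait_event() on wreq->waitq instead of wait_on_bit(). The sketch below is only an illustration of that pairing, not code from the patch; the struct, bit name and function names (ex_request, EX_PAUSE, ex_wait_unpaused, ex_unpause) are hypothetical.

    /* Sketch only: pause/unpause pairing as used by the patch, assuming
     * kernel context. All identifiers here are illustrative. */
    #include <linux/wait.h>
    #include <linux/bitops.h>

    #define EX_PAUSE 0			/* bit number within ->flags */

    struct ex_request {
    	unsigned long		flags;
    	wait_queue_head_t	waitq;
    };

    /* Issuing side: sleep until the collector lifts the pause. */
    static void ex_wait_unpaused(struct ex_request *rreq)
    {
    	wait_event(rreq->waitq, !test_bit(EX_PAUSE, &rreq->flags));
    }

    /* Collecting side: clear the bit with release semantics, order the
     * clear before the wakeup, then wake everyone on the queue. */
    static void ex_unpause(struct ex_request *rreq)
    {
    	clear_bit_unlock(EX_PAUSE, &rreq->flags);
    	smp_mb__after_atomic();	/* clear PAUSE before waking waiters */
    	wake_up(&rreq->waitq);
    }

The barrier between clear_bit_unlock() and wake_up() mirrors the smp_mb__after_atomic() added in the hunk above: it keeps the flag update visible before any waiter re-tests the condition, which is the same guarantee wake_up_bit() used to provide implicitly.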
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e5..69727411683e 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,9 +94,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; - bool is_buffered = (origin == NETFS_WRITEBACK || - origin == NETFS_WRITETHROUGH || - origin == NETFS_PGPRIV2_COPY_TO_CACHE); + bool is_cacheable = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITEBACK_SINGLE || + origin == NETFS_WRITETHROUGH || + origin == NETFS_PGPRIV2_COPY_TO_CACHE); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -105,8 +106,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (is_buffered && netfs_is_cache_enabled(ictx)) + if (is_cacheable && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); + if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0) + goto nomem; wreq->cleaned_to = wreq->start; @@ -129,6 +132,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, } return wreq; +nomem: + wreq->error = -ENOMEM; + netfs_put_request(wreq, false, netfs_rreq_trace_put_failed); + return ERR_PTR(-ENOMEM); } /** @@ -153,16 +160,15 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, loff_t start) { struct netfs_io_subrequest *subreq; - struct iov_iter *wreq_iter = &wreq->io_iter; + struct iov_iter *wreq_iter = &wreq->buffer.iter; /* Make sure we don't point the iterator at a used-up folio_queue * struct being used as a placeholder to prevent the queue from * collapsing. In such a case, extend the queue. */ if (iov_iter_is_folioq(wreq_iter) && - wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) { - netfs_buffer_make_space(wreq); - } + wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) + rolling_buffer_make_space(&wreq->buffer); subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; @@ -198,7 +204,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, * the list. The collector only goes nextwards and uses the lock to * remove entries off of the front. */ - spin_lock_bh(&wreq->lock); + spin_lock(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; @@ -209,7 +215,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, } } - spin_unlock_bh(&wreq->lock); + spin_unlock(&wreq->lock); stream->construct = subreq; } @@ -244,6 +250,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } @@ -266,9 +274,9 @@ void netfs_issue_write(struct netfs_io_request *wreq, * we can avoid overrunning the credits obtained (cifs) and try to parallelise * content-crypto preparation with network writes. 
*/ -int netfs_advance_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream, - loff_t start, size_t len, bool to_eof) +size_t netfs_advance_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream, + loff_t start, size_t len, bool to_eof) { struct netfs_io_subrequest *subreq = stream->construct; size_t part; @@ -325,6 +333,9 @@ static int netfs_write_folio(struct netfs_io_request *wreq, _enter(""); + if (rolling_buffer_make_space(&wreq->buffer) < 0) + return -ENOMEM; + /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the * page since we have it locked. @@ -429,7 +440,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } /* Attach the folio to the rolling buffer. */ - netfs_buffer_append_folio(wreq, folio, false); + rolling_buffer_append(&wreq->buffer, folio, 0); /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming @@ -442,7 +453,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq, stream = &wreq->io_streams[s]; stream->submit_off = foff; stream->submit_len = flen; - if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || + if (!stream->avail || + (stream->source == NETFS_WRITE_TO_CACHE && streamw) || (stream->source == NETFS_UPLOAD_TO_SERVER && fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { stream->submit_off = UINT_MAX; @@ -476,7 +488,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, /* Advance the iterator(s). */ if (stream->submit_off > iter_off) { - iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off); + rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); iter_off = stream->submit_off; } @@ -494,7 +506,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } if (fsize > iter_off) - iov_iter_advance(&wreq->io_iter, fsize - iter_off); + rolling_buffer_advance(&wreq->buffer, fsize - iter_off); atomic64_set(&wreq->issued_to, fpos + fsize); if (!debug) @@ -633,7 +645,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio **writethrough_cache) { _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", - wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); + wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { if (folio_test_dirty(folio)) @@ -708,10 +720,10 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; - iov_iter_advance(&wreq->io_iter, part); + rolling_buffer_advance(&wreq->buffer, part); if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); - wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); + wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags)); } if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) break; @@ -721,3 +733,194 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t _leave(" = %d", error); return error; } + +/* + * Write some of a pending folio data back to the server and/or the cache. 
+ */ +static int netfs_write_folio_single(struct netfs_io_request *wreq, + struct folio *folio) +{ + struct netfs_io_stream *upload = &wreq->io_streams[0]; + struct netfs_io_stream *cache = &wreq->io_streams[1]; + struct netfs_io_stream *stream; + size_t iter_off = 0; + size_t fsize = folio_size(folio), flen; + loff_t fpos = folio_pos(folio); + bool to_eof = false; + bool no_debug = false; + + _enter(""); + + flen = folio_size(folio); + if (flen > wreq->i_size - fpos) { + flen = wreq->i_size - fpos; + folio_zero_segment(folio, flen, fsize); + to_eof = true; + } else if (flen == wreq->i_size - fpos) { + to_eof = true; + } + + _debug("folio %zx/%zx", flen, fsize); + + if (!upload->avail && !cache->avail) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_store); + return 0; + } + + if (!upload->construct) + trace_netfs_folio(folio, netfs_folio_trace_store); + else + trace_netfs_folio(folio, netfs_folio_trace_store_plus); + + /* Attach the folio to the rolling buffer. */ + folio_get(folio); + rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK); + + /* Move the submission point forward to allow for write-streaming data + * not starting at the front of the page. We don't do write-streaming + * with the cache as the cache requires DIO alignment. + * + * Also skip uploading for data that's been read and just needs copying + * to the cache. + */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + stream->submit_off = 0; + stream->submit_len = flen; + if (!stream->avail) { + stream->submit_off = UINT_MAX; + stream->submit_len = 0; + } + } + + /* Attach the folio to one or more subrequests. For a big folio, we + * could end up with thousands of subrequests if the wsize is small - + * but we might need to wait during the creation of subrequests for + * network resources (eg. SMB credits). + */ + for (;;) { + ssize_t part; + size_t lowest_off = ULONG_MAX; + int choose_s = -1; + + /* Always add to the lowest-submitted stream first. */ + for (int s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->submit_len > 0 && + stream->submit_off < lowest_off) { + lowest_off = stream->submit_off; + choose_s = s; + } + } + + if (choose_s < 0) + break; + stream = &wreq->io_streams[choose_s]; + + /* Advance the iterator(s). */ + if (stream->submit_off > iter_off) { + rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off); + iter_off = stream->submit_off; + } + + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_extendable_to = fsize - stream->submit_off; + part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, + stream->submit_len, to_eof); + stream->submit_off += part; + if (part > stream->submit_len) + stream->submit_len = 0; + else + stream->submit_len -= part; + if (part > 0) + no_debug = true; + } + + wreq->buffer.iter.iov_offset = 0; + if (fsize > iter_off) + rolling_buffer_advance(&wreq->buffer, fsize - iter_off); + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (!no_debug) + kdebug("R=%x: No submit", wreq->debug_id); + _leave(" = 0"); + return 0; +} + +/** + * netfs_writeback_single - Write back a monolithic payload + * @mapping: The mapping to write from + * @wbc: Hints from the VM + * @iter: Data to write, must be ITER_FOLIOQ. + * + * Write a monolithic, non-pagecache object back to the server and/or + * the cache. 
+ */ +int netfs_writeback_single(struct address_space *mapping, + struct writeback_control *wbc, + struct iov_iter *iter) +{ + struct netfs_io_request *wreq; + struct netfs_inode *ictx = netfs_inode(mapping->host); + struct folio_queue *fq; + size_t size = iov_iter_count(iter); + int ret; + + if (WARN_ON_ONCE(!iov_iter_is_folioq(iter))) + return -EIO; + + if (!mutex_trylock(&ictx->wb_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + netfs_stat(&netfs_n_wb_lock_skip); + return 0; + } + netfs_stat(&netfs_n_wb_lock_wait); + mutex_lock(&ictx->wb_lock); + } + + wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE); + if (IS_ERR(wreq)) { + ret = PTR_ERR(wreq); + goto couldnt_start; + } + + trace_netfs_write(wreq, netfs_write_trace_writeback); + netfs_stat(&netfs_n_wh_writepages); + + if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) + wreq->netfs_ops->begin_writeback(wreq); + + for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) { + for (int slot = 0; slot < folioq_count(fq); slot++) { + struct folio *folio = folioq_folio(fq, slot); + size_t part = umin(folioq_folio_size(fq, slot), size); + + _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); + + ret = netfs_write_folio_single(wreq, folio); + if (ret < 0) + goto stop; + size -= part; + if (size <= 0) + goto stop; + } + } + +stop: + for (int s = 0; s < NR_IO_STREAMS; s++) + netfs_issue_write(wreq, &wreq->io_streams[s]); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + mutex_unlock(&ictx->wb_lock); + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = %d", ret); + return ret; + +couldnt_start: + mutex_unlock(&ictx->wb_lock); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_writeback_single); diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c new file mode 100644 index 000000000000..c841a851dd73 --- /dev/null +++ b/fs/netfs/write_retry.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem write retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include "internal.h" + +/* + * Perform retries on the streams that need it. + */ +static void netfs_retry_write_stream(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) +{ + struct list_head *next; + + _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + + if (list_empty(&stream->subrequests)) + return; + + if (stream->source == NETFS_UPLOAD_TO_SERVER && + wreq->netfs_ops->retry_request) + wreq->netfs_ops->retry_request(wreq, stream); + + if (unlikely(stream->failed)) + return; + + /* If there's no renegotiation to do, just resend each failed subreq. 
*/ + if (!stream->prepare_write) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + struct iov_iter source = subreq->io_iter; + + iov_iter_revert(&source, subreq->len - source.count); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq, &source); + } + } + return; + } + + next = stream->subrequests.next; + + do { + struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + struct iov_iter source; + unsigned long long start, len; + size_t part; + bool boundary = false; + + /* Go through the stream and find the next span of contiguous + * data that we then rejig (cifs, for example, needs the wsize + * renegotiating) and reissue. + */ + from = list_entry(next, struct netfs_io_subrequest, rreq_link); + to = from; + start = from->start + from->transferred; + len = from->len - from->transferred; + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + return; + + list_for_each_continue(next, &stream->subrequests) { + subreq = list_entry(next, struct netfs_io_subrequest, rreq_link); + if (subreq->start + subreq->transferred != start + len || + test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + to = subreq; + len += to->len; + } + + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + + /* Work through the sublist. */ + subreq = from; + list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { + if (!len) + break; + + subreq->start = start; + subreq->len = len; + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + subreq->retry_count++; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + /* Renegotiate max_len (wsize) */ + stream->sreq_max_len = len; + stream->prepare_write(subreq); + + part = umin(len, stream->sreq_max_len); + if (unlikely(stream->sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs); + subreq->len = part; + subreq->transferred = 0; + len -= part; + start += part; + if (len && subreq == to && + __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags)) + boundary = true; + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + netfs_reissue_write(stream, subreq, &source); + if (subreq == to) + break; + } + + /* If we managed to use fewer subreqs, we can discard the + * excess; if we used the same number, then we're done. + */ + if (!len) { + if (subreq == to) + continue; + list_for_each_entry_safe_from(subreq, tmp, + &stream->subrequests, rreq_link) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + if (subreq == to) + break; + } + continue; + } + + /* We ran out of subrequests, so we need to allocate some more + * and insert them after. 
+ */ + do { + subreq = netfs_alloc_subrequest(wreq); + subreq->source = to->source; + subreq->start = start; + subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); + subreq->stream_nr = to->stream_nr; + subreq->retry_count = 1; + + trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, + refcount_read(&subreq->ref), + netfs_sreq_trace_new); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + + list_add(&subreq->rreq_link, &to->rreq_link); + to = list_next_entry(to, rreq_link); + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + + stream->sreq_max_len = len; + stream->sreq_max_segs = INT_MAX; + switch (stream->source) { + case NETFS_UPLOAD_TO_SERVER: + netfs_stat(&netfs_n_wh_upload); + stream->sreq_max_len = umin(len, wreq->wsize); + break; + case NETFS_WRITE_TO_CACHE: + netfs_stat(&netfs_n_wh_write); + break; + default: + WARN_ON_ONCE(1); + } + + stream->prepare_write(subreq); + + part = umin(len, stream->sreq_max_len); + subreq->len = subreq->transferred + part; + len -= part; + start += part; + if (!len && boundary) { + __set_bit(NETFS_SREQ_BOUNDARY, &to->flags); + boundary = false; + } + + netfs_reissue_write(stream, subreq, &source); + if (!len) + break; + + } while (len); + + } while (!list_is_head(next, &stream->subrequests)); +} + +/* + * Perform retries on the streams that need it. If we're doing content + * encryption and the server copy changed due to a third-party write, we may + * need to do an RMW cycle and also rewrite the data to the cache. + */ +void netfs_retry_writes(struct netfs_io_request *wreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream; + int s; + + /* Wait for all outstanding I/O to quiesce before performing retries as + * we may need to renegotiate the I/O sizes. + */ + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (!stream->active) + continue; + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + } + } + + // TODO: Enc: Fetch changed partial pages + // TODO: Enc: Reencrypt content if needed. + // TODO: Enc: Wind back transferred point. + // TODO: Enc: Mark cache pages for retry. + + for (s = 0; s < NR_IO_STREAMS; s++) { + stream = &wreq->io_streams[s]; + if (stream->need_retry) { + stream->need_retry = false; + netfs_retry_write_stream(wreq, stream); + } + } +} diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0becdec12970..47189476b553 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -571,19 +571,32 @@ retry: if (!node) return ERR_PTR(-ENODEV); + /* + * Devices that are marked unavailable are left in the cache with a + * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or + * constantly attempting to register the device. Once marked as + * unavailable they must be deleted and never reused. 
+ */ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { unsigned long end = jiffies; unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT; if (!time_in_range(node->timestamp_unavailable, start, end)) { + /* Uncork subsequent GETDEVINFO operations for this device */ nfs4_delete_deviceid(node->ld, node->nfs_client, id); goto retry; } goto out_put; } - if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) + if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) { + /* + * If we cannot register, treat this device as transient: + * Make a negative cache entry for the device + */ + nfs4_mark_deviceid_unavailable(node); goto out_put; + } return node; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 6252f4447945..cab8809f0e0f 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -20,9 +20,6 @@ static void bl_unregister_scsi(struct pnfs_block_dev *dev) const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; int status; - if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) - return; - status = ops->pr_register(bdev, dev->pr_key, 0, false); if (status) trace_bl_pr_key_unreg_err(bdev, dev->pr_key, status); @@ -58,7 +55,8 @@ static void bl_unregister_dev(struct pnfs_block_dev *dev) return; } - if (dev->type == PNFS_BLOCK_VOLUME_SCSI) + if (dev->type == PNFS_BLOCK_VOLUME_SCSI && + test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) bl_unregister_scsi(dev); } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 6df77f008d3f..fdeb0b34a3d3 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -375,6 +375,8 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { + if (unlikely(rc_list->rcl_nrefcalls > xdr->buf->len)) + goto out; p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a1d21c4be0ac..550ca934c9cf 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -55,9 +55,13 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); -static DEFINE_SPINLOCK(nfs_version_lock); -static DEFINE_MUTEX(nfs_version_mutex); -static LIST_HEAD(nfs_versions); +static DEFINE_RWLOCK(nfs_version_lock); + +static struct nfs_subversion *nfs_version_mods[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, +}; /* * RPC cruft for NFS @@ -76,38 +80,38 @@ const struct rpc_program nfs_program = { .pipe_dir_name = NFS_PIPE_DIRNAME, }; -static struct nfs_subversion *find_nfs_version(unsigned int version) +static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; - spin_lock(&nfs_version_lock); - - list_for_each_entry(nfs, &nfs_versions, list) { - if (nfs->rpc_ops->version == version) { - spin_unlock(&nfs_version_lock); - return nfs; - } - } - spin_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT); + read_lock(&nfs_version_lock); + nfs = nfs_version_mods[version]; + read_unlock(&nfs_version_lock); + return nfs; } -struct nfs_subversion *get_nfs_version(unsigned int version) +struct nfs_subversion *find_nfs_version(unsigned int version) { - struct nfs_subversion *nfs = find_nfs_version(version); + struct nfs_subversion *nfs = __find_nfs_version(version); - if (IS_ERR(nfs)) { - mutex_lock(&nfs_version_mutex); - request_module("nfsv%d", version); - nfs = find_nfs_version(version); - mutex_unlock(&nfs_version_mutex); - } + if (!nfs && request_module("nfsv%d", version) == 0) + 
nfs = __find_nfs_version(version); - if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + if (!nfs) + return ERR_PTR(-EPROTONOSUPPORT); + + if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); + return nfs; } +int get_nfs_version(struct nfs_subversion *nfs) +{ + return try_module_get(nfs->owner); +} +EXPORT_SYMBOL_GPL(get_nfs_version); + void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); @@ -115,23 +119,23 @@ void put_nfs_version(struct nfs_subversion *nfs) void register_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); - list_add(&nfs->list, &nfs_versions); + nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; - list_del(&nfs->list); + nfs_version_mods[nfs->rpc_ops->version] = NULL; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); @@ -151,7 +155,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; - if (!try_module_get(clp->cl_nfs_mod->owner)) + if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; @@ -181,8 +185,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); - clp->cl_uuid.net = NULL; - clp->cl_uuid.dom = NULL; + nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ @@ -996,6 +999,7 @@ struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); + INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 20cb2008f9e4..035ba52742a5 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1001,6 +1001,11 @@ void nfs_delegation_mark_returned(struct inode *inode, } nfs_mark_delegation_revoked(delegation); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + spin_unlock(&delegation->lock); + if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode))) + nfs_put_delegation(delegation); + goto out_rcu_unlock; out_clear_returning: clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 90079ca134dd..b08dbe96bc57 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -454,8 +454,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (user_backed_iter(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - if (!swap) - nfs_start_io_direct(inode); + if (!swap) { + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_read_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } + } NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); @@ -1007,7 +1015,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, 
FLUSH_STABLE); } else { - nfs_start_io_direct(inode); + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_write_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, FLUSH_COND_STABLE); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6800ee92d742..1bb646752e46 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -166,7 +166,10 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); @@ -187,7 +190,10 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, in->f_mapping); if (!result) { result = filemap_splice_read(in, ppos, pipe, len, flags); @@ -668,7 +674,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); - nfs_start_io_write(inode); + error = nfs_start_io_write(inode); + if (error) + return error; result = generic_write_checks(iocb, from); if (result > 0) result = generic_perform_write(iocb, from); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 7e000d782e28..b069385eea17 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1467,7 +1467,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) /* Load the NFS protocol module if we haven't done so yet */ if (!ctx->nfs_mod) { - nfs_mod = get_nfs_version(ctx->version); + nfs_mod = find_nfs_version(ctx->version); if (IS_ERR(nfs_mod)) { ret = PTR_ERR(nfs_mod); goto out_version_unavailable; @@ -1541,7 +1541,7 @@ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) } nfs_copy_fh(ctx->mntfh, src->mntfh); - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; @@ -1633,7 +1633,7 @@ static int nfs_init_fs_context(struct fs_context *fc) } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 810269ee0a50..e278a1ad1ca3 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -263,6 +263,12 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
*/ @@ -274,7 +280,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) @@ -307,8 +314,10 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) &nfs_async_read_completion_ops); netfs = nfs_netfs_alloc(sreq); - if (!netfs) - return netfs_read_subreq_terminated(sreq, -ENOMEM, false); + if (!netfs) { + sreq->error = -ENOMEM; + return netfs_read_subreq_terminated(sreq); + } pgio.pg_netfs = netfs; /* used in completion */ diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 772d485e96d3..9d86868f4998 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -74,7 +74,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) */ netfs->sreq->transferred = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); - netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); + netfs->sreq->error = netfs->error; + netfs_read_subreq_terminated(netfs->sreq); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 542c7d97b235..596f35170137 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -205,12 +205,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; - nfsi->cache_validity |= flags; + flags |= nfsi->cache_validity; + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; - if (inode->i_mapping->nrpages == 0) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - nfs_ooo_clear(nfsi); - } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); @@ -628,23 +631,35 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); - if (nfs_have_delegated_atime(inode)) { - inode_update_timestamps(inode, S_ATIME); - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME; - } + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { - if (nfs_have_delegated_mtime(inode)) { - inode_update_timestamps(inode, S_CTIME | S_MTIME); - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_MTIME); - } + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) @@ -682,15 +697,16 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid &= 
~ATTR_SIZE; } - if (nfs_have_delegated_mtime(inode)) { - if (attr->ia_valid & ATTR_MTIME) { - nfs_update_delegated_mtime(inode); - attr->ia_valid &= ~ATTR_MTIME; - } - if (attr->ia_valid & ATTR_ATIME) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; - } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + nfs_update_timestamps(inode, attr->ia_valid); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + !(attr->ia_valid & ATTR_MTIME)) { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; } /* Optimization: if the end result is no change, don't RPC */ @@ -1408,6 +1424,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); @@ -1633,6 +1656,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 430733e3eff2..e564bd11ba60 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -6,13 +6,14 @@ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> +#include <linux/compiler_attributes.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> #include <linux/nfslocalio.h> #include <linux/wait_bit.h> -#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) +#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; @@ -516,11 +517,11 @@ extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ -extern void nfs_start_io_read(struct inode *inode); +extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); -extern void nfs_start_io_write(struct inode *inode); +extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); -extern void nfs_start_io_direct(struct inode *inode); +extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) diff --git a/fs/nfs/io.c b/fs/nfs/io.c index b5551ed8f648..3388faf2acb9 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -39,19 +39,28 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void +int nfs_start_io_read(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... 
*/ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_o_direct(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** @@ -74,11 +83,15 @@ nfs_end_io_read(struct inode *inode) * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. */ -void +int nfs_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - nfs_block_o_direct(NFS_I(inode), inode); + int err; + + err = down_write_killable(&inode->i_rwsem); + if (!err) + nfs_block_o_direct(NFS_I(inode), inode); + return err; } /** @@ -119,19 +132,28 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void +int nfs_start_io_direct(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_buffered(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index c29cdf51c458..4b8618cf114c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -18,7 +18,6 @@ #include <net/addrconf.h> #include <linux/nfs_common.h> #include <linux/nfslocalio.h> -#include <linux/module.h> #include <linux/bvec.h> #include <linux/nfs.h> @@ -43,10 +42,8 @@ struct nfs_local_fsync_ctx { struct nfsd_file *localio; struct nfs_commit_data *data; struct work_struct work; - struct kref kref; struct completion *done; }; -static void nfs_local_fsync_work(struct work_struct *work); static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); @@ -206,7 +203,8 @@ void nfs_local_probe(struct nfs_client *clp) nfs_local_disable(clp); } - nfs_uuid_begin(&clp->cl_uuid); + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; if (nfs_server_uuid_is_local(clp)) nfs_local_enable(clp); nfs_uuid_end(&clp->cl_uuid); @@ -274,7 +272,7 @@ nfs_local_iocb_free(struct nfs_local_kiocb *iocb) static struct nfs_local_kiocb * nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, gfp_t flags) + struct file *file, gfp_t flags) { struct nfs_local_kiocb *iocb; @@ -287,9 +285,8 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, kfree(iocb); return NULL; } - init_sync_kiocb(&iocb->kiocb, nfs_to->nfsd_file_file(localio)); + init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; - iocb->localio = localio; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; return iocb; @@ -341,7 +338,7 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_to->nfsd_file_put_local(iocb->localio); + nfs_to_nfsd_file_put_local(iocb->localio); nfs_local_iocb_free(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -354,6 +351,12 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) nfs_local_pgio_done(hdr, status); + /* + * Must clear replen otherwise NFSv3 data corruption will occur + * if/when switching from LOCALIO back to using normal RPC. 
+ */ + hdr->res.replen = 0; + if (hdr->res.count != hdr->args.count || hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; @@ -390,13 +393,19 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without read_iter */ + if (!file->f_op->read_iter) + return -EAGAIN; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_KERNEL); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; @@ -521,12 +530,7 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) } if (status < 0) nfs_reset_boot_verifier(inode); - else if (nfs_should_remove_suid(inode)) { - /* Deal with the suid/sgid bit corner case */ - spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); - spin_unlock(&inode->i_lock); - } + nfs_local_pgio_done(hdr, status); } @@ -564,14 +568,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without write_iter */ + if (!file->f_op->write_iter) + return -EAGAIN; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_NOIO); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; switch (hdr->args.stable) { default: @@ -597,16 +607,9 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, const struct rpc_call_ops *call_ops) { int status = 0; - struct file *filp = nfs_to->nfsd_file_file(localio); if (!hdr->args.count) return 0; - /* Don't support filesystems without read_iter/write_iter */ - if (!filp->f_op->read_iter || !filp->f_op->write_iter) { - nfs_local_disable(clp); - status = -EAGAIN; - goto out; - } switch (hdr->rw_mode) { case FMODE_READ: @@ -620,9 +623,11 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, hdr->rw_mode); status = -EINVAL; } -out: + if (status != 0) { - nfs_to->nfsd_file_put_local(localio); + if (status == -EAGAIN) + nfs_local_disable(clp); + nfs_to_nfsd_file_put_local(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } @@ -673,45 +678,17 @@ nfs_local_release_commit_data(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops) { - nfs_to->nfsd_file_put_local(localio); + nfs_to_nfsd_file_put_local(localio); call_ops->rpc_call_done(&data->task, data); call_ops->rpc_release(data); } -static struct nfs_local_fsync_ctx * -nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, - struct nfsd_file *localio, gfp_t flags) -{ - struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); - - if (ctx != NULL) { - ctx->localio = localio; - ctx->data = data; - INIT_WORK(&ctx->work, nfs_local_fsync_work); - kref_init(&ctx->kref); - ctx->done = NULL; - } - return ctx; -} - -static void -nfs_local_fsync_ctx_kref_free(struct kref *kref) -{ - kfree(container_of(kref, struct nfs_local_fsync_ctx, kref)); -} - -static void -nfs_local_fsync_ctx_put(struct nfs_local_fsync_ctx *ctx) -{ - kref_put(&ctx->kref, 
nfs_local_fsync_ctx_kref_free); -} - static void nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) { nfs_local_release_commit_data(ctx->localio, ctx->data, ctx->data->task.tk_ops); - nfs_local_fsync_ctx_put(ctx); + kfree(ctx); } static void @@ -730,6 +707,21 @@ nfs_local_fsync_work(struct work_struct *work) nfs_local_fsync_ctx_free(ctx); } +static struct nfs_local_fsync_ctx * +nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, + struct nfsd_file *localio, gfp_t flags) +{ + struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); + + if (ctx != NULL) { + ctx->localio = localio; + ctx->data = data; + INIT_WORK(&ctx->work, nfs_local_fsync_work); + ctx->done = NULL; + } + return ctx; +} + int nfs_local_commit(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, int how) @@ -744,7 +736,7 @@ int nfs_local_commit(struct nfsd_file *localio, } nfs_local_init_commit(data, call_ops); - kref_get(&ctx->kref); + if (how & FLUSH_SYNC) { DECLARE_COMPLETION_ONSTACK(done); ctx->done = &done; @@ -752,6 +744,6 @@ int nfs_local_commit(struct nfsd_file *localio, wait_for_completion(&done); } else queue_work(nfsiod_workqueue, &ctx->work); - nfs_local_fsync_ctx_put(ctx); + return 0; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e7494cdd957e..2d53574da605 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -182,7 +182,7 @@ struct vfsmount *nfs_d_automount(struct path *path) ctx->version = client->rpc_ops->version; ctx->minorversion = client->cl_minorversion; ctx->nfs_mod = client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ret = client->rpc_ops->submount(fc, server); if (ret < 0) { diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 0d3ce0460e35..8a5f51be013a 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -19,10 +19,10 @@ struct nfs_subversion { const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ const struct xattr_handler * const *xattr; /* NFS xattr handlers */ - struct list_head list; /* List of NFS versions */ }; -struct nfs_subversion *get_nfs_version(unsigned int); +struct nfs_subversion *find_nfs_version(unsigned int); +int get_nfs_version(struct nfs_subversion *); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 28704f924612..531c9c20ef1d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -218,7 +218,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, if (dst_server != src_server) { spin_lock(&src_server->nfs_client->cl_lock); - list_add_tail(©->src_copies, &src_server->ss_copies); + list_add_tail(©->src_copies, &src_server->ss_src_copies); spin_unlock(&src_server->nfs_client->cl_lock); } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index b6e3d8f77b91..37d79400e5f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -802,7 +802,7 @@ static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct inode *inode; @@ -867,7 +867,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) static enum lru_status entry_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + 
struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct nfs4_xattr_bucket *bucket; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cd2fbde2e6d7..405f17e6e0b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2603,12 +2603,14 @@ static void nfs4_open_release(void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state *state = NULL; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) { + nfs_release_seqid(data->o_arg.seqid); + goto out_free; + } /* If this request hasn't been cancelled, do nothing */ if (!data->cancelled) goto out_free; - /* In case of error, no cleanup! */ - if (data->rpc_status != 0 || !data->rpc_done) - goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) goto out_free; @@ -3452,6 +3454,10 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, adjust_flags |= NFS_INO_INVALID_MODE; if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 581864a15888..9a9f60a2291b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1083,14 +1083,12 @@ void nfs_release_seqid(struct nfs_seqid *seqid) return; sequence = seqid->sequence; spin_lock(&sequence->lock); - list_del_init(&seqid->list); - if (!list_empty(&sequence->list)) { - struct nfs_seqid *next; - - next = list_first_entry(&sequence->list, - struct nfs_seqid, list); + if (list_is_first(&seqid->list, &sequence->list) && + !list_is_singular(&sequence->list)) { + struct nfs_seqid *next = list_next_entry(seqid, list); rpc_wake_up_queued_task(&sequence->wait, next->task); } + list_del_init(&seqid->list); spin_unlock(&sequence->lock); } @@ -1585,7 +1583,7 @@ static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state complete(©->completion); } } - list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) { + list_for_each_entry(copy, &sp->so_server->ss_src_copies, src_copies) { if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && !nfs4_stateid_match_other(&state->stateid, ©->parent_src_state->stateid))) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0d16b383a452..5f582713bf05 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1308,7 +1308,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9723b6c53397..aeb715b4a690 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -885,7 +886,15 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request, ctx->timeo, ctx->retrans); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. 
Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ead2dc55952d..50fa539611f5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -144,6 +144,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + /** * nfs_folio_find_head_request - find head request associated with a folio * @folio: pointer to folio @@ -540,7 +565,6 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) struct inode *inode = folio->mapping->host; struct nfs_page *head, *subreq; struct nfs_commit_info cinfo; - bool removed; int ret; /* @@ -565,18 +589,18 @@ retry: goto retry; } - ret = nfs_page_group_lock(head); + ret = nfs_cancel_remove_inode(head, inode); if (ret < 0) goto out_unlock; - removed = test_bit(PG_REMOVE, &head->wb_flags); + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; /* lock each request in the page group */ for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - if (test_bit(PG_REMOVE, &subreq->wb_flags)) - removed = true; ret = nfs_page_group_lock_subreq(head, subreq); if (ret < 0) goto out_unlock; @@ -584,21 +608,6 @@ retry: nfs_page_group_unlock(head); - /* - * If PG_REMOVE is set on any request, I/O on that request has - * completed, but some requests were still under I/O at the time - * we locked the head request. - * - * In that case the above wait for all requests means that all I/O - * has now finished, and we can restart from a clean slate. Let the - * old requests go away and start from scratch instead. 
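/*
 * Illustrative only, not part of the patch: nfs_page_set_inode_ref() in the
 * fs/nfs/write.c hunk above takes the extra inode reference at most once by
 * guarding it with test_and_set_bit(PG_INODE_REF, ...). A userspace sketch of
 * that "claim the flag once, then take the reference" idiom using C11 atomics
 * (the names here are invented for the example):
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_req {
	atomic_uint flags;    /* stands in for req->wb_flags */
	atomic_uint refcount; /* stands in for req->wb_kref  */
};

#define FAKE_INODE_REF 0x1u   /* stands in for PG_INODE_REF */

static void set_inode_ref(struct fake_req *req)
{
	unsigned int old = atomic_fetch_or(&req->flags, FAKE_INODE_REF);

	/* only the caller that flipped the bit takes the reference */
	if (!(old & FAKE_INODE_REF))
		atomic_fetch_add(&req->refcount, 1);
}

int main(void)
{
	struct fake_req req = { .flags = 0, .refcount = 1 };

	set_inode_ref(&req);
	set_inode_ref(&req); /* second call is a no-op */
	printf("refcount = %u\n", atomic_load(&req.refcount)); /* prints 2 */
	return 0;
}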
- */ - if (removed) { - nfs_unroll_locks(head, head); - nfs_unlock_and_release_request(head); - goto retry; - } - nfs_init_cinfo_from_inode(&cinfo, inode); nfs_join_page_group(head, &cinfo, inode); return head; @@ -772,7 +781,8 @@ static void nfs_inode_add_request(struct nfs_page *req) nfs_lock_request(req); spin_lock(&mapping->i_private_lock); set_bit(PG_MAPPED, &req->wb_flags); - folio_attach_private(folio, req); + folio_set_private(folio); + folio->private = req; spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an @@ -796,7 +806,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&mapping->i_private_lock); if (likely(folio)) { - folio_detach_private(folio); + folio->private = NULL; + folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 42b479b9191f..a74ec08f6c96 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -5,7 +5,7 @@ */ #include <linux/module.h> -#include <linux/rculist.h> +#include <linux/list.h> #include <linux/nfslocalio.h> #include <net/netns/generic.h> @@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock); */ static LIST_HEAD(nfs_uuids); -void nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { nfs_uuid->net = NULL; nfs_uuid->dom = NULL; - uuid_gen(&nfs_uuid->uuid); + INIT_LIST_HEAD(&nfs_uuid->list); +} +EXPORT_SYMBOL_GPL(nfs_uuid_init); +bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +{ spin_lock(&nfs_uuid_lock); - list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids); + /* Is this nfs_uuid already in use? */ + if (!list_empty(&nfs_uuid->list)) { + spin_unlock(&nfs_uuid_lock); + return false; + } + uuid_gen(&nfs_uuid->uuid); + list_add_tail(&nfs_uuid->list, &nfs_uuids); spin_unlock(&nfs_uuid_lock); + + return true; } EXPORT_SYMBOL_GPL(nfs_uuid_begin); @@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { spin_lock(&nfs_uuid_lock); - list_del_init(&nfs_uuid->list); + if (nfs_uuid->net == NULL) + list_del_init(&nfs_uuid->list); spin_unlock(&nfs_uuid_lock); } } @@ -143,7 +156,8 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, cred, nfs_fh, fmode); if (IS_ERR(localio)) - nfs_to->nfsd_serv_put(net); + nfs_to_nfsd_net_put(net); + return localio; } EXPORT_SYMBOL_GPL(nfs_open_local_fh); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 93e33d1ee891..4dc327e02456 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -27,7 +27,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) int flags = nfsexp_flags(cred, exp); /* discard any old override before preparing the new set */ - revert_creds(get_cred(current_real_cred())); + put_cred(revert_creds(get_cred(current_real_cred()))); new = prepare_creds(); if (!new) return -ENOMEM; @@ -80,7 +80,6 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); put_cred(override_creds(new)); - put_cred(new); return 0; oom: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c82d8e3e0d4f..aa4712362b3b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1078,12 +1078,14 @@ static struct svc_export *exp_find(struct cache_detail *cd, * check_nfsd_access - check if access to export is allowed. * @exp: svc_export that is being accessed. 
* @rqstp: svc_rqst attempting to access @exp (will be NULL for LOCALIO). + * @may_bypass_gss: reduce strictness of authorization check * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) { struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; struct svc_xprt *xprt; @@ -1140,6 +1142,23 @@ ok: if (nfsd4_spo_must_allow(rqstp)) return nfs_ok; + /* Some calls may be processed without authentication + * on GSS exports. For example NFS2/3 calls on root + * directory, see section 2.3.2 of rfc 2623. + * For "may_bypass_gss" check that export has really + * enabled some flavor with authentication (GSS or any + * other) and also check that the used auth flavor is + * without authentication (none or sys). + */ + if (may_bypass_gss && ( + rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || + rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)) { + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor >= RPC_AUTH_DES) + return 0; + } + } + denied: return nfserr_wrongsec; } @@ -1406,9 +1425,12 @@ static int e_show(struct seq_file *m, void *p) return 0; } - exp_get(exp); + if (!cache_get_rcu(&exp->h)) + return 0; + if (cache_check(cd, &exp->h, NULL)) return 0; + exp_put(exp); return svc_export_show(m, cd, cp); } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 3794ae253a70..4d92b99c1ffd 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -101,7 +101,8 @@ struct svc_expkey { struct svc_cred; int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp); -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss); /* * Function declarations diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ea1ca374cdab..dc5c9d8e8202 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -391,19 +391,19 @@ nfsd_file_put(struct nfsd_file *nf) } /** - * nfsd_file_put_local - put the reference to nfsd_file and local nfsd_serv - * @nf: nfsd_file of which to put the references + * nfsd_file_put_local - put nfsd_file reference and arm nfsd_serv_put in caller + * @nf: nfsd_file of which to put the reference * - * First put the reference of the nfsd_file and then put the - * reference to the associated nn->nfsd_serv. + * First save the associated net to return to caller, then put + * the reference of the nfsd_file. 
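/*
 * Illustrative only, not part of the patch: the nfsd_file_put_local() comment
 * above describes the new contract where the put routine hands the associated
 * net back to the caller, which then drops the server reference itself. A
 * userspace sketch of that "put A, return the handle needed to put B"
 * convention (all names invented for the example):
 */
#include <stdio.h>

struct fake_net  { int serv_refs; };
struct fake_file { struct fake_net *net; int refs; };

static struct fake_net *fake_file_put_local(struct fake_file *f)
{
	struct fake_net *net = f->net;   /* save before the file may go away */

	f->refs--;                       /* put the file reference */
	return net;                      /* caller still owns a net reference */
}

static void fake_serv_put(struct fake_net *net)
{
	net->serv_refs--;
}

int main(void)
{
	struct fake_net net = { .serv_refs = 1 };
	struct fake_file file = { .net = &net, .refs = 1 };

	/* caller-side pattern: put the file first, then the server */
	fake_serv_put(fake_file_put_local(&file));
	printf("file refs %d, serv refs %d\n", file.refs, net.serv_refs);
	return 0;
}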
*/ -void +struct net * nfsd_file_put_local(struct nfsd_file *nf) { struct net *net = nf->nf_net; nfsd_file_put(nf); - nfsd_serv_put(net); + return net; } /** @@ -487,7 +487,6 @@ void nfsd_file_net_dispose(struct nfsd_net *nn) * nfsd_file_lru_cb - Examine an entry on the LRU list * @item: LRU entry to examine * @lru: controlling LRU - * @lock: LRU list lock (unused) * @arg: dispose list * * Return values: @@ -497,9 +496,7 @@ void nfsd_file_net_dispose(struct nfsd_net *nn) */ static enum lru_status nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lock, void *arg) - __releases(lock) - __acquires(lock) + void *arg) { struct list_head *head = arg; struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); @@ -751,7 +748,7 @@ nfsd_file_cache_init(void) ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); if (ret) - return ret; + goto out; ret = -ENOMEM; nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); @@ -803,6 +800,8 @@ nfsd_file_cache_init(void) INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); out: + if (ret) + clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags); return ret; out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); @@ -1048,7 +1047,7 @@ retry: * the last one however, since we should hold another. */ if (nfsd_file_lru_remove(nf)) - WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); + refcount_dec(&nf->nf_ref); goto wait_for_construction; } @@ -1121,8 +1120,7 @@ open_file: status = nfs_ok; trace_nfsd_file_opened(nf, status); } else { - ret = nfsd_open_verified(rqstp, fhp, may_flags, - &nf->nf_file); + ret = nfsd_open_verified(fhp, may_flags, &nf->nf_file); if (ret == -EOPENSTALE && stale_retry) { stale_retry = false; nfsd_file_unhash(nf); @@ -1250,7 +1248,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, NULL, pnf, true); - revert_creds(save_cred); + put_cred(revert_creds(save_cred)); return beres; } diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index cadf3c2689c4..d5db6b34ba30 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -55,7 +55,7 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -void nfsd_file_put_local(struct nfsd_file *nf); +struct net *nfsd_file_put_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 291e9c69cae4..f441cb9f74d5 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -53,7 +53,7 @@ void nfsd_localio_ops_init(void) * * On successful return, returned nfsd_file will have its nf_net member * set. Caller (NFS client) is responsible for calling nfsd_serv_put and - * nfsd_file_put (via nfs_to->nfsd_file_put_local). + * nfsd_file_put (via nfs_to_nfsd_file_put_local). */ struct nfsd_file * nfsd_open_local_fh(struct net *net, struct auth_domain *dom, diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 46a7f9b813e5..edc9f75dc75c 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -38,11 +38,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, memcpy(&fh.fh_handle.fh_raw, f->data, f->size); fh.fh_export = NULL; + /* + * Allow BYPASS_GSS as some client implementations use AUTH_SYS + * for NLM even when GSS is used for NFS. 
+ * Allow OWNER_OVERRIDE as permission might have been changed + * after the file was opened. + * Pass MAY_NLM so that authentication can be completely bypassed + * if NFSEXP_NOAUTHNLM is set. Some older clients use AUTH_NULL + * for NLM requests. + */ access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; - access |= NFSD_MAY_LOCK; + access |= NFSD_MAY_NLM | NFSD_MAY_OWNER_OVERRIDE | NFSD_MAY_BYPASS_GSS; nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); - /* We return nlm error codes as nlm doesn't know + /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. */ switch (nfserr) { diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 96e786b5e544..936ea1ad9586 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -198,8 +198,6 @@ summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) memset(pas, 0, sizeof(*pas)); pas->mask = 07; - pe = acl->a_entries + acl->a_count; - FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index b5b3ab9d719a..c083e539e898 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -287,17 +287,17 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, u32 length; __be32 *p; - p = xdr_inline_decode(xdr, 4 + 4); + p = xdr_inline_decode(xdr, XDR_UNIT); if (unlikely(p == NULL)) goto out_overflow; - hdr->status = be32_to_cpup(p++); + hdr->status = be32_to_cpup(p); /* Ignore the tag */ - length = be32_to_cpup(p++); - p = xdr_inline_decode(xdr, length + 4); - if (unlikely(p == NULL)) + if (xdr_stream_decode_u32(xdr, &length) < 0) + goto out_overflow; + if (xdr_inline_decode(xdr, length) == NULL) + goto out_overflow; + if (xdr_stream_decode_u32(xdr, &hdr->nops) < 0) goto out_overflow; - p += XDR_QUADLEN(length); - hdr->nops = be32_to_cpup(p); return 0; out_overflow: return -EIO; @@ -364,13 +364,29 @@ encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, struct nfs4_delegation *dp = container_of(fattr, struct nfs4_delegation, dl_cb_fattr); struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + u32 bmap[1]; + + bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); encode_nfs_fh4(xdr, fh); - encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap)); + encode_bitmap4(xdr, bmap, ARRAY_SIZE(bmap)); hdr->nops++; } +static u32 highest_slotid(struct nfsd4_session *ses) +{ + u32 idx; + + spin_lock(&ses->se_lock); + idx = fls(~ses->se_cb_slot_avail); + if (idx > 0) + --idx; + idx = max(idx, ses->se_cb_highest_slot); + spin_unlock(&ses->se_lock); + return idx; +} + /* * CB_SEQUENCE4args * @@ -397,15 +413,40 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, encode_sessionid4(xdr, session); p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); - *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ - *p++ = xdr_zero; /* csa_slotid */ - *p++ = xdr_zero; /* csa_highest_slotid */ + *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ + *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ + *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* csa_cachethis */ xdr_encode_empty_array(p); /* csa_referring_call_lists */ hdr->nops++; } +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target) +{ + /* No need to do anything if nothing changed */ + if (likely(target == READ_ONCE(ses->se_cb_highest_slot))) + return; + + 
spin_lock(&ses->se_lock); + if (target > ses->se_cb_highest_slot) { + int i; + + target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1); + + /* + * Growing the slot table. Reset any new sequences to 1. + * + * NB: There is some debate about whether the RFC requires this, + * but the Linux client expects it. + */ + for (i = ses->se_cb_highest_slot + 1; i <= target; ++i) + ses->se_cb_seq_nr[i] = 1; + } + ses->se_cb_highest_slot = target; + spin_unlock(&ses->se_lock); +} + /* * CB_SEQUENCE4resok * @@ -433,7 +474,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_session *session = cb->cb_clp->cl_cb_session; int status = -ESERVERFAULT; __be32 *p; - u32 dummy; + u32 seqid, slotid, target; /* * If the server returns different values for sessionID, slotID or @@ -449,21 +490,22 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, } p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); - dummy = be32_to_cpup(p++); - if (dummy != session->se_cb_seq_nr) { + seqid = be32_to_cpup(p++); + if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) { dprintk("NFS: %s Invalid sequence number\n", __func__); goto out; } - dummy = be32_to_cpup(p++); - if (dummy != 0) { + slotid = be32_to_cpup(p++); + if (slotid != cb->cb_held_slot) { dprintk("NFS: %s Invalid slotid\n", __func__); goto out; } - /* - * FIXME: process highest slotid and target highest slotid - */ + p++; // ignore current highest slot value + + target = be32_to_cpup(p++); + update_cb_slot_table(session, target); status = 0; out: cb->cb_seq_status = status; @@ -1058,7 +1100,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -1164,6 +1206,22 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) spin_unlock(&clp->cl_lock); } +static int grab_slot(struct nfsd4_session *ses) +{ + int idx; + + spin_lock(&ses->se_lock); + idx = ffs(ses->se_cb_slot_avail) - 1; + if (idx < 0 || idx > ses->se_cb_highest_slot) { + spin_unlock(&ses->se_lock); + return -1; + } + /* clear the bit for the slot */ + ses->se_cb_slot_avail &= ~BIT(idx); + spin_unlock(&ses->se_lock); + return idx; +} + /* * There's currently a single callback channel slot. * If the slot is available, then mark it busy. 
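/*
 * Illustrative only, not part of the patch: the callback-slot code above keeps
 * a bitmap of free backchannel slots (se_cb_slot_avail), grabs the lowest free
 * slot with ffs(), and returns a slot by setting its bit again. A compact
 * userspace model of that bitmap arithmetic (locking omitted, names invented
 * for the example):
 */
#include <stdio.h>
#include <stdint.h>

static int fake_grab_slot(uint32_t *avail, int highest)
{
	int idx = __builtin_ffs(*avail) - 1;   /* lowest set bit, or -1 */

	if (idx < 0 || idx > highest)
		return -1;                     /* no usable slot right now */
	*avail &= ~(1u << idx);                /* mark slot busy */
	return idx;
}

static void fake_release_slot(uint32_t *avail, int idx)
{
	*avail |= 1u << idx;                   /* mark slot free again */
}

int main(void)
{
	uint32_t avail = ~0u;   /* all 32 slots free */
	int highest = 3;        /* pretend the client offered 4 slots */

	int a = fake_grab_slot(&avail, highest); /* 0 */
	int b = fake_grab_slot(&avail, highest); /* 1 */
	fake_release_slot(&avail, a);
	printf("a=%d b=%d next=%d\n", a, b, fake_grab_slot(&avail, highest));
	return 0;
}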
Otherwise, set the @@ -1172,28 +1230,32 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = clp->cl_cb_session; - if (!cb->cb_holds_slot && - test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + if (cb->cb_held_slot >= 0) + return true; + cb->cb_held_slot = grab_slot(ses); + if (cb->cb_held_slot < 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); /* Race breaker */ - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { - dprintk("%s slot is busy\n", __func__); + cb->cb_held_slot = grab_slot(ses); + if (cb->cb_held_slot < 0) return false; - } rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); } - cb->cb_holds_slot = true; return true; } static void nfsd41_cb_release_slot(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = clp->cl_cb_session; - if (cb->cb_holds_slot) { - cb->cb_holds_slot = false; - clear_bit(0, &clp->cl_cb_slot_busy); + if (cb->cb_held_slot >= 0) { + spin_lock(&ses->se_lock); + ses->se_cb_slot_avail |= BIT(cb->cb_held_slot); + spin_unlock(&ses->se_lock); + cb->cb_held_slot = -1; rpc_wake_up_next(&clp->cl_cb_waitq); } } @@ -1210,8 +1272,8 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) } /* - * TODO: cb_sequence should support referring call lists, cachethis, multiple - * slots, and mark callback channel down on communication errors. + * TODO: cb_sequence should support referring call lists, cachethis, + * and mark callback channel down on communication errors. */ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { @@ -1253,7 +1315,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } - if (!cb->cb_holds_slot) + if (cb->cb_held_slot < 0) goto need_restart; /* This is the operation status code for CB_SEQUENCE */ @@ -1267,10 +1329,10 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback * If CB_SEQUENCE returns an error, then the state of the slot * (sequence ID, cached reply) MUST NOT change. */ - ++session->se_cb_seq_nr; + ++session->se_cb_seq_nr[cb->cb_held_slot]; break; case -ESERVERFAULT: - ++session->se_cb_seq_nr; + ++session->se_cb_seq_nr[cb->cb_held_slot]; nfsd4_mark_cb_fault(cb->cb_clp); ret = false; break; @@ -1296,17 +1358,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback case -NFS4ERR_BADSLOT: goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: - if (session->se_cb_seq_nr != 1) { - session->se_cb_seq_nr = 1; + if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) { + session->se_cb_seq_nr[cb->cb_held_slot] = 1; goto retry_nowait; } break; default: nfsd4_mark_cb_fault(cb->cb_clp); } - nfsd41_cb_release_slot(cb); - trace_nfsd_cb_free_slot(task, cb); + nfsd41_cb_release_slot(cb); if (RPC_SIGNALLED(task)) goto need_restart; @@ -1524,7 +1585,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; cb->cb_need_restart = false; - cb->cb_holds_slot = false; + cb->cb_held_slot = -1; } /** diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b5a6bf4f459f..ad44ad49274f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -57,6 +57,8 @@ module_param(inter_copy_offload_enable, bool, 0644); MODULE_PARM_DESC(inter_copy_offload_enable, "Enable inter server to server copy offload. 
Default: false"); +static void cleanup_async_copy(struct nfsd4_copy *copy); + #ifdef CONFIG_NFSD_V4_2_INTER_SSC static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */ module_param(nfsd4_ssc_umount_timeout, int, 0644); @@ -1276,23 +1278,88 @@ out: return status; } +/** + * nfsd4_has_active_async_copies - Check for ongoing copy operations + * @clp: Client to be checked + * + * NFSD maintains state for async COPY operations after they complete, + * and this state remains in the nfs4_client's async_copies list. + * Ongoing copies should block the destruction of the nfs4_client, but + * completed copies should not. + * + * Return values: + * %true: At least one active async COPY is ongoing + * %false: No active async COPY operations were found + */ +bool nfsd4_has_active_async_copies(struct nfs4_client *clp) +{ + struct nfsd4_copy *copy; + bool result = false; + + spin_lock(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { + if (!test_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags) && + !test_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) { + result = true; + break; + } + } + spin_unlock(&clp->async_lock); + return result; +} + +/** + * nfsd4_async_copy_reaper - Purge completed copies + * @nn: Network namespace with possible active copy information + */ +void nfsd4_async_copy_reaper(struct nfsd_net *nn) +{ + struct nfs4_client *clp; + struct nfsd4_copy *copy; + LIST_HEAD(reaplist); + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + struct list_head *pos, *next; + + spin_lock(&clp->async_lock); + list_for_each_safe(pos, next, &clp->async_copies) { + copy = list_entry(pos, struct nfsd4_copy, copies); + if (test_bit(NFSD4_COPY_F_OFFLOAD_DONE, ©->cp_flags)) { + if (--copy->cp_ttl) { + list_del_init(©->copies); + list_add(©->copies, &reaplist); + } + } + } + spin_unlock(&clp->async_lock); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + copy = list_first_entry(&reaplist, struct nfsd4_copy, copies); + list_del_init(©->copies); + cleanup_async_copy(copy); + } +} + static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(©->refcount)) return; - atomic_dec(©->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } static void nfsd4_stop_copy(struct nfsd4_copy *copy) { + trace_nfsd_copy_async_cancel(copy); if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) kthread_stop(copy->copy_task); nfs4_put_copy(copy); } -static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) +static struct nfsd4_copy *nfsd4_unhash_copy(struct nfs4_client *clp) { struct nfsd4_copy *copy = NULL; @@ -1301,6 +1368,9 @@ static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) copy = list_first_entry(&clp->async_copies, struct nfsd4_copy, copies); refcount_inc(©->refcount); + copy->cp_clp = NULL; + if (!list_empty(©->copies)) + list_del_init(©->copies); } spin_unlock(&clp->async_lock); return copy; @@ -1310,7 +1380,7 @@ void nfsd4_shutdown_copy(struct nfs4_client *clp) { struct nfsd4_copy *copy; - while ((copy = nfsd4_get_copy(clp)) != NULL) + while ((copy = nfsd4_unhash_copy(clp)) != NULL) nfsd4_stop_copy(copy); } #ifdef CONFIG_NFSD_V4_2_INTER_SSC @@ -1598,8 +1668,10 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) { struct nfsd4_cb_offload *cbo = container_of(cb, struct nfsd4_cb_offload, co_cb); + struct nfsd4_copy *copy = + container_of(cbo, struct nfsd4_copy, cp_cb_offload); - kfree(cbo); + set_bit(NFSD4_COPY_F_OFFLOAD_DONE, ©->cp_flags); } static int 
nfsd4_cb_offload_done(struct nfsd4_callback *cb, @@ -1609,6 +1681,13 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, container_of(cb, struct nfsd4_cb_offload, co_cb); trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task); + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (cbo->co_retries--) { + rpc_delay(task, 1 * HZ); + return 0; + } + } return 1; } @@ -1732,15 +1811,12 @@ static void cleanup_async_copy(struct nfsd4_copy *copy) static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) { - struct nfsd4_cb_offload *cbo; - - cbo = kzalloc(sizeof(*cbo), GFP_KERNEL); - if (!cbo) - return; + struct nfsd4_cb_offload *cbo = ©->cp_cb_offload; memcpy(&cbo->co_res, ©->cp_res, sizeof(copy->cp_res)); memcpy(&cbo->co_fh, ©->fh, sizeof(copy->fh)); cbo->co_nfserr = copy->nfserr; + cbo->co_retries = 5; nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); @@ -1786,10 +1862,14 @@ static int nfsd4_do_async_copy(void *data) } do_callback: + /* The kthread exits forthwith. Ensure that a subsequent + * OFFLOAD_CANCEL won't try to kill it again. */ + set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags); + set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); trace_nfsd_copy_async_done(copy); nfsd4_send_cb_offload(copy); - cleanup_async_copy(copy); + atomic_dec(©->cp_nn->pending_async_copies); return 0; } @@ -1841,26 +1921,25 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!async_copy) goto out_err; async_copy->cp_nn = nn; - /* Arbitrary cap on number of pending async copy operations */ - if (atomic_inc_return(&nn->pending_async_copies) > - (int)rqstp->rq_pool->sp_nrthreads) { - atomic_dec(&nn->pending_async_copies); - goto out_err; - } INIT_LIST_HEAD(&async_copy->copies); refcount_set(&async_copy->refcount, 1); + async_copy->cp_ttl = NFSD_COPY_INITIAL_TTL; + /* Arbitrary cap on number of pending async copy operations */ + if (atomic_inc_return(&nn->pending_async_copies) > + (int)rqstp->rq_pool->sp_nrthreads) + goto out_dec_async_copy_err; async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) - goto out_err; + goto out_dec_async_copy_err; if (!nfs4_init_copy_state(nn, copy)) - goto out_err; + goto out_dec_async_copy_err; memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); @@ -1875,6 +1954,9 @@ out: trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: if (nfsd4_ssc_is_inter(copy)) { /* @@ -2782,6 +2864,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (op->opdesc->op_get_currentstateid) op->opdesc->op_get_currentstateid(cstate, &op->u); op->status = op->opdesc->op_func(rqstp, cstate, &op->u); + trace_nfsd_compound_op_err(rqstp, op->opnum, op->status); /* Only from SEQUENCE */ if (cstate->status == nfserr_replay_cache) { @@ -2798,7 +2881,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (current_fh->fh_export && need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(current_fh->fh_export, rqstp); + op->status = check_nfsd_access(current_fh->fh_export, rqstp, false); } encode_op: if (op->status == nfserr_replay_me) { @@ -3454,6 
+3537,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = nfsd4_exchange_id, + .op_release = nfsd4_exchange_id_release, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index b7d61eb8afe9..7f2ceeb118a4 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -82,14 +82,13 @@ nfs4_save_creds(const struct cred **original_creds) new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); - put_cred(new); return 0; } static void nfs4_reset_creds(const struct cred *original) { - revert_creds(original); + put_cred(revert_creds(original)); } static void @@ -659,7 +658,8 @@ nfs4_reset_recoverydir(char *recdir) return status; status = -ENOTDIR; if (d_is_dir(path.dentry)) { - strcpy(user_recovery_dirname, recdir); + strscpy(user_recovery_dirname, recdir, + sizeof(user_recovery_dirname)); status = 0; } path_put(&path); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ac1859c7cc9d..741b9449f727 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -149,14 +149,14 @@ void nfsd4_destroy_laundry_wq(void) static bool is_session_dead(struct nfsd4_session *ses) { - return ses->se_flags & NFS4_SESSION_DEAD; + return ses->se_dead; } static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) return nfserr_jukebox; - ses->se_flags |= NFS4_SESSION_DEAD; + ses->se_dead = true; return nfs_ok; } @@ -572,13 +572,6 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static void nfsd4_free_file_rcu(struct rcu_head *rcu) -{ - struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); - - kmem_cache_free(file_slab, fp); -} - void put_nfs4_file(struct nfs4_file *fi) { @@ -586,7 +579,7 @@ put_nfs4_file(struct nfs4_file *fi) nfsd4_file_hash_remove(fi); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); - call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); + kfree_rcu(fi, fi_rcu); } } @@ -1184,7 +1177,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); dp->dl_cb_fattr.ncf_file_modified = false; - dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -1359,21 +1351,47 @@ static void destroy_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } +/** + * revoke_delegation - perform nfs4 delegation structure cleanup + * @dp: pointer to the delegation + * + * This function assumes that it's called either from the administrative + * interface (nfsd4_revoke_states()) that's revoking a specific delegation + * stateid or it's called from a laundromat thread (nfsd4_landromat()) that + * determined that this specific state has expired and needs to be revoked + * (both mark state with the appropriate stid sc_status mode). It is also + * assumed that a reference was taken on the @dp state. + * + * If this function finds that the @dp state is SC_STATUS_FREED it means + * that a FREE_STATEID operation for this stateid has been processed and + * we can proceed to removing it from recalled list. 
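/*
 * Illustrative only, not part of the patch: the revoke_delegation() comment
 * above describes a small handshake between revocation and FREE_STATEID.
 * Whichever side runs second sees the flag left by the first and finishes the
 * cleanup. A sequential userspace sketch of that two-flag protocol (the flag
 * names mirror the diff, everything else is invented):
 */
#include <stdio.h>

#define SC_FREEABLE 0x1  /* revoke ran first; FREE_STATEID must unlink */
#define SC_FREED    0x2  /* FREE_STATEID ran first; revoke must unlink */

struct fake_stid { unsigned int status; int on_revoked_list; };

static void fake_revoke(struct fake_stid *s)
{
	if (s->status & SC_FREED) {          /* client already freed it */
		s->on_revoked_list = 0;
		return;
	}
	s->on_revoked_list = 1;              /* park it on the revoked list */
	s->status |= SC_FREEABLE;            /* let FREE_STATEID unlink it */
}

static void fake_free_stateid(struct fake_stid *s)
{
	if (s->status & SC_FREEABLE)         /* revoke parked it already */
		s->on_revoked_list = 0;
	s->status |= SC_FREED;
}

int main(void)
{
	struct fake_stid s = { 0, 0 };

	fake_revoke(&s);                     /* revoke first ... */
	fake_free_stateid(&s);               /* ... then the client frees it */
	printf("still on revoked list? %d\n", s.on_revoked_list); /* 0 */
	return 0;
}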
However, if @dp state + * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked + * list and wait for the FREE_STATEID to arrive from the client. At the same + * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the + * nfsd4_free_stateid() function that this stateid has already been added + * to the cl_revoked list and that nfsd4_free_stateid() is now responsible + * for removing it from the list. Inspection of where the delegation state + * in the revocation process is protected by the clp->cl_lock. + */ static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); + WARN_ON_ONCE(!(dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); - if (dp->dl_stid.sc_status & - (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { - spin_lock(&clp->cl_lock); - refcount_inc(&dp->dl_stid.sc_count); - list_add(&dp->dl_recall_lru, &clp->cl_revoked); - spin_unlock(&clp->cl_lock); + spin_lock(&clp->cl_lock); + if (dp->dl_stid.sc_status & SC_STATUS_FREED) { + list_del_init(&dp->dl_recall_lru); + goto out; } + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + dp->dl_stid.sc_status |= SC_STATUS_FREEABLE; +out: + spin_unlock(&clp->cl_lock); destroy_unhashed_deleg(dp); } @@ -1634,6 +1652,14 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) free_ol_stateid_reaplist(&reaplist); } +static bool nfs4_openowner_unhashed(struct nfs4_openowner *oo) +{ + lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + + return list_empty(&oo->oo_owner.so_strhash) && + list_empty(&oo->oo_perclient); +} + static void unhash_openowner_locked(struct nfs4_openowner *oo) { struct nfs4_client *clp = oo->oo_owner.so_client; @@ -1780,6 +1806,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb) mutex_unlock(&stp->st_mutex); break; case SC_TYPE_DELEG: + refcount_inc(&stid->sc_count); dp = delegstateid(stid); spin_lock(&state_lock); if (!unhash_delegation_locked( @@ -1983,8 +2010,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, } memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); - memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); - + new->se_cb_slot_avail = ~0U; + new->se_cb_highest_slot = min(battrs->maxreqs - 1, + NFSD_BC_SLOT_TABLE_SIZE - 1); + spin_lock_init(&new->se_lock); return new; out_free: while (i--) @@ -2115,11 +2144,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru INIT_LIST_HEAD(&new->se_conns); - new->se_cb_seq_nr = 1; - new->se_flags = cses->flags; + atomic_set(&new->se_ref, 0); + new->se_dead = false; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; - atomic_set(&new->se_ref, 0); + + for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx) + new->se_cb_seq_nr[idx] = 1; + idx = hash_sessionid(&new->se_sessionid); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); @@ -2212,21 +2244,16 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) return 1; } -/* - * XXX Should we use a slab cache ? - * This type of memory management is somewhat inefficient, but we use it - * anyway since SETCLIENTID is not a common operation. 
- */ static struct nfs4_client *alloc_client(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client *clp; int i; - if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) { + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients && + atomic_read(&nn->nfsd_courtesy_clients) > 0) mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); - return NULL; - } + clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; @@ -3133,7 +3160,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, kref_init(&clp->cl_nfsdfs.cl_ref); nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); clp->cl_time = ktime_get_boottime_seconds(); - clear_bit(0, &clp->cl_cb_slot_busy); copy_verf(clp, verf); memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; @@ -3460,7 +3486,7 @@ static bool client_has_state(struct nfs4_client *clp) #endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions) - || !list_empty(&clp->async_copies); + || nfsd4_has_active_async_copies(clp); } static __be32 copy_impl_id(struct nfs4_client *clp, @@ -3498,6 +3524,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); + exid->server_impl_name = kasprintf(GFP_KERNEL, "%s %s %s %s", + utsname()->sysname, utsname()->release, + utsname()->version, utsname()->machine); + if (!exid->server_impl_name) + return nfserr_jukebox; + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; @@ -3635,6 +3667,23 @@ out_copy: exid->seqid = conf->cl_cs_slot.sl_seqid + 1; nfsd4_set_ex_flags(conf, exid); + exid->nii_domain.len = sizeof("kernel.org") - 1; + exid->nii_domain.data = "kernel.org"; + + /* + * Note that RFC 8881 places no length limit on + * nii_name, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes. + */ + exid->nii_name.len = strlen(exid->server_impl_name); + if (exid->nii_name.len > NFS4_OPAQUE_LIMIT) + exid->nii_name.len = NFS4_OPAQUE_LIMIT; + exid->nii_name.data = exid->server_impl_name; + + /* just send zeros - the date is in nii_name */ + exid->nii_time.tv_sec = 0; + exid->nii_time.tv_nsec = 0; + dprintk("nfsd4_exchange_id seqid %d flags %x\n", conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); status = nfs_ok; @@ -3651,6 +3700,14 @@ out_nolock: return status; } +void +nfsd4_exchange_id_release(union nfsd4_op_u *u) +{ + struct nfsd4_exchange_id *exid = &u->exchange_id; + + kfree(exid->server_impl_name); +} + static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, bool slot_inuse) { /* The slot is in use, and no response has been sent. */ @@ -3884,6 +3941,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, cr_ses->flags &= ~SESSION4_PERSIST; /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; + /* Report the correct number of backchannel slots */ + cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1; init_session(rqstp, new, conf, cr_ses); nfsd4_get_session_locked(new); @@ -3904,7 +3963,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; out_expired_error: - old = NULL; /* * Revert the slot seq_nr change so the server will process * the client's resend instead of returning a cached response. 
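/*
 * Illustrative only, not part of the patch: nfsd4_exchange_id() in the hunk
 * above now builds a server implementation name from the running kernel's
 * utsname fields and clamps the advertised nii_name length. A userspace
 * approximation using uname(2); the 1024-byte limit is a stand-in for
 * NFS4_OPAQUE_LIMIT:
 */
#include <stdio.h>
#include <string.h>
#include <sys/utsname.h>

#define FAKE_OPAQUE_LIMIT 1024   /* placeholder for NFS4_OPAQUE_LIMIT */

int main(void)
{
	struct utsname u;
	char impl[512];
	size_t nii_name_len;

	if (uname(&u) != 0)
		return 1;
	snprintf(impl, sizeof(impl), "%s %s %s %s",
		 u.sysname, u.release, u.version, u.machine);

	nii_name_len = strlen(impl);
	if (nii_name_len > FAKE_OPAQUE_LIMIT)    /* clamp, don't fail */
		nii_name_len = FAKE_OPAQUE_LIMIT;

	printf("nii_name (%zu bytes): %.*s\n", nii_name_len,
	       (int)nii_name_len, impl);
	return 0;
}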
@@ -3919,8 +3977,6 @@ out_cache_error: out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); - if (old) - expire_client(old); out_free_session: __free_session(new); out_release_drc_mem: @@ -4948,6 +5004,12 @@ retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); + if (nfs4_openowner_unhashed(oo)) { + mutex_unlock(&stp->st_mutex); + stp = NULL; + goto out_unlock; + } + retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; @@ -5930,7 +5992,7 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, path.dentry = file_dentry(nf->nf_file); rc = vfs_getattr(&path, stat, - (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), AT_STATX_SYNC_AS_STAT); nfsd_file_put(nf); @@ -6014,8 +6076,7 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, } open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; - dp->dl_cb_fattr.ncf_initial_cinfo = - nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry)); + dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; @@ -6100,6 +6161,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (!stp) { stp = init_open_stateid(fp, open); + if (!stp) { + status = nfserr_jukebox; + goto out; + } + if (!open->op_stp) new_stp = true; } @@ -6535,6 +6601,7 @@ nfs4_laundromat(struct nfsd_net *nn) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); + nfsd4_async_copy_reaper(nn); nfs4_get_client_reaplist(nn, &reaplist, <); nfs4_process_client_reaplist(&reaplist); @@ -6545,6 +6612,7 @@ nfs4_laundromat(struct nfsd_net *nn) dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; + refcount_inc(&dp->dl_stid.sc_count); unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } @@ -7154,9 +7222,12 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (s->sc_type) { case SC_TYPE_DELEG: if (s->sc_status & SC_STATUS_REVOKED) { + s->sc_status |= SC_STATUS_CLOSED; spin_unlock(&s->sc_lock); dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); + if (s->sc_status & SC_STATUS_FREEABLE) + list_del_init(&dp->dl_recall_lru); + s->sc_status |= SC_STATUS_FREED; spin_unlock(&cl->cl_lock); nfs4_put_stid(s); ret = nfs_ok; @@ -7486,7 +7557,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, + SC_STATUS_REVOKED | SC_STATUS_FREEABLE, + &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7910,11 +7983,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; - if ((status = fh_verify(rqstp, &cstate->current_fh, - S_IFREG, NFSD_MAY_LOCK))) { - dprintk("NFSD: nfsd4_lock: permission denied!\n"); + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status != nfs_ok) return status; - } sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { @@ -7968,9 +8039,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = 
lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -7981,9 +8049,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -8003,15 +8068,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - /* - * Most filesystems with their own ->lock operations will block - * the nfsd thread waiting to acquire the lock. That leads to - * deadlocks (we don't want every nfsd thread tied up waiting - * for file locks), so don't attempt blocking lock notifications - * on those filesystems: - */ - if (!exportfs_lock_op_is_async(sb->s_export_op)) - flags &= ~FL_SLEEP; + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { @@ -8683,7 +8743,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); shrinker_free(nn->nfsd_client_shrinker); - cancel_work(&nn->nfsd_shrinker_work); + cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -8832,8 +8892,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @dentry: dentry of inode to be checked for a conflict - * @modified: return true if file was modified - * @size: new size of file if modified is true + * @pdp: returned WRITE delegation, if one was found * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server @@ -8843,11 +8902,12 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * 18.7.4. * * Returns 0 if there is no conflict; otherwise an nfs_stat - * code is returned. + * code is returned. If @pdp is set to a non-NULL value, then the + * caller must put the reference. 
*/ __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, - bool *modified, u64 *size) + struct nfs4_delegation **pdp) { __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -8858,10 +8918,9 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_cb_fattr *ncf; struct inode *inode = d_inode(dentry); - *modified = false; ctx = locks_inode_context(inode); if (!ctx) - return 0; + return nfs_ok; #define NON_NFSD_LEASE ((void *)1) @@ -8927,10 +8986,10 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, goto out_status; } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; - *size = ncf->ncf_cur_fsize; - *modified = true; + *pdp = dp; + return nfs_ok; } - status = 0; + status = nfs_ok; out_status: nfs4_put_stid(&dp->dl_stid); return status; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f118921250c3..53fac037611c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2652,13 +2652,10 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, strlen = end - str; if (strlen) { - p = xdr_reserve_space(xdr, strlen + 4); - if (!p) + if (xdr_stream_encode_opaque(xdr, str, strlen) < 0) return nfserr_resource; - p = xdr_encode_opaque(p, str, strlen); count++; - } - else + } else end++; if (found_esc) end = next; @@ -2699,7 +2696,6 @@ static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr, const struct path *path) { struct path cur = *path; - __be32 *p; struct dentry **components = NULL; unsigned int ncomponents = 0; __be32 err = nfserr_jukebox; @@ -2730,24 +2726,19 @@ static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr, components[ncomponents++] = cur.dentry; cur.dentry = dget_parent(cur.dentry); } + err = nfserr_resource; - p = xdr_reserve_space(xdr, 4); - if (!p) + if (xdr_stream_encode_u32(xdr, ncomponents) != XDR_UNIT) goto out_free; - *p++ = cpu_to_be32(ncomponents); - while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; - unsigned int len; spin_lock(&dentry->d_lock); - len = dentry->d_name.len; - p = xdr_reserve_space(xdr, len + 4); - if (!p) { + if (xdr_stream_encode_opaque(xdr, dentry->d_name.name, + dentry->d_name.len) < 0) { spin_unlock(&dentry->d_lock); goto out_free; } - p = xdr_encode_opaque(p, dentry->d_name.name, len); dprintk("/%pd", dentry); spin_unlock(&dentry->d_lock); dput(dentry); @@ -2928,7 +2919,6 @@ struct nfsd4_fattr_args { struct kstat stat; struct kstatfs statfs; struct nfs4_acl *acl; - u64 size; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL void *context; int contextlen; @@ -3040,14 +3030,14 @@ static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, return nfs_ok; } - c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry)); + c = nfsd4_change_attribute(&args->stat); return nfsd4_encode_changeid4(xdr, c); } static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) { - return nfsd4_encode_uint64_t(xdr, args->size); + return nfsd4_encode_uint64_t(xdr, args->stat.size); } static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr, @@ -3512,6 +3502,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, int ignore_crossmnt) { DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + struct nfs4_delegation *dp = NULL; struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; int starting_len = xdr->buf->len; @@ -3526,8 +3517,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, .dentry = dentry, }; unsigned 
long bit; - bool file_modified = false; - u64 size = 0; WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); @@ -3555,10 +3544,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; } - args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, dentry, - &file_modified, &size); + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &dp); if (status) goto out; } @@ -3566,12 +3553,16 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, err = vfs_getattr(&path, &args.stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); + if (dp) { + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; + + if (ncf->ncf_file_modified) + args.stat.size = ncf->ncf_cur_fsize; + + nfs4_put_stid(&dp->dl_stid); + } if (err) goto out_nfserr; - if (file_modified) - args.size = size; - else - args.size = args.stat.size; if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ @@ -3767,7 +3758,7 @@ nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name, nfserr = nfserrno(err); goto out_put; } - nfserr = check_nfsd_access(exp, cd->rd_rqstp); + nfserr = check_nfsd_access(exp, cd->rd_rqstp, false); if (nfserr) goto out_put; @@ -4826,6 +4817,25 @@ nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp) } static __be32 +nfsd4_encode_nfs_impl_id4(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid) +{ + __be32 status; + + /* nii_domain */ + status = nfsd4_encode_opaque(xdr, exid->nii_domain.data, + exid->nii_domain.len); + if (status != nfs_ok) + return status; + /* nii_name */ + status = nfsd4_encode_opaque(xdr, exid->nii_name.data, + exid->nii_name.len); + if (status != nfs_ok) + return status; + /* nii_time */ + return nfsd4_encode_nfstime4(xdr, &exid->nii_time); +} + +static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { @@ -4859,8 +4869,11 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr != nfs_ok) return nfserr; /* eir_server_impl_id<1> */ - if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) return nfserr_resource; + nfserr = nfsd4_encode_nfs_impl_id4(xdr, exid); + if (nfserr != nfs_ok) + return nfserr; return nfs_ok; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 40ad58a6a036..98d6459724a7 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -222,7 +222,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); put_cred(override_creds(new)); - put_cred(new); } else { error = nfsd_setuser_and_check_port(rqstp, cred, exp); if (error) @@ -320,6 +319,7 @@ __fh_verify(struct svc_rqst *rqstp, { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_export *exp = NULL; + bool may_bypass_gss = false; struct dentry *dentry; __be32 error; @@ -362,13 +362,12 @@ __fh_verify(struct svc_rqst *rqstp, if (error) goto out; - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. 
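/*
 * Illustrative only, not part of the patch: the __fh_verify() hunk here replaces
 * the old "skip the pseudoflavor check entirely" jump with a may_bypass_gss hint
 * that check_nfsd_access() honours only for AUTH_NULL/AUTH_UNIX requests on
 * exports that do configure some real flavor. A userspace sketch of the
 * resulting decision (flavor names and flag values are invented):
 */
#include <stdbool.h>
#include <stdio.h>

enum fake_flavor { FAKE_AUTH_NULL, FAKE_AUTH_UNIX, FAKE_AUTH_GSS };

static bool fake_access_ok(enum fake_flavor used, bool export_lists_flavor,
			   bool export_has_strong_flavor, bool may_bypass_gss)
{
	if (export_lists_flavor)              /* normal case: flavor is allowed */
		return true;
	/* relaxed case: an unauthenticated call, on an export that does require
	 * some real flavor, where the caller said bypassing is acceptable */
	if (may_bypass_gss &&
	    (used == FAKE_AUTH_NULL || used == FAKE_AUTH_UNIX) &&
	    export_has_strong_flavor)
		return true;
	return false;                          /* otherwise: wrongsec */
}

int main(void)
{
	printf("%d %d\n",
	       fake_access_ok(FAKE_AUTH_UNIX, false, true, true),   /* 1 */
	       fake_access_ok(FAKE_AUTH_UNIX, false, true, false)); /* 0 */
	return 0;
}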
- */ - if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) - goto skip_pseudoflavor_check; + if ((access & NFSD_MAY_NLM) && (exp->ex_flags & NFSEXP_NOAUTHNLM)) + /* NLM is allowed to fully bypass authentication */ + goto out; + + if (access & NFSD_MAY_BYPASS_GSS) + may_bypass_gss = true; /* * Clients may expect to be able to use auth_sys during mount, * even if they use gss for everything else; see section 2.3.2 @@ -376,13 +375,12 @@ __fh_verify(struct svc_rqst *rqstp, */ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT && exp->ex_path.dentry == dentry) - goto skip_pseudoflavor_check; + may_bypass_gss = true; - error = check_nfsd_access(exp, rqstp); + error = check_nfsd_access(exp, rqstp, may_bypass_gss); if (error) goto out; -skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); out: @@ -667,20 +665,18 @@ out_negative: __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - struct inode *inode; struct kstat stat; __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) return nfs_ok; - inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) return err; if (v4) - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); + fhp->fh_pre_change = nfsd4_change_attribute(&stat); fhp->fh_pre_mtime = stat.mtime; fhp->fh_pre_ctime = stat.ctime; @@ -697,7 +693,6 @@ __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) __be32 fh_fill_post_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; if (fhp->fh_no_wcc) @@ -713,7 +708,7 @@ __be32 fh_fill_post_attrs(struct svc_fh *fhp) fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = - nfsd4_change_attribute(&fhp->fh_post_attr, inode); + nfsd4_change_attribute(&fhp->fh_post_attr); return nfs_ok; } @@ -770,7 +765,7 @@ char * SVCFH_fmt(struct svc_fh *fhp) struct knfsd_fh *fh = &fhp->fh_handle; static char buf[2+1+1+64*3+1]; - if (fh->fh_size < 0 || fh->fh_size> 64) + if (fh->fh_size > 64) return "bad-fh"; sprintf(buf, "%d: %*ph", fh->fh_size, fh->fh_size, fh->fh_raw); return buf; @@ -804,7 +799,14 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) return FSIDSOURCE_DEV; } -/* +/** + * nfsd4_change_attribute - Generate an NFSv4 change_attribute value + * @stat: inode attributes + * + * Caller must fill in @stat before calling, typically by invoking + * vfs_getattr() with STATX_MODE, STATX_CTIME, and STATX_CHANGE_COOKIE. + * Returns an unsigned 64-bit changeid4 value (RFC 8881 Section 3.2). + * * We could use i_version alone as the change attribute. However, i_version * can go backwards on a regular file after an unclean shutdown. On its own * that doesn't necessarily cause a problem, but if i_version goes backwards @@ -821,13 +823,13 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) * assume that the new change attr is always logged to stable storage in some * fashion before the results can be seen. 
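/*
 * Illustrative only, not part of the patch: with the inode argument removed,
 * nfsd4_change_attribute() above derives everything from the kstat, using
 * stat->mode to detect regular files. A userspace model of the branch visible
 * in the hunk (folding ctime into a non-monotonic change cookie); the struct
 * and field names are invented stand-ins:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct fake_stat {
	uint64_t change_cookie;
	bool     is_reg;           /* S_ISREG(stat->mode) */
	bool     cookie_monotonic; /* STATX_ATTR_CHANGE_MONOTONIC */
	int64_t  ctime_sec;
	int64_t  ctime_nsec;
};

static uint64_t fake_change_attribute(const struct fake_stat *st)
{
	uint64_t chattr = st->change_cookie;

	if (st->is_reg && !st->cookie_monotonic) {
		/* fold ctime in so a rolled-back i_version still moves forward */
		chattr += (uint64_t)st->ctime_sec << 30;
		chattr += (uint64_t)st->ctime_nsec;
	}
	return chattr;
}

int main(void)
{
	struct fake_stat st = {
		.change_cookie = 7, .is_reg = true, .cookie_monotonic = false,
		.ctime_sec = 1700000000, .ctime_nsec = 123,
	};

	printf("changeid4 = %llu\n",
	       (unsigned long long)fake_change_attribute(&st));
	return 0;
}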
*/ -u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode) +u64 nfsd4_change_attribute(const struct kstat *stat) { u64 chattr; if (stat->result_mask & STATX_CHANGE_COOKIE) { chattr = stat->change_cookie; - if (S_ISREG(inode->i_mode) && + if (S_ISREG(stat->mode) && !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { chattr += (u64)stat->ctime.tv_sec << 30; chattr += stat->ctime.tv_nsec; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 5b7394801dc4..876152a91f12 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -297,8 +297,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -u64 nfsd4_change_attribute(const struct kstat *stat, - const struct inode *inode); +u64 nfsd4_change_attribute(const struct kstat *stat); __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); __be32 fh_fill_post_attrs(struct svc_fh *fhp); __be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e236135ddc63..49e2f32102ab 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -214,14 +214,14 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change return 0; } -bool nfsd_serv_try_get(struct net *net) +bool nfsd_serv_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref)); } -void nfsd_serv_put(struct net *net) +void nfsd_serv_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -434,6 +434,9 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + if (!nn->nfsd_net_up) + return; + nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); @@ -549,11 +552,8 @@ void nfsd_destroy_serv(struct net *net) * other initialization has been done except the rpcb information. */ svc_rpcb_cleanup(serv, net); - if (!nn->nfsd_net_up) - return; nfsd_shutdown_net(net); - nfsd_export_flush(net); svc_destroy(&serv); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 79c743c01a47..e16bb3717fb9 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -71,8 +71,8 @@ struct nfsd4_callback { struct work_struct cb_work; int cb_seq_status; int cb_status; + int cb_held_slot; bool cb_need_restart; - bool cb_holds_slot; }; struct nfsd4_callback_ops { @@ -114,6 +114,8 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define SC_STATUS_REVOKED BIT(1) #define SC_STATUS_ADMIN_REVOKED BIT(2) +#define SC_STATUS_FREEABLE BIT(3) +#define SC_STATUS_FREED BIT(4) unsigned short sc_status; struct list_head sc_cp_list; @@ -135,10 +137,24 @@ struct nfs4_cpntf_state { time64_t cpntf_time; /* last time stateid used */ }; +/* + * RFC 7862 Section 4.8 states: + * + * | A copy offload stateid will be valid until either (A) the client + * | or server restarts or (B) the client returns the resource by + * | issuing an OFFLOAD_CANCEL operation or the client replies to a + * | CB_OFFLOAD operation. + * + * Because a client might not reply to a CB_OFFLOAD, or a reply + * might get lost due to connection loss, NFSD purges async copy + * state after a short period to prevent it from accumulating + * over time. 
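/*
 * Illustrative only, not part of the patch: the comment above explains why
 * completed async-copy state is purged after a while instead of being kept
 * until the client answers CB_OFFLOAD. One simple way to realize such a purge
 * is a per-entry TTL that each laundromat pass decrements, freeing the entry
 * once it runs out; the patch's exact bookkeeping differs in detail. Userspace
 * sketch (all names invented, the constant stands in for NFSD_COPY_INITIAL_TTL):
 */
#include <stdio.h>

#define FAKE_INITIAL_TTL 10

struct fake_copy { int done; int ttl; };

/* returns 1 when this pass should free the entry */
static int fake_reaper_pass(struct fake_copy *c)
{
	if (!c->done)
		return 0;            /* ongoing copies are never purged */
	if (c->ttl > 0)
		c->ttl--;
	return c->ttl == 0;
}

int main(void)
{
	struct fake_copy c = { .done = 1, .ttl = FAKE_INITIAL_TTL };
	int passes = 1;

	while (!fake_reaper_pass(&c))
		passes++;
	printf("purged on pass %d\n", passes);
	return 0;
}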
+ */ +#define NFSD_COPY_INITIAL_TTL 10 + struct nfs4_cb_fattr { struct nfsd4_callback ncf_getattr; u32 ncf_cb_status; - u32 ncf_cb_bmap[1]; /* from CB_GETATTR reply */ u64 ncf_cb_change; @@ -288,6 +304,9 @@ struct nfsd4_conn { unsigned char cn_flags; }; +/* Maximum number of slots that nfsd will use in the backchannel */ +#define NFSD_BC_SLOT_TABLE_SIZE (sizeof(u32) * 8) + /* * Representation of a v4.1+ session. These are refcounted in a similar fashion * to the nfs4_client. References are only taken when the server is actively @@ -295,19 +314,19 @@ struct nfsd4_conn { */ struct nfsd4_session { atomic_t se_ref; + spinlock_t se_lock; + u32 se_cb_slot_avail; /* bitmap of available slots */ + u32 se_cb_highest_slot; /* highest slot client wants */ + u32 se_cb_prog; + bool se_dead; struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; -/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ -#define NFS4_SESSION_DEAD 0x010 - u32 se_flags; struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; - struct nfsd4_channel_attrs se_bchannel; struct nfsd4_cb_sec se_cb_sec; struct list_head se_conns; - u32 se_cb_prog; - u32 se_cb_seq_nr; + u32 se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE]; struct nfsd4_slot *se_slots[]; /* forward channel slots */ }; @@ -441,9 +460,6 @@ struct nfs4_client { */ struct dentry *cl_nfsd_info_dentry; - /* for nfs41 callbacks */ - /* We currently support a single back channel with a single slot */ - unsigned long cl_cb_slot_busy; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ struct net *net; @@ -740,6 +756,8 @@ extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, extern bool nfsd4_run_cb(struct nfsd4_callback *cb); extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); +void nfsd4_async_copy_reaper(struct nfsd_net *nn); +bool nfsd4_has_active_async_copies(struct nfs4_client *clp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); @@ -782,5 +800,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct dentry *dentry, bool *file_modified, u64 *size); + struct dentry *dentry, struct nfs4_delegation **pdp); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index c625966cfcf3..696c89f68a9e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -79,7 +79,7 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); { NFSD_MAY_READ, "READ" }, \ { NFSD_MAY_SATTR, "SATTR" }, \ { NFSD_MAY_TRUNC, "TRUNC" }, \ - { NFSD_MAY_LOCK, "LOCK" }, \ + { NFSD_MAY_NLM, "NLM" }, \ { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \ { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \ { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \ @@ -163,7 +163,7 @@ TRACE_EVENT(nfsd_compound_decode_err, __entry->opnum, __entry->status) ); -TRACE_EVENT(nfsd_compound_encode_err, +DECLARE_EVENT_CLASS(nfsd_compound_err_class, TP_PROTO( const struct svc_rqst *rqstp, u32 opnum, @@ -184,6 +184,18 @@ TRACE_EVENT(nfsd_compound_encode_err, __entry->opnum, __entry->status) ); +#define DEFINE_NFSD_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_compound_err_class, nfsd_compound_##name##_err, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + u32 opnum, \ 
+ __be32 status \ + ), \ + TP_ARGS(rqstp, opnum, status)) + +DEFINE_NFSD_COMPOUND_ERR_EVENT(op); +DEFINE_NFSD_COMPOUND_ERR_EVENT(encode); + #define show_fs_file_type(x) \ __print_symbolic(x, \ { S_IFLNK, "LNK" }, \ @@ -1113,7 +1125,7 @@ TRACE_EVENT(nfsd_file_acquire, ), TP_fast_assign( - __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->xid = rqstp ? be32_to_cpu(rqstp->rq_xid) : 0; __entry->inode = inode; __entry->may_flags = may_flags; __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0; @@ -1147,7 +1159,7 @@ TRACE_EVENT(nfsd_file_insert_err, __field(long, error) ), TP_fast_assign( - __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->xid = rqstp ? be32_to_cpu(rqstp->rq_xid) : 0; __entry->inode = inode; __entry->may_flags = may_flags; __entry->error = error; @@ -1177,7 +1189,7 @@ TRACE_EVENT(nfsd_file_cons_err, __field(const void *, nf_file) ), TP_fast_assign( - __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->xid = rqstp ? be32_to_cpu(rqstp->rq_xid) : 0; __entry->inode = inode; __entry->may_flags = may_flags; __entry->nf_ref = refcount_read(&nf->nf_ref); @@ -1685,7 +1697,7 @@ TRACE_EVENT(nfsd_cb_free_slot, __entry->cl_id = sid->clientid.cl_id; __entry->seqno = sid->sequence; __entry->reserved = sid->reserved; - __entry->slot_seqno = session->se_cb_seq_nr; + __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot]; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u", @@ -2232,7 +2244,7 @@ TRACE_EVENT(nfsd_copy_done, ) ); -TRACE_EVENT(nfsd_copy_async_done, +DECLARE_EVENT_CLASS(nfsd_copy_async_done_class, TP_PROTO( const struct nfsd4_copy *copy ), @@ -2301,6 +2313,15 @@ TRACE_EVENT(nfsd_copy_async_done, ) ); +#define DEFINE_COPY_ASYNC_DONE_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_async_done_class, \ + nfsd_copy_async_##name, \ + TP_PROTO(const struct nfsd4_copy *copy), \ + TP_ARGS(copy)) + +DEFINE_COPY_ASYNC_DONE_EVENT(done); +DEFINE_COPY_ASYNC_DONE_EVENT(cancel); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 22325b590e17..29cb7b812d71 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -35,7 +35,6 @@ #include "xdr3.h" #ifdef CONFIG_NFSD_V4 -#include "../internal.h" #include "acl.h" #include "idmap.h" #include "xdr4.h" @@ -321,7 +320,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; - err = check_nfsd_access(exp, rqstp); + err = check_nfsd_access(exp, rqstp, false); if (err) goto out; /* @@ -861,8 +860,7 @@ int nfsd_open_break_lease(struct inode *inode, int access) * N.B. 
After this call fhp needs an fh_put */ static int -__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int may_flags, struct file **filp) +__nfsd_open(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; struct inode *inode; @@ -903,11 +901,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, goto out; } - if (may_flags & NFSD_MAY_64BIT_COOKIE) - file->f_mode |= FMODE_64BITHASH; - else - file->f_mode |= FMODE_32BITHASH; - *filp = file; out: return host_err; @@ -937,7 +930,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, retry: err = fh_verify(rqstp, fhp, type, may_flags); if (!err) { - host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + host_err = __nfsd_open(fhp, type, may_flags, filp); if (host_err == -EOPENSTALE && !retried) { retried = true; fh_put(fhp); @@ -950,7 +943,6 @@ retry: /** * nfsd_open_verified - Open a regular file for the filecache - * @rqstp: RPC request * @fhp: NFS filehandle of the file to open * @may_flags: internal permission flags * @filp: OUT: open "struct file *" @@ -958,10 +950,9 @@ retry: * Returns zero on success, or a negative errno value. */ int -nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, - struct file **filp) +nfsd_open_verified(struct svc_fh *fhp, int may_flags, struct file **filp) { - return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); + return __nfsd_open(fhp, S_IFREG, may_flags, filp); } /* @@ -2174,13 +2165,15 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, loff_t offset = *offsetp; int may_flags = NFSD_MAY_READ; - if (fhp->fh_64bit_cookies) - may_flags |= NFSD_MAY_64BIT_COOKIE; - err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; + if (fhp->fh_64bit_cookies) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + offset = vfs_llseek(file, offset, SEEK_SET); if (offset < 0) { err = nfserrno((int)offset); @@ -2512,7 +2505,7 @@ nfsd_permission(struct svc_cred *cred, struct svc_export *exp, (acc & NFSD_MAY_EXEC)? " exec" : "", (acc & NFSD_MAY_SATTR)? " sattr" : "", (acc & NFSD_MAY_TRUNC)? " trunc" : "", - (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_NLM)? " nlm" : "", (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", @@ -2537,16 +2530,6 @@ nfsd_permission(struct svc_cred *cred, struct svc_export *exp, if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) return nfserr_perm; - if (acc & NFSD_MAY_LOCK) { - /* If we cannot rely on authentication in NLM requests, - * just allow locks, otherwise require read permission, or - * ownership - */ - if (exp->ex_flags & NFSEXP_NOAUTHNLM) - return 0; - else - acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; - } /* * The file owner always gets access permission for accesses that * would normally be checked at open time. 
This is to make diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3ff146522556..f9b09b842856 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -20,7 +20,7 @@ #define NFSD_MAY_READ 0x004 /* == MAY_READ */ #define NFSD_MAY_SATTR 0x008 #define NFSD_MAY_TRUNC 0x010 -#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_NLM 0x020 /* request is from lockd */ #define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ @@ -114,8 +114,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, - int may_flags, struct file **filp); +int nfsd_open_verified(struct svc_fh *fhp, int may_flags, + struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 2a21a7662e03..382cc1389396 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -567,6 +567,7 @@ struct nfsd4_exchange_id { struct xdr_netobj nii_domain; struct xdr_netobj nii_name; struct timespec64 nii_time; + char *server_impl_name; }; struct nfsd4_sequence { @@ -675,6 +676,7 @@ struct nfsd4_cb_offload { struct nfsd4_callback co_cb; struct nfsd42_write_res co_res; __be32 co_nfserr; + unsigned int co_retries; struct knfsd_fh co_fh; }; @@ -693,12 +695,16 @@ struct nfsd4_copy { #define NFSD4_COPY_F_SYNCHRONOUS (2) #define NFSD4_COPY_F_COMMITTED (3) #define NFSD4_COPY_F_COMPLETED (4) +#define NFSD4_COPY_F_OFFLOAD_DONE (5) /* response */ __be32 nfserr; struct nfsd42_write_res cp_res; struct knfsd_fh fh; + /* offload callback */ + struct nfsd4_cb_offload cp_cb_offload; + struct nfs4_client *cp_clp; struct nfsd_file *nf_src; @@ -709,6 +715,7 @@ struct nfsd4_copy { struct list_head copies; struct task_struct *copy_task; refcount_t refcount; + unsigned int cp_ttl; struct nfsd4_ssc_umount_item *ss_nsui; struct nfs_fh c_fh; @@ -930,6 +937,7 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +void nfsd4_exchange_id_release(union nfsd4_op_u *u); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index ba50388ee4bf..ba3e1f591f36 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -177,12 +177,14 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block * @inode: inode of metadata file * @bh: buffer head of the buffer to be initialized - * @kaddr: kernel address mapped for the page including the buffer + * @from: kernel address mapped for a chunk of the block + * + * This function does not yet support the case where block size > PAGE_SIZE. 
*/ static void nilfs_palloc_desc_block_init(struct inode *inode, - struct buffer_head *bh, void *kaddr) + struct buffer_head *bh, void *from) { - struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + struct nilfs_palloc_group_desc *desc = from; unsigned long n = nilfs_palloc_groups_per_desc_block(inode); __le32 nfrees; @@ -337,38 +339,55 @@ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) } /** - * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor + * nilfs_palloc_group_desc_offset - calculate the byte offset of a group + * descriptor in the folio containing it * @inode: inode of metadata file using this allocator * @group: group number - * @bh: buffer head of the buffer storing the group descriptor block - * @kaddr: kernel address mapped for the page including the buffer + * @bh: buffer head of the group descriptor block + * + * Return: Byte offset in the folio of the group descriptor for @group. */ -static struct nilfs_palloc_group_desc * -nilfs_palloc_block_get_group_desc(const struct inode *inode, - unsigned long group, - const struct buffer_head *bh, void *kaddr) +static size_t nilfs_palloc_group_desc_offset(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh) { - return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + - group % nilfs_palloc_groups_per_desc_block(inode); + return offset_in_folio(bh->b_folio, bh->b_data) + + sizeof(struct nilfs_palloc_group_desc) * + (group % nilfs_palloc_groups_per_desc_block(inode)); +} + +/** + * nilfs_palloc_bitmap_offset - calculate the byte offset of a bitmap block + * in the folio containing it + * @bh: buffer head of the bitmap block + * + * Return: Byte offset in the folio of the bitmap block for @bh. + */ +static size_t nilfs_palloc_bitmap_offset(const struct buffer_head *bh) +{ + return offset_in_folio(bh->b_folio, bh->b_data); } /** - * nilfs_palloc_block_get_entry - get kernel address of an entry + * nilfs_palloc_entry_offset - calculate the byte offset of an entry in the + * folio containing it * @inode: inode of metadata file using this allocator - * @nr: serial number of the entry (e.g. inode number) - * @bh: buffer head of the buffer storing the entry block - * @kaddr: kernel address mapped for the page including the buffer + * @nr: serial number of the entry (e.g. inode number) + * @bh: buffer head of the entry block + * + * Return: Byte offset in the folio of the entry @nr. 
*/ -void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, - const struct buffer_head *bh, void *kaddr) +size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, + const struct buffer_head *bh) { - unsigned long entry_offset, group_offset; + unsigned long entry_index_in_group, entry_index_in_block; - nilfs_palloc_group(inode, nr, &group_offset); - entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + nilfs_palloc_group(inode, nr, &entry_index_in_group); + entry_index_in_block = entry_index_in_group % + NILFS_MDT(inode)->mi_entries_per_block; - return kaddr + bh_offset(bh) + - entry_offset * NILFS_MDT(inode)->mi_entry_size; + return offset_in_folio(bh->b_folio, bh->b_data) + + entry_index_in_block * NILFS_MDT(inode)->mi_entry_size; } /** @@ -506,7 +525,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group; @@ -529,17 +548,17 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + + doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); + desc = kmap_local_folio(desc_bh->b_folio, doff); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); - for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { + for (j = 0; j < n; j++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); - if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) + if (nilfs_palloc_group_desc_nfrees(&desc[j], lock) == 0) continue; - kunmap_local(desc_kaddr); + kunmap_local(desc); ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (unlikely(ret < 0)) { @@ -547,12 +566,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, return ret; } - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + /* + * Re-kmap the folio containing the first (and + * subsequent) group descriptors. + */ + desc = kmap_local_folio(desc_bh->b_folio, doff); - bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + boff = nilfs_palloc_bitmap_offset(bitmap_bh); + bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); pos = nilfs_palloc_find_available_slot( bitmap, group_offset, entries_per_group, lock, wrap); @@ -562,14 +583,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, * beginning, the wrap flag only has an effect on the * first search. 
*/ - kunmap_local(bitmap_kaddr); + kunmap_local(bitmap); if (pos >= 0) goto found; brelse(bitmap_bh); } - kunmap_local(desc_kaddr); + kunmap_local(desc); brelse(desc_bh); } @@ -578,9 +599,9 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, found: /* found a free entry */ - nilfs_palloc_group_desc_add_entries(desc, lock, -1); + nilfs_palloc_group_desc_add_entries(&desc[j], lock, -1); req->pr_entry_nr = entries_per_group * group + pos; - kunmap_local(desc_kaddr); + kunmap_local(desc); req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; @@ -611,18 +632,18 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode, void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { - struct nilfs_palloc_group_desc *desc; unsigned long group, group_offset; + size_t doff, boff; + struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc(inode, group, - req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); + desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); + + boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); + bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) @@ -633,8 +654,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap_local(bitmap_kaddr); - kunmap_local(desc_kaddr); + kunmap_local(bitmap); + kunmap_local(desc); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); @@ -653,17 +674,17 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned char *bitmap; unsigned long group, group_offset; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc(inode, group, - req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); + doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); + desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); + + boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); + bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) @@ -674,8 +695,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap_local(bitmap_kaddr); - kunmap_local(desc_kaddr); + kunmap_local(bitmap); + kunmap_local(desc); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); @@ -739,7 +760,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; - void *desc_kaddr, *bitmap_kaddr; + size_t doff, boff; unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = 
nilfs_palloc_entries_per_group(inode); @@ -767,8 +788,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + boff = nilfs_palloc_bitmap_offset(bitmap_bh); + bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); j = i; @@ -813,7 +834,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) entry_start = rounddown(group_offset, epb); } while (true); - kunmap_local(bitmap_kaddr); + kunmap_local(bitmap); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); @@ -827,11 +848,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode->i_ino); } - desc_kaddr = kmap_local_page(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); + doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); + desc = kmap_local_folio(desc_bh->b_folio, doff); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); - kunmap_local(desc_kaddr); + kunmap_local(desc); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index e19d7eb10084..3f115ab7e9a7 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -31,8 +31,8 @@ nilfs_palloc_entries_per_group(const struct inode *inode) int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); -void *nilfs_palloc_block_get_entry(const struct inode *, __u64, - const struct buffer_head *, void *); +size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, + const struct buffer_head *bh); int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 57b4af5ad646..54a3fa0cf67e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,6 +35,7 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode) ii->i_flags = 0; memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); + btnc_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; } void nilfs_btnode_cache_clear(struct address_space *btnc) @@ -68,7 +69,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -133,7 +133,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index f0ce37552446..c20207d7a989 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -68,54 +68,41 @@ static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, static unsigned int nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, - void *kaddr, unsigned int n) { - struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + struct nilfs_checkpoint *cp; unsigned int count; + cp = kmap_local_folio(bh->b_folio, + offset_in_folio(bh->b_folio, bh->b_data)); count = le32_to_cpu(cp->cp_checkpoints_count) + n; cp->cp_checkpoints_count = 
cpu_to_le32(count); + kunmap_local(cp); return count; } static unsigned int nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, - void *kaddr, unsigned int n) { - struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + struct nilfs_checkpoint *cp; unsigned int count; + cp = kmap_local_folio(bh->b_folio, + offset_in_folio(bh->b_folio, bh->b_data)); WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); count = le32_to_cpu(cp->cp_checkpoints_count) - n; cp->cp_checkpoints_count = cpu_to_le32(count); + kunmap_local(cp); return count; } -static inline struct nilfs_cpfile_header * -nilfs_cpfile_block_get_header(const struct inode *cpfile, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + bh_offset(bh); -} - -static struct nilfs_checkpoint * -nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, - struct buffer_head *bh, - void *kaddr) -{ - return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * - NILFS_MDT(cpfile)->mi_entry_size; -} - static void nilfs_cpfile_block_init(struct inode *cpfile, struct buffer_head *bh, - void *kaddr) + void *from) { - struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + struct nilfs_checkpoint *cp = from; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; int n = nilfs_cpfile_checkpoints_per_block(cpfile); @@ -125,6 +112,54 @@ static void nilfs_cpfile_block_init(struct inode *cpfile, } } +/** + * nilfs_cpfile_checkpoint_offset - calculate the byte offset of a checkpoint + * entry in the folio containing it + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @bh: buffer head of block containing checkpoint indexed by @cno + * + * Return: Byte offset in the folio of the checkpoint specified by @cno. + */ +static size_t nilfs_cpfile_checkpoint_offset(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh) +{ + return offset_in_folio(bh->b_folio, bh->b_data) + + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} + +/** + * nilfs_cpfile_cp_snapshot_list_offset - calculate the byte offset of a + * checkpoint snapshot list in the folio + * containing it + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @bh: buffer head of block containing checkpoint indexed by @cno + * + * Return: Byte offset in the folio of the checkpoint snapshot list specified + * by @cno. 
+ */ +static size_t nilfs_cpfile_cp_snapshot_list_offset(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh) +{ + return nilfs_cpfile_checkpoint_offset(cpfile, cno, bh) + + offsetof(struct nilfs_checkpoint, cp_snapshot_list); +} + +/** + * nilfs_cpfile_ch_snapshot_list_offset - calculate the byte offset of the + * snapshot list in the header + * + * Return: Byte offset in the folio of the checkpoint snapshot list + */ +static size_t nilfs_cpfile_ch_snapshot_list_offset(void) +{ + return offsetof(struct nilfs_cpfile_header, ch_snapshot_list); +} + static int nilfs_cpfile_get_header_block(struct inode *cpfile, struct buffer_head **bhp) { @@ -214,7 +249,7 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) @@ -228,8 +263,8 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, goto out_sem; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -EINVAL; goto put_cp; @@ -254,7 +289,7 @@ int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, root->ifile = ifile; put_cp: - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); out_sem: up_read(&NILFS_MDT(cpfile)->mi_sem); @@ -282,7 +317,7 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) @@ -297,24 +332,22 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) if (unlikely(ret < 0)) goto out_header; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { /* a newly-created checkpoint */ nilfs_checkpoint_clear_invalid(cp); + kunmap_local(cp); if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, - kaddr, 1); - kunmap_local(kaddr); + 1); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, 1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); } else { - kunmap_local(kaddr); + kunmap_local(cp); } /* Force the buffer and the inode to become dirty */ @@ -353,7 +386,7 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, { struct buffer_head *cp_bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; if (WARN_ON_ONCE(cno < 1)) @@ -367,10 +400,10 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, goto out_sem; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (unlikely(nilfs_checkpoint_invalid(cp))) { - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); goto error; } @@ -391,7 +424,7 @@ int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, 
nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); - kunmap_local(kaddr); + kunmap_local(cp); brelse(cp_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); @@ -432,6 +465,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, struct nilfs_checkpoint *cp; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cno; + size_t offset; void *kaddr; unsigned long tnicps; int ret, ncps, nicps, nss, count, i; @@ -462,9 +496,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; } - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint( - cpfile, cno, cp_bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kaddr = kmap_local_folio(cp_bh->b_folio, offset); nicps = 0; for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { if (nilfs_checkpoint_snapshot(cp)) { @@ -474,43 +507,42 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, nicps++; } } - if (nicps > 0) { - tnicps += nicps; - mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_dirty(cpfile); - if (!nilfs_cpfile_is_in_first(cpfile, cno)) { - count = - nilfs_cpfile_block_sub_valid_checkpoints( - cpfile, cp_bh, kaddr, nicps); - if (count == 0) { - /* make hole */ - kunmap_local(kaddr); - brelse(cp_bh); - ret = - nilfs_cpfile_delete_checkpoint_block( - cpfile, cno); - if (ret == 0) - continue; - nilfs_err(cpfile->i_sb, - "error %d deleting checkpoint block", - ret); - break; - } - } + kunmap_local(kaddr); + + if (nicps <= 0) { + brelse(cp_bh); + continue; } - kunmap_local(kaddr); + tnicps += nicps; + mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (nilfs_cpfile_is_in_first(cpfile, cno)) { + brelse(cp_bh); + continue; + } + + count = nilfs_cpfile_block_sub_valid_checkpoints(cpfile, cp_bh, + nicps); brelse(cp_bh); + if (count) + continue; + + /* Delete the block if there are no more valid checkpoints */ + ret = nilfs_cpfile_delete_checkpoint_block(cpfile, cno); + if (unlikely(ret)) { + nilfs_err(cpfile->i_sb, + "error %d deleting checkpoint block", ret); + break; + } } if (tnicps > 0) { - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, - kaddr); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); - kunmap_local(kaddr); + kunmap_local(header); } brelse(header_bh); @@ -544,6 +576,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, struct buffer_head *bh; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + size_t offset; void *kaddr; int n, ret; int ncps, i; @@ -562,8 +595,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - kaddr = kmap_local_page(bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); + cp = kaddr = kmap_local_folio(bh->b_folio, offset); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, @@ -597,7 +630,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, struct nilfs_cpinfo *ci = buf; __u64 curr = *cnop, next; unsigned long curr_blkoff, next_blkoff; - void *kaddr; + size_t offset; int n = 0, ret; 
down_read(&NILFS_MDT(cpfile)->mi_sem); @@ -606,10 +639,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; - kaddr = kmap_local_page(bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + header = kmap_local_folio(bh->b_folio, 0); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); - kunmap_local(kaddr); + kunmap_local(header); brelse(bh); if (curr == 0) { ret = 0; @@ -627,9 +659,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = 0; /* No snapshots (started from a hole block) */ goto out; } - kaddr = kmap_local_page(bh->b_page); + offset = nilfs_cpfile_checkpoint_offset(cpfile, curr, bh); + cp = kmap_local_folio(bh->b_folio, offset); while (n < nci) { - cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ if (unlikely(nilfs_checkpoint_invalid(cp) || !nilfs_checkpoint_snapshot(cp))) @@ -641,9 +673,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, if (next == 0) break; /* reach end of the snapshot list */ + kunmap_local(cp); next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { - kunmap_local(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); @@ -651,12 +683,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, WARN_ON(ret == -ENOENT); goto out; } - kaddr = kmap_local_page(bh->b_page); } + offset = nilfs_cpfile_checkpoint_offset(cpfile, next, bh); + cp = kmap_local_folio(bh->b_folio, offset); curr = next; curr_blkoff = next_blkoff; } - kunmap_local(kaddr); + kunmap_local(cp); brelse(bh); *cnop = curr; ret = n; @@ -733,26 +766,6 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); } -static struct nilfs_snapshot_list * -nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, - __u64 cno, - struct buffer_head *bh, - void *kaddr) -{ - struct nilfs_cpfile_header *header; - struct nilfs_checkpoint *cp; - struct nilfs_snapshot_list *list; - - if (cno != 0) { - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - list = &cp->cp_snapshot_list; - } else { - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); - list = &header->ch_snapshot_list; - } - return list; -} - static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; @@ -761,94 +774,103 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) struct nilfs_snapshot_list *list; __u64 curr, prev; unsigned long curr_blkoff, prev_blkoff; - void *kaddr; + size_t offset, curr_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) - goto out_sem; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + goto out_header; + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_local(kaddr); 
+ kunmap_local(cp); goto out_cp; } - kunmap_local(kaddr); + kunmap_local(cp); - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_cp; - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* + * Find the last snapshot before the checkpoint being changed to + * snapshot mode by going backwards through the snapshot list. + * Set "prev" to its checkpoint number, or 0 if not found. + */ + header = kmap_local_folio(header_bh->b_folio, 0); list = &header->ch_snapshot_list; curr_bh = header_bh; get_bh(curr_bh); curr = 0; curr_blkoff = 0; + curr_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); prev = le64_to_cpu(list->ssl_prev); while (prev > cno) { prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; + kunmap_local(list); if (curr_blkoff != prev_blkoff) { - kunmap_local(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); - if (ret < 0) - goto out_header; - kaddr = kmap_local_page(curr_bh->b_page); + if (unlikely(ret < 0)) + goto out_cp; } + curr_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, curr, curr_bh); + list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); curr_blkoff = prev_blkoff; - cp = nilfs_cpfile_block_get_checkpoint( - cpfile, curr, curr_bh, kaddr); - list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } - kunmap_local(kaddr); + kunmap_local(list); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_curr; + + prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); + prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } - kaddr = kmap_local_page(curr_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, curr, curr_bh, kaddr); + /* Update the list entry for the next snapshot */ + list = kmap_local_folio(curr_bh->b_folio, curr_list_offset); list->ssl_prev = cpu_to_le64(cno); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + /* Update the checkpoint being changed to a snapshot */ + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); - kaddr = kmap_local_page(prev_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, prev, prev_bh, kaddr); + /* Update the list entry for the previous snapshot */ + list = kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(cno); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* Update the statistics in the header */ + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, 1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); @@ -861,12 +883,12 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) out_curr: brelse(curr_bh); - out_header: - brelse(header_bh); - out_cp: brelse(cp_bh); + out_header: + brelse(header_bh); + out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; @@ -879,79 +901,87 @@ static int 
nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 next, prev; - void *kaddr; + size_t offset, next_list_offset, prev_list_offset; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (unlikely(ret < 0)) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) - goto out_sem; - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + goto out_header; + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, cp_bh); + cp = kmap_local_folio(cp_bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_local(kaddr); + kunmap_local(cp); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); - kunmap_local(kaddr); + kunmap_local(cp); - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) - goto out_cp; if (next != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &next_bh); if (ret < 0) - goto out_header; + goto out_cp; + + next_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, next, next_bh); } else { next_bh = header_bh; get_bh(next_bh); + next_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_next; + + prev_list_offset = nilfs_cpfile_cp_snapshot_list_offset( + cpfile, prev, prev_bh); } else { prev_bh = header_bh; get_bh(prev_bh); + prev_list_offset = nilfs_cpfile_ch_snapshot_list_offset(); } - kaddr = kmap_local_page(next_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, next, next_bh, kaddr); + /* Update the list entry for the next snapshot */ + list = kmap_local_folio(next_bh->b_folio, next_list_offset); list->ssl_prev = cpu_to_le64(prev); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(prev_bh->b_page); - list = nilfs_cpfile_block_get_snapshot_list( - cpfile, prev, prev_bh, kaddr); + /* Update the list entry for the previous snapshot */ + list = kmap_local_folio(prev_bh->b_folio, prev_list_offset); list->ssl_next = cpu_to_le64(next); - kunmap_local(kaddr); + kunmap_local(list); - kaddr = kmap_local_page(cp_bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + /* Update the snapshot being changed back to a plain checkpoint */ + cp = kmap_local_folio(cp_bh->b_folio, offset); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); - kaddr = kmap_local_page(header_bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + /* Update the statistics in the header */ + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->ch_nsnapshots, -1); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); @@ -964,12 +994,12 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) out_next: brelse(next_bh); - out_header: - brelse(header_bh); - out_cp: brelse(cp_bh); + out_header: + brelse(header_bh); + out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; @@ -990,7 +1020,7 @@ 
int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *bh; struct nilfs_checkpoint *cp; - void *kaddr; + size_t offset; int ret; /* @@ -1004,13 +1034,14 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; - kaddr = kmap_local_page(bh->b_page); - cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + + offset = nilfs_cpfile_checkpoint_offset(cpfile, cno, bh); + cp = kmap_local_folio(bh->b_folio, offset); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); - kunmap_local(kaddr); + kunmap_local(cp); brelse(bh); out: @@ -1079,7 +1110,6 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { struct buffer_head *bh; struct nilfs_cpfile_header *header; - void *kaddr; int ret; down_read(&NILFS_MDT(cpfile)->mi_sem); @@ -1087,12 +1117,11 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; - kaddr = kmap_local_page(bh->b_page); - header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + header = kmap_local_folio(bh->b_folio, 0); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); - kunmap_local(kaddr); + kunmap_local(header); brelse(bh); out_sem: diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0bef662176a4..e220dcb08aa6 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -89,15 +89,15 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); @@ -113,15 +113,15 @@ static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_dat_commit_entry(dat, req); @@ -143,14 +143,14 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr) { struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); 
entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_local(kaddr); + kunmap_local(entry); nilfs_dat_commit_entry(dat, req); } @@ -160,19 +160,19 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; - void *kaddr; + size_t offset; int ret; ret = nilfs_dat_prepare_entry(dat, req, 0); if (ret < 0) return ret; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -200,11 +200,11 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, struct nilfs_dat_entry *entry; __u64 start, end; sector_t blocknr; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); end = start = le64_to_cpu(entry->de_start); if (!dead) { end = nilfs_mdt_cno(dat); @@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (blocknr == 0) nilfs_dat_commit_free(dat, req); @@ -225,14 +225,14 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; __u64 start; sector_t blocknr; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(req->pr_entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, - req->pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, req->pr_entry_nr, + req->pr_entry_bh); + entry = kmap_local_folio(req->pr_entry_bh->b_folio, offset); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_local(kaddr); + kunmap_local(entry); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); @@ -336,7 +336,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) { struct buffer_head *entry_bh; struct nilfs_dat_entry *entry; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); @@ -359,21 +359,21 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) } } - kaddr = kmap_local_page(entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); + entry = kmap_local_folio(entry_bh->b_folio, offset); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, "%s: invalid vblocknr = %llu, [%llu, %llu)", __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); - kunmap_local(kaddr); + kunmap_local(entry); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_local(kaddr); + kunmap_local(entry); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); @@ -407,7 +407,7 @@ int 
nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) struct buffer_head *entry_bh, *bh; struct nilfs_dat_entry *entry; sector_t blocknr; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); @@ -423,8 +423,8 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) } } - kaddr = kmap_local_page(entry_bh->b_page); - entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh); + entry = kmap_local_folio(entry_bh->b_folio, offset); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { ret = -ENOENT; @@ -433,7 +433,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) *blocknrp = blocknr; out: - kunmap_local(kaddr); + kunmap_local(entry); brelse(entry_bh); return ret; } @@ -442,11 +442,12 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; - struct nilfs_dat_entry *entry; + struct nilfs_dat_entry *entry, *first_entry; struct nilfs_vinfo *vinfo = buf; __u64 first, last; - void *kaddr; + size_t offset; unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + unsigned int entry_size = NILFS_MDT(dat)->mi_entry_size; int i, j, n, ret; for (i = 0; i < nvi; i += n) { @@ -454,23 +455,28 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, 0, &entry_bh); if (ret < 0) return ret; - kaddr = kmap_local_page(entry_bh->b_page); - /* last virtual block number in this block */ + first = vinfo->vi_vblocknr; first = div64_ul(first, entries_per_block); first *= entries_per_block; + /* first virtual block number in this block */ + last = first + entries_per_block - 1; + /* last virtual block number in this block */ + + offset = nilfs_palloc_entry_offset(dat, first, entry_bh); + first_entry = kmap_local_folio(entry_bh->b_folio, offset); for (j = i, n = 0; j < nvi && vinfo->vi_vblocknr >= first && vinfo->vi_vblocknr <= last; j++, n++, vinfo = (void *)vinfo + visz) { - entry = nilfs_palloc_block_get_entry( - dat, vinfo->vi_vblocknr, entry_bh, kaddr); + entry = (void *)first_entry + + (vinfo->vi_vblocknr - first) * entry_size; vinfo->vi_start = le64_to_cpu(entry->de_start); vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } - kunmap_local(kaddr); + kunmap_local(first_entry); brelse(entry_bh); } diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index fe5b1a30c509..0a3aea6c416b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -70,7 +70,7 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode) */ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned int last_byte = inode->i_size; + u64 last_byte = inode->i_size; last_byte -= page_nr << PAGE_SHIFT; if (last_byte > PAGE_SIZE) @@ -95,7 +95,7 @@ static void nilfs_commit_chunk(struct folio *folio, unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); + nr_dirty = nilfs_page_count_clean_buffers(folio, from, to); copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); @@ -289,7 +289,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) * The folio is mapped and unlocked. When the caller is finished with * the entry, it should call folio_release_kmap(). * - * On failure, returns NULL and the caller should ignore foliop. 
+ * On failure, returns an error pointer and the caller should ignore foliop. */ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, const struct qstr *qstr, struct folio **foliop) @@ -312,22 +312,24 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, do { char *kaddr = nilfs_get_folio(dir, n, foliop); - if (!IS_ERR(kaddr)) { - de = (struct nilfs_dir_entry *)kaddr; - kaddr += nilfs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - nilfs_error(dir->i_sb, - "zero-length directory entry"); - folio_release_kmap(*foliop, kaddr); - goto out; - } - if (nilfs_match(namelen, name, de)) - goto found; - de = nilfs_next_entry(de); + if (IS_ERR(kaddr)) + return ERR_CAST(kaddr); + + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, + "zero-length directory entry"); + folio_release_kmap(*foliop, kaddr); + goto out; } - folio_release_kmap(*foliop, kaddr); + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); } + folio_release_kmap(*foliop, kaddr); + if (++n >= npages) n = 0; /* next folio is past the blocks we've got */ @@ -340,7 +342,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, } } while (n != start); out: - return NULL; + return ERR_PTR(-ENOENT); found: ei->i_dir_start_lookup = n; @@ -384,18 +386,18 @@ fail: return NULL; } -ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) { - ino_t res = 0; struct nilfs_dir_entry *de; struct folio *folio; de = nilfs_find_entry(dir, qstr, &folio); - if (de) { - res = le64_to_cpu(de->inode); - folio_release_kmap(folio, de); - } - return res; + if (IS_ERR(de)) + return PTR_ERR(de); + + *ino = le64_to_cpu(de->inode); + folio_release_kmap(folio, de); + return 0; } void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1c9ae36a03ab..2dbb15767df1 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; } - if (!buffer_mapped(bh)) { - bh->b_bdev = inode->i_sb->s_bdev; + if (!buffer_mapped(bh)) set_buffer_mapped(bh); - } bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -165,7 +163,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &empty_aops; + inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 1e86b9303b7c..e7339eb3c08a 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -98,7 +98,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) .pr_entry_nr = ino, .pr_entry_bh = NULL }; struct nilfs_inode *raw_inode; - void *kaddr; + size_t offset; int ret; ret = nilfs_palloc_prepare_free_entry(ifile, &req); @@ -113,11 +113,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) return ret; } - kaddr = kmap_local_page(req.pr_entry_bh->b_page); - raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, - req.pr_entry_bh, kaddr); + offset = nilfs_palloc_entry_offset(ifile, req.pr_entry_nr, + req.pr_entry_bh); + raw_inode = kmap_local_folio(req.pr_entry_bh->b_folio, offset); raw_inode->i_flags = 0; - kunmap_local(kaddr); + 
kunmap_local(raw_inode); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 625545cc2a98..5d116a566d9e 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -21,9 +21,9 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { - void *kaddr = kmap_local_page(ibh->b_page); + size_t __offset_in_folio = nilfs_palloc_entry_offset(ifile, ino, ibh); - return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); + return kmap_local_folio(ibh->b_folio, __offset_in_folio); } static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index be6acf6e2bfc..23f3a75edd50 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -170,37 +170,6 @@ static int nilfs_writepages(struct address_space *mapping, return err; } -static int nilfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - int err; - - if (sb_rdonly(inode->i_sb)) { - /* - * It means that filesystem was remounted in read-only - * mode because of error or metadata corruption. But we - * have dirty pages that try to be flushed in background. - * So, here we simply discard this dirty page. - */ - nilfs_clear_folio_dirty(folio); - folio_unlock(folio); - return -EROFS; - } - - folio_redirty_for_writepage(wbc, folio); - folio_unlock(folio); - - if (wbc->sync_mode == WB_SYNC_ALL) { - err = nilfs_construct_segment(inode->i_sb); - if (unlikely(err)) - return err; - } else if (wbc->for_reclaim) - nilfs_flush_segment(inode->i_sb, inode->i_ino); - - return 0; -} - static bool nilfs_dirty_folio(struct address_space *mapping, struct folio *folio) { @@ -273,7 +242,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start, + nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); copied = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); @@ -295,7 +264,6 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations nilfs_aops = { - .writepage = nilfs_writepage, .read_folio = nilfs_read_folio, .writepages = nilfs_writepages, .dirty_folio = nilfs_dirty_folio, @@ -304,9 +272,14 @@ const struct address_space_operations nilfs_aops = { .write_end = nilfs_write_end, .invalidate_folio = block_invalidate_folio, .direct_IO = nilfs_direct_IO, + .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, }; +const struct address_space_operations nilfs_buffer_cache_aops = { + .invalidate_folio = block_invalidate_folio, +}; + static int nilfs_insert_inode_locked(struct inode *inode, struct nilfs_root *root, unsigned long ino) @@ -575,8 +548,14 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, inode = nilfs_iget_locked(sb, root, ino); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + if (!inode->i_nlink) { + iput(inode); + return ERR_PTR(-ESTALE); + } return inode; + } err = __nilfs_read_inode(sb, root, ino, inode); if (unlikely(err)) { @@ -706,6 +685,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) NILFS_I(s_inode)->i_flags = 0; memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); 
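/*
 * A minimal usage sketch (hypothetical helper, not part of the patch) of the
 * folio-mapping pattern the nilfs2 hunks above introduce: compute the byte
 * offset of an on-disk entry inside the buffer's folio with
 * nilfs_palloc_entry_offset(), map just that entry with kmap_local_folio(),
 * and unmap through the returned entry pointer instead of a page-base kaddr.
 */
static __u64 example_read_dat_blocknr(struct inode *dat, __u64 vblocknr,
				       struct buffer_head *entry_bh)
{
	size_t offset = nilfs_palloc_entry_offset(dat, vblocknr, entry_bh);
	struct nilfs_dat_entry *entry;
	__u64 blocknr;

	entry = kmap_local_folio(entry_bh->b_folio, offset);
	blocknr = le64_to_cpu(entry->de_blocknr);
	kunmap_local(entry);	/* any address inside the mapped page is accepted */

	return blocknr;
}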
+ s_inode->i_mapping->a_ops = &nilfs_buffer_cache_aops; err = nilfs_attach_btree_node_cache(s_inode); if (unlikely(err)) { diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index ceb7dc0b5bad..965b5ad1c0df 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -33,7 +33,8 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, struct buffer_head *, void *)) { struct nilfs_inode_info *ii = NILFS_I(inode); - void *kaddr; + struct folio *folio = bh->b_folio; + void *from; int ret; /* Caller exclude read accesses using page lock */ @@ -47,12 +48,14 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); - kaddr = kmap_local_page(bh->b_page); - memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); + /* Initialize block (block size > PAGE_SIZE not yet supported) */ + from = kmap_local_folio(folio, offset_in_folio(folio, bh->b_data)); + memset(from, 0, bh->b_size); if (init_block) - init_block(inode, bh, kaddr); - flush_dcache_page(bh->b_page); - kunmap_local(kaddr); + init_block(inode, bh, from); + kunmap_local(from); + + flush_dcache_folio(folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -89,7 +92,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, if (buffer_uptodate(bh)) goto failed_bh; - bh->b_bdev = sb->s_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); @@ -396,10 +398,9 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) return test_bit(NILFS_I_DIRTY, &ii->i_state); } -static int -nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +static int nilfs_mdt_write_folio(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; struct super_block *sb; int err = 0; @@ -432,11 +433,23 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) return err; } +static int nilfs_mdt_writeback(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = nilfs_mdt_write_folio(folio, wbc); + + return error; +} static const struct address_space_operations def_mdt_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, - .writepage = nilfs_mdt_write_page, + .writepages = nilfs_mdt_writeback, + .migrate_folio = buffer_migrate_folio_norefs, }; static const struct inode_operations def_mdt_iops; @@ -571,7 +584,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) if (!bh_frozen) bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0); - bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); + bh_frozen = get_nth_bh(bh_frozen, + offset_in_folio(folio, bh->b_data) >> blkbits); if (!buffer_uptodate(bh_frozen)) nilfs_copy_buffer(bh_frozen, bh); @@ -601,7 +615,8 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) if (!IS_ERR(folio)) { bh_frozen = folio_buffers(folio); if (bh_frozen) { - n = bh_offset(bh) >> inode->i_blkbits; + n = offset_in_folio(folio, bh->b_data) >> + inode->i_blkbits; bh_frozen = get_nth_bh(bh_frozen, n); } folio_unlock(folio); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c950139db6ef..1d836a5540f3 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -55,12 +55,25 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; + int res; if (dentry->d_name.len > 
NILFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = nilfs_inode_by_name(dir, &dentry->d_name); - inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; + res = nilfs_inode_by_name(dir, &dentry->d_name, &ino); + if (res) { + if (res != -ENOENT) + return ERR_PTR(res); + inode = NULL; + } else { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + if (inode == ERR_PTR(-ESTALE)) { + nilfs_error(dir->i_sb, + "deleted inode referenced: %lu", ino); + return ERR_PTR(-EIO); + } + } + return d_splice_alias(inode, dentry); } @@ -149,6 +162,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~__GFP_FS)); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) @@ -263,10 +279,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) struct folio *folio; int err; - err = -ENOENT; de = nilfs_find_entry(dir, &dentry->d_name, &folio); - if (!de) + if (IS_ERR(de)) { + err = PTR_ERR(de); goto out; + } inode = d_inode(dentry); err = -EIO; @@ -362,10 +379,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (unlikely(err)) return err; - err = -ENOENT; old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio); - if (!old_de) + if (IS_ERR(old_de)) { + err = PTR_ERR(old_de); goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -382,10 +400,12 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (dir_de && !nilfs_empty_dir(new_inode)) goto out_dir; - err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio); - if (!new_de) + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, + &new_folio); + if (IS_ERR(new_de)) { + err = PTR_ERR(new_de); goto out_dir; + } nilfs_set_link(new_dir, new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); nilfs_mark_inode_dirty(new_dir); @@ -440,12 +460,13 @@ out: */ static struct dentry *nilfs_get_parent(struct dentry *child) { - unsigned long ino; + ino_t ino; + int res; struct nilfs_root *root; - ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); - if (!ino) - return ERR_PTR(-ENOENT); + res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino); + if (res) + return ERR_PTR(res); root = NILFS_I(d_inode(child))->i_root; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index fb1c4c5bae7c..dff241c53fc5 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -254,7 +254,7 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) /* dir.c */ int nilfs_add_link(struct dentry *, struct inode *); -ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino); int nilfs_make_empty(struct inode *, struct inode *); struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *, struct folio **); @@ -401,6 +401,7 @@ extern const struct file_operations nilfs_dir_operations; extern const struct inode_operations nilfs_file_inode_operations; extern const struct file_operations nilfs_file_operations; extern const struct address_space_operations nilfs_aops; +extern const struct address_space_operations nilfs_buffer_cache_aops; extern const struct inode_operations nilfs_dir_inode_operations; extern const struct inode_operations nilfs_special_inode_operations; extern const struct inode_operations nilfs_symlink_inode_operations; diff --git 
a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 9c0b7cddeaae..9de2a494a069 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); - touch_buffer(bh); wait_on_buffer(bh); return bh; } @@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, folio_put(folio); return NULL; } + bh->b_bdev = inode->i_sb->s_bdev; return bh; } @@ -77,7 +77,8 @@ void nilfs_forget_buffer(struct buffer_head *bh) const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | - BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | + BIT(BH_Delay)); lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); @@ -98,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh) */ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) { - void *kaddr0, *kaddr1; + void *saddr, *daddr; unsigned long bits; - struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio; struct buffer_head *bh; - kaddr0 = kmap_local_page(spage); - kaddr1 = kmap_local_page(dpage); - memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_local(kaddr1); - kunmap_local(kaddr0); + saddr = kmap_local_folio(sfolio, bh_offset(sbh)); + daddr = kmap_local_folio(dfolio, bh_offset(dbh)); + memcpy(daddr, saddr, sbh->b_size); + kunmap_local(daddr); + kunmap_local(saddr); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; @@ -121,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) unlock_buffer(bh); } if (bits & BIT(BH_Uptodate)) - SetPageUptodate(dpage); + folio_mark_uptodate(dfolio); else - ClearPageUptodate(dpage); + folio_clear_uptodate(dfolio); if (bits & BIT(BH_Mapped)) - SetPageMappedToDisk(dpage); + folio_set_mappedtodisk(dfolio); else - ClearPageMappedToDisk(dpage); + folio_clear_mappedtodisk(dfolio); } /** @@ -400,13 +401,15 @@ void nilfs_clear_folio_dirty(struct folio *folio) folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); head = folio_buffers(folio); if (head) { const unsigned long clear_bits = (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | - BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) | + BIT(BH_Delay)); bh = head; do { @@ -419,14 +422,14 @@ void nilfs_clear_folio_dirty(struct folio *folio) __nilfs_clear_folio_dirty(folio); } -unsigned int nilfs_page_count_clean_buffers(struct page *page, +unsigned int nilfs_page_count_clean_buffers(struct folio *folio, unsigned int from, unsigned int to) { unsigned int block_start, block_end; struct buffer_head *bh, *head; unsigned int nc = 0; - for (bh = head = page_buffers(page), block_start = 0; + for (bh = head = folio_buffers(folio), block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { block_end = block_start + bh->b_size; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 64521a03a19e..136cd1c143c9 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -43,8 +43,8 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space 
*); void nilfs_clear_folio_dirty(struct folio *folio); void nilfs_clear_dirty_pages(struct address_space *mapping); -unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, - unsigned int); +unsigned int nilfs_page_count_clean_buffers(struct folio *folio, + unsigned int from, unsigned int to); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 21d81097a89f..e43405bf521e 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -481,19 +481,16 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, - loff_t pos, struct page *page) + loff_t pos, struct folio *folio) { struct buffer_head *bh_org; - size_t from = pos & ~PAGE_MASK; - void *kaddr; + size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; - kaddr = kmap_local_page(page); - memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); - kunmap_local(kaddr); + memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } @@ -531,13 +528,13 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) - goto failed_page; + goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) - goto failed_page; + goto failed_folio; block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, folio, NULL); @@ -548,7 +545,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, (*nr_salvaged_blocks)++; goto next; - failed_page: + failed_folio: folio_unlock(folio); folio_put(folio); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc431b4c34c9..e08cab03366b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -205,7 +205,6 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; - void *kaddr; u32 crc; bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, @@ -220,9 +219,13 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - kaddr = kmap_local_page(bh->b_page); - crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); - kunmap_local(kaddr); + size_t offset = offset_in_folio(bh->b_folio, bh->b_data); + unsigned char *from; + + /* Do not support block sizes larger than PAGE_SIZE */ + from = kmap_local_folio(bh->b_folio, offset); + crc = crc32_le(crc, from, bh->b_size); + kunmap_local(from); } raw_sum->ss_datasum = cpu_to_le32(crc); } @@ -374,7 +377,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi, struct buffer_head *bh) { - int len, err; + int err; BUG_ON(wi->nr_vecs <= 0); repeat: @@ -385,8 +388,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, (wi->nilfs->ns_blocksize_bits - 9); } - len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); - if (len == bh->b_size) { + if (bio_add_folio(wi->bio, bh->b_folio, bh->b_size, + offset_in_folio(bh->b_folio, bh->b_data))) { wi->end++; return 0; } diff --git a/fs/nilfs2/sufile.c 
b/fs/nilfs2/sufile.c index eea5a6a12f7b..d3ecc813d633 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -70,11 +70,20 @@ nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, max - curr + 1); } -static struct nilfs_segment_usage * -nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, - struct buffer_head *bh, void *kaddr) +/** + * nilfs_sufile_segment_usage_offset - calculate the byte offset of a segment + * usage entry in the folio containing it + * @sufile: segment usage file inode + * @segnum: number of segment usage + * @bh: buffer head of block containing segment usage indexed by @segnum + * + * Return: Byte offset in the folio of the segment usage entry. + */ +static size_t nilfs_sufile_segment_usage_offset(const struct inode *sufile, + __u64 segnum, + struct buffer_head *bh) { - return kaddr + bh_offset(bh) + + return offset_in_folio(bh->b_folio, bh->b_data) + nilfs_sufile_get_offset(sufile, segnum) * NILFS_MDT(sufile)->mi_entry_size; } @@ -112,13 +121,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { struct nilfs_sufile_header *header; - void *kaddr; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); } @@ -313,6 +320,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; + size_t offset; void *kaddr; unsigned long nsegments, nsus, cnt; int ret, j; @@ -322,10 +330,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); last_alloc = le64_to_cpu(header->sh_last_alloc); - kunmap_local(kaddr); + kunmap_local(header); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; @@ -359,9 +366,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) &su_bh); if (ret < 0) goto out_header; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, offset); nsus = nilfs_sufile_segment_usages_in_block( sufile, segnum, maxsegnum); @@ -372,12 +380,11 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_segment_usage_set_dirty(su); kunmap_local(kaddr); - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); - kunmap_local(kaddr); + kunmap_local(header); sui->ncleansegs--; mark_buffer_dirty(header_bh); @@ -411,18 +418,18 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = 
nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); - kunmap_local(kaddr); + kunmap_local(su); return; } nilfs_segment_usage_set_dirty(su); - kunmap_local(kaddr); + kunmap_local(su); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; @@ -436,14 +443,14 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int clean, dirty; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { - kunmap_local(kaddr); + kunmap_local(su); return; } clean = nilfs_segment_usage_clean(su); @@ -453,7 +460,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); - kunmap_local(kaddr); + kunmap_local(su); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; @@ -467,15 +474,15 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int sudirty; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); - kunmap_local(kaddr); + kunmap_local(su); return; } if (unlikely(nilfs_segment_usage_error(su))) @@ -488,7 +495,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? 
(u64)-1 : 0); @@ -507,7 +514,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) { struct buffer_head *bh; - void *kaddr; + size_t offset; struct nilfs_segment_usage *su; int ret; @@ -523,12 +530,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) goto out_sem; } - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - kunmap_local(kaddr); + kunmap_local(su); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, @@ -546,7 +553,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -568,7 +575,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, { struct buffer_head *bh; struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int ret; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -576,8 +583,8 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, if (ret < 0) goto out_sem; - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (modtime) { /* * Check segusage error and set su_lastmod only when updating @@ -587,7 +594,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); - kunmap_local(kaddr); + kunmap_local(su); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); @@ -619,7 +626,6 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - void *kaddr; int ret; down_read(&NILFS_MDT(sufile)->mi_sem); @@ -628,8 +634,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) if (ret < 0) goto out_sem; - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); @@ -638,7 +643,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); - kunmap_local(kaddr); + kunmap_local(header); brelse(header_bh); out_sem: @@ -651,18 +656,18 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, struct buffer_head *su_bh) { struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; int suclean; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); if (nilfs_segment_usage_error(su)) { - kunmap_local(kaddr); + kunmap_local(su); return; } suclean = nilfs_segment_usage_clean(su); 
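/*
 * Sketch of the multi-entry variant used by the sufile hunks above
 * (hypothetical function; nilfs_sufile_segment_usage_offset() and the
 * segment-usage helpers are the ones from this series): a single
 * kmap_local_folio() covers a run of entries, the cursor advances by
 * mi_entry_size, and the base address is kept for kunmap_local().
 */
static void example_walk_segment_usages(struct inode *sufile, __u64 segnum,
					struct buffer_head *su_bh, size_t n)
{
	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
	size_t offset = nilfs_sufile_segment_usage_offset(sufile, segnum, su_bh);
	struct nilfs_segment_usage *su;
	void *kaddr;
	size_t j;

	su = kaddr = kmap_local_folio(su_bh->b_folio, offset);
	for (j = 0; j < n; j++, su = (void *)su + susz) {
		if (nilfs_segment_usage_clean(su))
			continue;
		/* read or update *su here, then mark_buffer_dirty(su_bh) */
	}
	kunmap_local(kaddr);
}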
nilfs_segment_usage_set_error(su); - kunmap_local(kaddr); + kunmap_local(su); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -700,7 +705,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, unsigned long segusages_per_block; unsigned long nsegs, ncleaned; __u64 segnum; - void *kaddr; + size_t offset; ssize_t n, nc; int ret; int j; @@ -731,16 +736,16 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, /* hole */ continue; } - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kmap_local_folio(su_bh->b_folio, offset); su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; - kunmap_local(kaddr); + kunmap_local(su2); brelse(su_bh); goto out_header; } @@ -752,7 +757,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, nc++; } } - kunmap_local(kaddr); + kunmap_local(su2); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; @@ -799,7 +804,6 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) struct buffer_head *header_bh; struct nilfs_sufile_header *header; struct nilfs_sufile_info *sui = NILFS_SUI(sufile); - void *kaddr; unsigned long nsegs, nrsvsegs; int ret = 0; @@ -837,10 +841,9 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) sui->allocmin = 0; } - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); - kunmap_local(kaddr); + kunmap_local(header); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); @@ -874,6 +877,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + size_t offset; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; @@ -901,9 +905,9 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, continue; } - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (j = 0; j < n; j++, su = (void *)su + susz, si = (void *)si + sisz) { si->sui_lastmod = le64_to_cpu(su->su_lastmod); @@ -951,7 +955,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, struct buffer_head *header_bh, *bh; struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; struct nilfs_segment_usage *su; - void *kaddr; + size_t offset; unsigned long blkoff, prev_blkoff; int cleansi, cleansu, dirtysi, dirtysu; long ncleaned = 0, ndirtied = 0; @@ -983,9 +987,9 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, goto out_header; for (;;) { - kaddr = kmap_local_page(bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, sup->sup_segnum, bh, kaddr); + offset = nilfs_sufile_segment_usage_offset( + sufile, sup->sup_segnum, bh); + su = kmap_local_folio(bh->b_folio, offset); if (nilfs_suinfo_update_lastmod(sup)) su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); @@ -1020,7 +1024,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, su->su_flags = 
cpu_to_le32(sup->sup_sui.sui_flags); } - kunmap_local(kaddr); + kunmap_local(su); sup = (void *)sup + supsz; if (sup >= supend) @@ -1076,6 +1080,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *su_bh; struct nilfs_segment_usage *su; + size_t offset; void *kaddr; size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; sector_t seg_start, seg_end, start_block, end_block; @@ -1125,9 +1130,9 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) continue; } - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, - su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset(sufile, segnum, + su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, offset); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { if (!nilfs_segment_usage_clean(su)) continue; @@ -1167,9 +1172,10 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } ndiscarded += nblocks; - kaddr = kmap_local_page(su_bh->b_page); - su = nilfs_sufile_block_get_segment_usage( - sufile, segnum, su_bh, kaddr); + offset = nilfs_sufile_segment_usage_offset( + sufile, segnum, su_bh); + su = kaddr = kmap_local_folio(su_bh->b_folio, + offset); } /* start new extent */ @@ -1221,7 +1227,6 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_sufile_info *sui; struct buffer_head *header_bh; struct nilfs_sufile_header *header; - void *kaddr; int err; if (susize > sb->s_blocksize) { @@ -1262,10 +1267,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, } sui = NILFS_SUI(sufile); - kaddr = kmap_local_page(header_bh->b_page); - header = kaddr + bh_offset(header_bh); + header = kmap_local_folio(header_bh->b_folio, 0); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); - kunmap_local(kaddr); + kunmap_local(header); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5dbef7f5c95..6004dfdfdf0f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -16,7 +16,6 @@ #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; @@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) new_fsn_mark = NULL; } - rcu_read_lock(); - f = lookup_fdget_rcu(fd); - rcu_read_unlock(); + f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index a511f9d8677b..0e36aaf379b7 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -15,7 +15,6 @@ config FANOTIFY config FANOTIFY_ACCESS_PERMISSIONS bool "fanotify permissions checking" depends on FANOTIFY - depends on SECURITY default n help Say Y here is you want fanotify listeners to be able to make permissions diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 224bccaab4cc..24c7c5df4998 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> #include <linux/init.h> #include <linux/jiffies.h> diff --git 
a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..2d85c71717d6 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -266,13 +265,6 @@ static int create_fd(struct fsnotify_group *group, const struct path *path, group->fanotify_data.f_flags | __FMODE_NONOTIFY, current_cred()); if (IS_ERR(new_file)) { - /* - * we still send an event even if we can't open the file. this - * can happen when say tasks are gone and we try to open their - * /proc files or we try to open a WRONLY file like in sysfs - * we just send the errno to userspace since there isn't much - * else we can do. - */ put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); } else { @@ -663,7 +655,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL, *pidfd_file = NULL; - int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; + int ret, pidfd = -ESRCH, fd = -EBADF; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -691,10 +683,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); - if (fd < 0) - return fd; + /* + * Opening an fd from dentry can fail for several reasons. + * For example, when tasks are gone and we try to open their + * /proc files or we try to open a WRONLY file like in sysfs + * or when trying to open a file that was deleted on the + * remote network server. + * + * For a group with FAN_REPORT_FD_ERROR, we will send the + * event with the error instead of the open fd, otherwise + * Userspace may not get the error at all. + * In any case, userspace will not know which file failed to + * open, so add a debug print for further investigation. + */ + if (fd < 0) { + pr_debug("fanotify: create_fd(%pd2) failed err=%d\n", + path->dentry, fd); + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) { + /* + * Historically, we've handled EOPENSTALE in a + * special way and silently dropped such + * events. Now we have to keep it to maintain + * backward compatibility... + */ + if (fd == -EOPENSTALE) + fd = 0; + return fd; + } + } } - metadata.fd = fd; + if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) + metadata.fd = fd; + else + metadata.fd = fd >= 0 ? fd : FAN_NOFD; if (pidfd_mode) { /* @@ -709,18 +730,16 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, * The PIDTYPE_TGID check for an event->pid is performed * preemptively in an attempt to catch out cases where the event * listener reads events after the event generating process has - * already terminated. Report FAN_NOPIDFD to the event listener - * in those cases, with all other pidfd creation errors being - * reported as FAN_EPIDFD. + * already terminated. Depending on flag FAN_REPORT_FD_ERROR, + * report either -ESRCH or FAN_NOPIDFD to the event listener in + * those cases with all other pidfd creation errors reported as + * the error code itself or as FAN_EPIDFD. 
*/ - if (metadata.pid == 0 || - !pid_has_task(event->pid, PIDTYPE_TGID)) { - pidfd = FAN_NOPIDFD; - } else { + if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID)) pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); - if (pidfd < 0) - pidfd = FAN_EPIDFD; - } + + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0) + pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD; } ret = -EFAULT; @@ -737,9 +756,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; - if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PERM(event)->fd = fd; - if (info_mode) { ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); @@ -753,15 +769,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (pidfd_file) fd_install(pidfd, pidfd_file); + if (fanotify_is_perm_event(event->mask)) + FANOTIFY_PERM(event)->fd = fd; + return metadata.event_len; out_close_fd: - if (fd != FAN_NOFD) { + if (f) { put_unused_fd(fd); fput(f); } - if (pidfd >= 0) { + if (pidfd_file) { put_unused_fd(pidfd); fput(pidfd_file); } @@ -828,15 +847,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, event, buf, count); - if (unlikely(ret == -EOPENSTALE)) { - /* - * We cannot report events with stale fd so drop it. - * Setting ret to 0 will continue the event loop and - * do the right thing if there are no more events to - * read (i.e. return bytes read, -EAGAIN or wait). - */ - ret = 0; - } /* * Permission events get queued to wait for response. Other @@ -845,7 +855,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (!fanotify_is_perm_event(event->mask)) { fsnotify_destroy_event(group, &event->fse); } else { - if (ret <= 0) { + if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) { spin_lock(&group->notification_lock); finish_permission_event(group, FANOTIFY_PERM(event), FAN_DENY, NULL); @@ -1003,22 +1013,17 @@ static int fanotify_find_path(int dfd, const char __user *filename, dfd, filename, flags); if (filename == NULL) { - struct fd f = fdget(dfd); + CLASS(fd, f)(dfd); - ret = -EBADF; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; - ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(file_inode(fd_file(f))->i_mode))) { - fdput(f); - goto out; - } + !(S_ISDIR(file_inode(fd_file(f))->i_mode))) + return -ENOTDIR; *path = fd_file(f)->f_path; path_get(path); - fdput(f); } else { unsigned int lookup_flags = 0; @@ -1682,7 +1687,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct inode *inode = NULL; struct vfsmount *mnt = NULL; struct fsnotify_group *group; - struct fd f; struct path path; struct fan_fsid __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; @@ -1752,14 +1756,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, umask = FANOTIFY_EVENT_FLAGS; } - f = fdget(fanotify_fd); - if (unlikely(!fd_file(f))) + CLASS(fd, f)(fanotify_fd); + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an fanotify instance */ - ret = -EINVAL; if (unlikely(fd_file(f)->f_op != &fanotify_fops)) - goto fput_and_out; + return -EINVAL; group = fd_file(f)->private_data; /* @@ -1767,23 +1770,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * marks. This also includes setting up such marks by a group that * was initialized by an unprivileged user. 
*/ - ret = -EPERM; if ((!capable(CAP_SYS_ADMIN) || FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && mark_type != FAN_MARK_INODE) - goto fput_and_out; + return -EPERM; /* * Permission events require minimum priority FAN_CLASS_CONTENT. */ - ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && group->priority < FSNOTIFY_PRIO_CONTENT) - goto fput_and_out; + return -EINVAL; if (mask & FAN_FS_ERROR && mark_type != FAN_MARK_FILESYSTEM) - goto fput_and_out; + return -EINVAL; /* * Evictable is only relevant for inode marks, because only inode object @@ -1791,7 +1792,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (flags & FAN_MARK_EVICTABLE && mark_type != FAN_MARK_INODE) - goto fput_and_out; + return -EINVAL; /* * Events that do not carry enough information to report @@ -1803,7 +1804,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) - goto fput_and_out; + return -EINVAL; /* * FAN_RENAME uses special info type records to report the old and @@ -1811,23 +1812,22 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * useful and was not implemented. */ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) - goto fput_and_out; + return -EINVAL; if (mark_cmd == FAN_MARK_FLUSH) { - ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); else if (mark_type == FAN_MARK_FILESYSTEM) fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); - goto fput_and_out; + return 0; } ret = fanotify_find_path(dfd, pathname, &path, flags, (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) - goto fput_and_out; + return ret; if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); @@ -1906,8 +1906,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, path_put_and_out: path_put(&path); -fput_and_out: - fdput(f); return ret; } @@ -1954,7 +1952,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index dec553034027..e933f9c65d90 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -47,10 +47,8 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); - if ((ret == FILEID_INVALID) || (ret < 0)) { - WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); + if ((ret == FILEID_INVALID) || (ret < 0)) return; - } f->handle_type = ret; f->handle_bytes = size * sizeof(u32); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 82ae8254c068..f976949d2634 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -333,16 +333,19 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, if (!inode_mark) return 0; - if (mask & FS_EVENT_ON_CHILD) { - /* - * Some events can be sent on both parent dir and child marks - * (e.g. FS_ATTRIB). 
If both parent dir and child are - * watching, report the event once to parent dir with name (if - * interested) and once to child without name (if interested). - * The child watcher is expecting an event without a file name - * and without the FS_EVENT_ON_CHILD flag. - */ - mask &= ~FS_EVENT_ON_CHILD; + /* + * Some events can be sent on both parent dir and child marks (e.g. + * FS_ATTRIB). If both parent dir and child are watching, report the + * event once to parent dir with name (if interested) and once to child + * without name (if interested). + * + * In any case regardless whether the parent is watching or not, the + * child watcher is expecting an event without the FS_EVENT_ON_CHILD + * flag. The file name is expected if and only if this is a directory + * event. + */ + mask &= ~FS_EVENT_ON_CHILD; + if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) { dir = NULL; name = NULL; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 0794dcaf1e47..e0c48956608a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -732,7 +732,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, struct fsnotify_group *group; struct inode *inode; struct path path; - struct fd f; int ret; unsigned flags = 0; @@ -752,21 +751,17 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; - f = fdget(fd); - if (unlikely(!fd_file(f))) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { - ret = -EINVAL; - goto fput_and_out; - } + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) + return -EINVAL; /* verify that this is indeed an inotify instance */ - if (unlikely(fd_file(f)->f_op != &inotify_fops)) { - ret = -EINVAL; - goto fput_and_out; - } + if (unlikely(fd_file(f)->f_op != &inotify_fops)) + return -EINVAL; if (!(mask & IN_DONT_FOLLOW)) flags |= LOOKUP_FOLLOW; @@ -776,7 +771,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, ret = inotify_find_inode(pathname, &path, flags, (mask & IN_ALL_EVENTS)); if (ret) - goto fput_and_out; + return ret; /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; @@ -785,8 +780,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); -fput_and_out: - fdput(f); return ret; } @@ -794,33 +787,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; - struct fd f; - int ret = -EINVAL; + CLASS(fd, f)(fd); - f = fdget(fd); - if (unlikely(!fd_file(f))) + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an inotify instance */ if (unlikely(fd_file(f)->f_op != &inotify_fops)) - goto out; + return -EINVAL; group = fd_file(f)->private_data; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) - goto out; - - ret = 0; + return -EINVAL; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); - -out: - fdput(f); - return ret; + return 0; } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c45b222cf9c1..4981439e6209 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -138,8 +138,11 @@ static void 
fsnotify_get_sb_watched_objects(struct super_block *sb) static void fsnotify_put_sb_watched_objects(struct super_block *sb) { - if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb))) - wake_up_var(fsnotify_sb_watched_objects(sb)); + atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb); + + /* the superblock can go away after this decrement */ + if (atomic_long_dec_and_test(watched_objects)) + wake_up_var(watched_objects); } static void fsnotify_get_inode_ref(struct inode *inode) @@ -150,8 +153,11 @@ static void fsnotify_get_inode_ref(struct inode *inode) static void fsnotify_put_inode_ref(struct inode *inode) { - fsnotify_put_sb_watched_objects(inode->i_sb); + /* read ->i_sb before the inode can go away */ + struct super_block *sb = inode->i_sb; + iput(inode); + fsnotify_put_sb_watched_objects(sb); } /* diff --git a/fs/nsfs.c b/fs/nsfs.c index c675fc40ce2d..663f8656158d 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (usize < MNT_NS_INFO_SIZE_VER0) return -EINVAL; - if (previous) - mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); - else - mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous); if (IS_ERR(mnt_ns)) return PTR_ERR(mnt_ns); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 6ede3e924dec..8d789b017fa9 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -976,15 +976,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, goto out; /* Check for compressed frame. */ - err = attr_is_frame_compressed(ni, attr, vcn >> NTFS_LZNT_CUNIT, &hint); + err = attr_is_frame_compressed(ni, attr_b, vcn >> NTFS_LZNT_CUNIT, + &hint, run); if (err) goto out; if (hint) { /* if frame is compressed - don't touch it. */ *lcn = COMPRESSED_LCN; - *len = hint; - err = -EOPNOTSUPP; + /* length to the end of frame. */ + *len = NTFS_LZNT_CLUSTERS - (vcn & (NTFS_LZNT_CLUSTERS - 1)); + err = 0; goto out; } @@ -1027,16 +1029,16 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, /* Check if 'vcn' and 'vcn0' in different attribute segments. */ if (vcn < svcn || evcn1 <= vcn) { - /* Load attribute for truncated vcn. */ - attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, - &vcn, &mi); - if (!attr) { + struct ATTRIB *attr2; + /* Load runs for truncated vcn. */ + attr2 = ni_find_attr(ni, attr_b, &le_b, ATTR_DATA, NULL, + 0, &vcn, &mi); + if (!attr2) { err = -EINVAL; goto out; } - svcn = le64_to_cpu(attr->nres.svcn); - evcn1 = le64_to_cpu(attr->nres.evcn) + 1; - err = attr_load_runs(attr, ni, run, NULL); + evcn1 = le64_to_cpu(attr2->nres.evcn) + 1; + err = attr_load_runs(attr2, ni, run, NULL); if (err) goto out; } @@ -1517,15 +1519,18 @@ out: /* * attr_is_frame_compressed - Used to detect compressed frame. + * + * attr - base (primary) attribute segment. + * run - run to use, usually == &ni->file.run. 
+ * Only base segments contains valid 'attr->nres.c_unit' */ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, - CLST frame, CLST *clst_data) + CLST frame, CLST *clst_data, struct runs_tree *run) { int err; u32 clst_frame; CLST clen, lcn, vcn, alen, slen, vcn_next; size_t idx; - struct runs_tree *run; *clst_data = 0; @@ -1537,7 +1542,6 @@ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, clst_frame = 1u << attr->nres.c_unit; vcn = frame * clst_frame; - run = &ni->file.run; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), @@ -1673,7 +1677,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, if (err) goto out; - err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data); + err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data, run); if (err) goto out; @@ -2600,3 +2604,74 @@ int attr_force_nonresident(struct ntfs_inode *ni) return err; } + +/* + * Change the compression of data attribute + */ +int attr_set_compress(struct ntfs_inode *ni, bool compr) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) + return -ENOENT; + + if (is_attr_compressed(attr) == !!compr) { + /* Already required compressed state. */ + return 0; + } + + if (attr->non_res) { + u16 run_off; + u32 run_size; + char *run; + + if (attr->nres.data_size) { + /* + * There are rare cases when it possible to change + * compress state without big changes. + * TODO: Process these cases. + */ + return -EOPNOTSUPP; + } + + run_off = le16_to_cpu(attr->nres.run_off); + run_size = le32_to_cpu(attr->size) - run_off; + run = Add2Ptr(attr, run_off); + + if (!compr) { + /* remove field 'attr->nres.total_size'. */ + memmove(run - 8, run, run_size); + run_off -= 8; + } + + if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) { + /* + * Ignore rare case when there are no 8 bytes in record with attr. + * TODO: split attribute. + */ + return -EOPNOTSUPP; + } + + if (compr) { + /* Make a gap for 'attr->nres.total_size'. */ + memmove(run + 8, run, run_size); + run_off += 8; + attr->nres.total_size = attr->nres.alloc_size; + } + attr->nres.run_off = cpu_to_le16(run_off); + } + + /* Update data attribute flags. */ + if (compr) { + attr->flags |= ATTR_FLAG_COMPRESSED; + attr->nres.c_unit = NTFS_LZNT_CUNIT; + } else { + attr->flags &= ~ATTR_FLAG_COMPRESSED; + attr->nres.c_unit = 0; + } + mi->dirty = true; + + return 0; +} diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 9f4bd8d26090..a4d74bed74fa 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -382,59 +382,6 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) return true; } -/* - * al_delete_le - Delete first le from the list which matches its parameters. - */ -bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, - const __le16 *name, u8 name_len, const struct MFT_REF *ref) -{ - u16 size; - struct ATTR_LIST_ENTRY *le; - size_t off; - typeof(ni->attr_list) *al = &ni->attr_list; - - /* Scan forward to the first le that matches the input. 
*/ - le = al_find_ex(ni, NULL, type, name, name_len, &vcn); - if (!le) - return false; - - off = PtrOffset(al->le, le); - -next: - if (off >= al->size) - return false; - if (le->type != type) - return false; - if (le->name_len != name_len) - return false; - if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len, - ni->mi.sbi->upcase, true)) - return false; - if (le64_to_cpu(le->vcn) != vcn) - return false; - - /* - * The caller specified a segment reference, so we have to - * scan through the matching entries until we find that segment - * reference or we run of matching entries. - */ - if (ref && memcmp(ref, &le->ref, sizeof(*ref))) { - off += le16_to_cpu(le->size); - le = Add2Ptr(al->le, off); - goto next; - } - - /* Save on stack the size of 'le'. */ - size = le16_to_cpu(le->size); - /* Delete the le. */ - memmove(le, Add2Ptr(le, size), al->size - (off + size)); - - al->size -= size; - al->dirty = true; - - return true; -} - int al_update(struct ntfs_inode *ni, int sync) { int err; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index cf4fe21a5039..04107b950717 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -710,20 +710,17 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; - size_t bits0 = bits; u32 wbits = 8 * sb->s_blocksize; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbit = bit & (wbits - 1); struct buffer_head *bh; + u32 op; - while (iw < wnd->nwnd && bits) { - u32 tail, op; - + for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; - tail = wbits - wbit; - op = min_t(u32, tail, bits); + op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -736,20 +733,15 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) ntfs_bitmap_clear_le(bh->b_data, wbit, op); wnd->free_bits[iw] += op; + wnd->total_zeroes += op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); - wnd->total_zeroes += op; - bits -= op; - wbit = 0; - iw += 1; + wnd_add_free_ext(wnd, bit, op, false); } - - wnd_add_free_ext(wnd, bit, bits0, false); - return err; } @@ -760,20 +752,17 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; - size_t bits0 = bits; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); struct buffer_head *bh; + u32 op; - while (iw < wnd->nwnd && bits) { - u32 tail, op; - + for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; - tail = wbits - wbit; - op = min_t(u32, tail, bits); + op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -785,21 +774,16 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) ntfs_bitmap_set_le(bh->b_data, wbit, op); wnd->free_bits[iw] -= op; + wnd->total_zeroes -= op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); - wnd->total_zeroes -= op; - bits -= op; - wbit = 0; - iw += 1; + if (!RB_EMPTY_ROOT(&wnd->start_tree)) + wnd_remove_free_ext(wnd, bit, op); } - - if (!RB_EMPTY_ROOT(&wnd->start_tree)) - wnd_remove_free_ext(wnd, bit, bits0); - return err; } @@ -852,15 +836,13 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); + u32 op; - while 
(iw < wnd->nwnd && bits) { - u32 tail, op; - + for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; - tail = wbits - wbit; - op = min_t(u32, tail, bits); + op = min_t(u32, wbits - wbit, bits); if (wbits != wnd->free_bits[iw]) { bool ret; @@ -875,10 +857,6 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) if (!ret) return false; } - - bits -= op; - wbit = 0; - iw += 1; } return true; @@ -928,6 +906,7 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); + u32 op; size_t end; struct rb_node *n; struct e_node *e; @@ -945,14 +924,11 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) return false; use_wnd: - while (iw < wnd->nwnd && bits) { - u32 tail, op; - + for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; - tail = wbits - wbit; - op = min_t(u32, tail, bits); + op = min_t(u32, wbits - wbit, bits); if (wnd->free_bits[iw]) { bool ret; @@ -966,10 +942,6 @@ use_wnd: if (!ret) goto out; } - - bits -= op; - wbit = 0; - iw += 1; } ret = true; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 6202895a4542..3f96a11804c9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -82,13 +82,14 @@ int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); u32 flags = fa->flags; unsigned int new_fl = 0; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL)) + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL)) return -EOPNOTSUPP; if (flags & FS_IMMUTABLE_FL) @@ -97,6 +98,15 @@ int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, if (flags & FS_APPEND_FL) new_fl |= S_APPEND; + /* Allowed to change compression for empty files and for directories only. */ + if (!is_dedup(ni) && !is_encrypted(ni) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + /* Change compress state. */ + int err = ni_set_compress(inode, flags & FS_COMPR_FL); + if (err) + return err; + } + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); inode_set_ctime_current(inode); @@ -172,13 +182,15 @@ static int ntfs_extend_initialized_size(struct file *file, loff_t pos = valid; int err; + if (valid >= new_valid) + return 0; + if (is_resident(ni)) { ni->i_valid = new_valid; return 0; } WARN_ON(is_compressed(ni)); - WARN_ON(valid >= new_valid); for (;;) { u32 zerofrom, len; @@ -212,7 +224,7 @@ static int ntfs_extend_initialized_size(struct file *file, if (err) goto out; - folio_zero_range(folio, zerofrom, folio_size(folio)); + folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom); err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) @@ -407,6 +419,42 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = 0; } + if (file && is_sparsed(ni)) { + /* + * This code optimizes large writes to sparse file. + * TODO: merge this fragment with fallocate fragment. + */ + struct ntfs_sb_info *sbi = ni->mi.sbi; + CLST vcn = pos >> sbi->cluster_bits; + CLST cend = bytes_to_cluster(sbi, end); + CLST cend_v = bytes_to_cluster(sbi, ni->i_valid); + CLST lcn, clen; + bool new; + + if (cend_v > cend) + cend_v = cend; + + /* + * Allocate and zero new clusters. + * Zeroing these clusters may be too long. 
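The two loops that follow split the preallocation at the valid-data boundary: clusters below i_valid may become visible to readers and therefore must be zeroed, while clusters past i_valid only need to exist on disk. A rough standalone model of how the write range splits into those two cluster intervals (cluster math simplified; attr_data_get_block() itself is not reproduced here):

#include <stdint.h>
#include <stdio.h>

/* Round a byte count up to whole clusters, like the driver's bytes_to_cluster(). */
static uint64_t bytes_to_cluster(uint64_t bytes, unsigned int cluster_bits)
{
	return (bytes + (1ull << cluster_bits) - 1) >> cluster_bits;
}

int main(void)
{
	unsigned int cluster_bits = 12;		/* 4 KiB clusters */
	uint64_t pos = 1ull << 20;		/* write starts at 1 MiB */
	uint64_t end = 64ull << 20;		/* write ends at 64 MiB */
	uint64_t i_valid = 2ull << 20;		/* valid data ends at 2 MiB */

	uint64_t vcn = pos >> cluster_bits;
	uint64_t cend = bytes_to_cluster(end, cluster_bits);
	uint64_t cend_v = bytes_to_cluster(i_valid, cluster_bits);

	if (cend_v > cend)
		cend_v = cend;

	printf("zero-filled clusters  : [%llu, %llu)\n",
	       (unsigned long long)vcn, (unsigned long long)cend_v);
	printf("allocate-only clusters: [%llu, %llu)\n",
	       (unsigned long long)cend_v, (unsigned long long)cend);
	return 0;
}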
+ */ + for (; vcn < cend_v; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend_v - vcn, &lcn, + &clen, &new, true); + if (err) + goto out; + } + /* + * Allocate but not zero new clusters. + */ + for (; vcn < cend; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend - vcn, &lcn, + &clen, &new, false); + if (err) + goto out; + } + } + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); @@ -483,7 +531,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) } /* - * ntfs_fallocate + * ntfs_fallocate - file_operations::ntfs_fallocate * * Preallocate space for a file. This implements ntfs's fallocate file * operation, which gets called from sys_fallocate system call. User @@ -618,6 +666,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_lock(ni); err = attr_collapse_range(ni, vbo, len); ni_unlock(ni); + if (err) + goto out; } else if (mode & FALLOC_FL_INSERT_RANGE) { /* Check new size. */ err = inode_newsize_ok(inode, new_size); @@ -740,10 +790,10 @@ out: } /* - * ntfs3_setattr - inode_operations::setattr + * ntfs_setattr - inode_operations::setattr */ -int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr) +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); @@ -803,10 +853,12 @@ out: return err; } -static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +/* + * check_read_restriction: + * common code for ntfs_file_read_iter and ntfs_file_splice_read + */ +static int check_read_restriction(struct inode *inode) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) @@ -817,11 +869,6 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return -EOPNOTSUPP; } - if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { - ntfs_inode_warn(inode, "direct i/o + compressed not supported"); - return -EOPNOTSUPP; - } - #ifndef CONFIG_NTFS3_LZX_XPRESS if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { ntfs_inode_warn( @@ -836,37 +883,44 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return -EOPNOTSUPP; } - return generic_file_read_iter(iocb, iter); + return 0; } -static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +/* + * ntfs_file_read_iter - file_operations::read_iter + */ +static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = file_inode(in); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); + ssize_t err; - if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) - return -EIO; + err = check_read_restriction(inode); + if (err) + return err; - if (is_encrypted(ni)) { - ntfs_inode_warn(inode, "encrypted i/o not supported"); + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); return -EOPNOTSUPP; } -#ifndef CONFIG_NTFS3_LZX_XPRESS - if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { - ntfs_inode_warn( - inode, - "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files"); - return -EOPNOTSUPP; - } -#endif + return generic_file_read_iter(iocb, iter); +} - if (is_dedup(ni)) { - ntfs_inode_warn(inode, "read 
deduplicated not supported"); - return -EOPNOTSUPP; - } +/* + * ntfs_file_splice_read - file_operations::splice_read + */ +static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t err; + + err = check_read_restriction(inode); + if (err) + return err; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -935,6 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) u64 frame_vbo; pgoff_t index; bool frame_uptodate; + struct folio *folio; if (frame_size < PAGE_SIZE) { /* @@ -989,8 +1044,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (err) { for (ip = 0; ip < pages_per_frame; ip++) { page = pages[ip]; - unlock_page(page); - put_page(page); + folio = page_folio(page); + folio_unlock(folio); + folio_put(folio); } goto out; } @@ -1000,9 +1056,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) off = offset_in_page(valid); for (; ip < pages_per_frame; ip++, off = 0) { page = pages[ip]; + folio = page_folio(page); zero_user_segment(page, off, PAGE_SIZE); flush_dcache_page(page); - SetPageUptodate(page); + folio_mark_uptodate(folio); } ni_lock(ni); @@ -1011,9 +1068,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (ip = 0; ip < pages_per_frame; ip++) { page = pages[ip]; - SetPageUptodate(page); - unlock_page(page); - put_page(page); + folio = page_folio(page); + folio_mark_uptodate(folio); + folio_unlock(folio); + folio_put(folio); } if (err) @@ -1055,8 +1113,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (ip = 0; ip < pages_per_frame; ip++) { page = pages[ip]; - unlock_page(page); - put_page(page); + folio = page_folio(page); + folio_unlock(folio); + folio_put(folio); } goto out; } @@ -1097,9 +1156,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (ip = 0; ip < pages_per_frame; ip++) { page = pages[ip]; ClearPageDirty(page); - SetPageUptodate(page); - unlock_page(page); - put_page(page); + folio = page_folio(page); + folio_mark_uptodate(folio); + folio_unlock(folio); + folio_put(folio); } if (err) @@ -1134,14 +1194,11 @@ out: } /* - * ntfs_file_write_iter - file_operations::write_iter + * check_write_restriction: + * common code for ntfs_file_write_iter and ntfs_file_splice_write */ -static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static int check_write_restriction(struct inode *inode) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - ssize_t ret; - int err; struct ntfs_inode *ni = ntfs_i(inode); if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) @@ -1152,13 +1209,31 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EOPNOTSUPP; } - if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { - ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "write into deduplicated not supported"); return -EOPNOTSUPP; } - if (is_dedup(ni)) { - ntfs_inode_warn(inode, "write into deduplicated not supported"); + return 0; +} + +/* + * ntfs_file_write_iter - file_operations::write_iter + */ +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ntfs_inode *ni = ntfs_i(inode); + ssize_t ret; 
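The compressed-write path above now releases its page array through the folio API. As a small illustration of how the old page calls map onto their folio equivalents (a sketch, not the driver's code; assumes <linux/pagemap.h>):

/*
 * Release one frame's worth of pages, optionally marking them uptodate
 * first.  page_folio() resolves the folio backing each page, and the
 * folio_* calls stand in for SetPageUptodate()/unlock_page()/put_page().
 */
static void release_frame_pages(struct page **pages, unsigned int nr,
				bool mark_uptodate)
{
	unsigned int ip;

	for (ip = 0; ip < nr; ip++) {
		struct folio *folio = page_folio(pages[ip]);

		if (mark_uptodate)
			folio_mark_uptodate(folio);
		folio_unlock(folio);
		folio_put(folio);
	}
}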
+ int err; + + err = check_write_restriction(inode); + if (err) + return err; + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); return -EOPNOTSUPP; } @@ -1246,7 +1321,14 @@ static int ntfs_file_release(struct inode *inode, struct file *file) /* If we are last writer on the inode, drop the block reservation. */ if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && - atomic_read(&inode->i_writecount) == 1)) { + atomic_read(&inode->i_writecount) == 1) + /* + * The only file when inode->i_fop = &ntfs_file_operations and + * init_rwsem(&ni->file.run_lock) is not called explicitly is MFT. + * + * Add additional check here. + */ + && inode->i_ino != MFT_REC_MFT) { ni_lock(ni); down_write(&ni->file.run_lock); @@ -1282,10 +1364,27 @@ int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return err; } +/* + * ntfs_file_splice_write - file_operations::splice_write + */ +static ssize_t ntfs_file_splice_write(struct pipe_inode_info *pipe, + struct file *file, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t err; + struct inode *inode = file_inode(file); + + err = check_write_restriction(inode); + if (err) + return err; + + return iter_file_splice_write(pipe, file, ppos, len, flags); +} + // clang-format off const struct inode_operations ntfs_file_inode_operations = { .getattr = ntfs_getattr, - .setattr = ntfs3_setattr, + .setattr = ntfs_setattr, .listxattr = ntfs_listxattr, .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, @@ -1303,10 +1402,10 @@ const struct file_operations ntfs_file_operations = { .compat_ioctl = ntfs_compat_ioctl, #endif .splice_read = ntfs_file_splice_read, + .splice_write = ntfs_file_splice_write, .mmap = ntfs_file_mmap, .open = ntfs_file_open, .fsync = generic_file_fsync, - .splice_write = iter_file_splice_write, .fallocate = ntfs_fallocate, .release = ntfs_file_release, }; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index a469c608a394..8b39d0ce5f28 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -102,7 +102,9 @@ void ni_clear(struct ntfs_inode *ni) { struct rb_node *node; - if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec)) + if (!ni->vfs_inode.i_nlink && ni->mi.mrec && + is_rec_inuse(ni->mi.mrec) && + !(ni->mi.sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) ni_delete_all(ni); al_destroy(ni); @@ -1899,47 +1901,6 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, } /* - * fiemap_fill_next_extent_k - a copy of fiemap_fill_next_extent - * but it accepts kernel address for fi_extents_start - */ -static int fiemap_fill_next_extent_k(struct fiemap_extent_info *fieinfo, - u64 logical, u64 phys, u64 len, u32 flags) -{ - struct fiemap_extent extent; - struct fiemap_extent __user *dest = fieinfo->fi_extents_start; - - /* only count the extents */ - if (fieinfo->fi_extents_max == 0) { - fieinfo->fi_extents_mapped++; - return (flags & FIEMAP_EXTENT_LAST) ? 
1 : 0; - } - - if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) - return 1; - - if (flags & FIEMAP_EXTENT_DELALLOC) - flags |= FIEMAP_EXTENT_UNKNOWN; - if (flags & FIEMAP_EXTENT_DATA_ENCRYPTED) - flags |= FIEMAP_EXTENT_ENCODED; - if (flags & (FIEMAP_EXTENT_DATA_TAIL | FIEMAP_EXTENT_DATA_INLINE)) - flags |= FIEMAP_EXTENT_NOT_ALIGNED; - - memset(&extent, 0, sizeof(extent)); - extent.fe_logical = logical; - extent.fe_physical = phys; - extent.fe_length = len; - extent.fe_flags = flags; - - dest += fieinfo->fi_extents_mapped; - memcpy(dest, &extent, sizeof(extent)); - - fieinfo->fi_extents_mapped++; - if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) - return 1; - return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; -} - -/* * ni_fiemap - Helper for file_fiemap(). * * Assumed ni_lock. @@ -1949,12 +1910,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, __u64 vbo, __u64 len) { int err = 0; - struct fiemap_extent __user *fe_u = fieinfo->fi_extents_start; - struct fiemap_extent *fe_k = NULL; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; - struct runs_tree *run; - struct rw_semaphore *run_lock; + struct runs_tree run; struct ATTRIB *attr; CLST vcn = vbo >> cluster_bits; CLST lcn, clen; @@ -1965,13 +1923,11 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, u32 flags; bool ok; + run_init(&run); if (S_ISDIR(ni->vfs_inode.i_mode)) { - run = &ni->dir.alloc_run; attr = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, I30_NAME, ARRAY_SIZE(I30_NAME), NULL, NULL); - run_lock = &ni->dir.run_lock; } else { - run = &ni->file.run; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL); if (!attr) { @@ -1986,7 +1942,6 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, "fiemap is not supported for compressed file (cp -r)"); goto out; } - run_lock = &ni->file.run_lock; } if (!attr || !attr->non_res) { @@ -1998,52 +1953,32 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, goto out; } - /* - * To avoid lock problems replace pointer to user memory by pointer to kernel memory. 
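With a private runs_tree there is no shared lock held across the copy to user memory, so the reworked ni_fiemap() can call the standard helper directly. A hedged reminder of the contract it relies on (negative errno on fault, 1 when the walk should stop because the array is full or the last extent was emitted, 0 to continue):

/* Sketch only: emit one mapped range and report whether the walk may go on. */
static int emit_mapping(struct fiemap_extent_info *fieinfo, u64 vbo, u64 lbo,
			u64 bytes, bool last)
{
	int err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes,
					  last ? FIEMAP_EXTENT_LAST : 0);

	if (err < 0)
		return err;	/* copying to the user buffer failed */
	return err;		/* 1: stop the walk, 0: keep going */
}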
- */ - fe_k = kmalloc_array(fieinfo->fi_extents_max, - sizeof(struct fiemap_extent), - GFP_NOFS | __GFP_ZERO); - if (!fe_k) { - err = -ENOMEM; - goto out; - } - fieinfo->fi_extents_start = fe_k; - end = vbo + len; alloc_size = le64_to_cpu(attr->nres.alloc_size); if (end > alloc_size) end = alloc_size; - down_read(run_lock); - while (vbo < end) { if (idx == -1) { - ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + ok = run_lookup_entry(&run, vcn, &lcn, &clen, &idx); } else { CLST vcn_next = vcn; - ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && + ok = run_get_entry(&run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next; if (!ok) vcn = vcn_next; } if (!ok) { - up_read(run_lock); - down_write(run_lock); - err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), - attr->name_len, run, vcn); - - up_write(run_lock); - down_read(run_lock); + attr->name_len, &run, vcn); if (err) break; - ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + ok = run_lookup_entry(&run, vcn, &lcn, &clen, &idx); if (!ok) { err = -EINVAL; @@ -2068,8 +2003,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, } else if (is_attr_compressed(attr)) { CLST clst_data; - err = attr_is_frame_compressed( - ni, attr, vcn >> attr->nres.c_unit, &clst_data); + err = attr_is_frame_compressed(ni, attr, + vcn >> attr->nres.c_unit, + &clst_data, &run); if (err) break; if (clst_data < NTFS_LZNT_CLUSTERS) @@ -2098,8 +2034,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, if (vbo + dlen >= end) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent_k(fieinfo, vbo, lbo, dlen, - flags); + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, + flags); if (err < 0) break; @@ -2120,8 +2056,7 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, if (vbo + bytes >= end) flags |= FIEMAP_EXTENT_LAST; - err = fiemap_fill_next_extent_k(fieinfo, vbo, lbo, bytes, - flags); + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); if (err < 0) break; if (err == 1) { @@ -2132,21 +2067,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, vbo += bytes; } - up_read(run_lock); - - /* - * Copy to user memory out of lock - */ - if (copy_to_user(fe_u, fe_k, - fieinfo->fi_extents_max * - sizeof(struct fiemap_extent))) { - err = -EFAULT; - } - out: - /* Restore original pointer. */ - fieinfo->fi_extents_start = fe_u; - kfree(fe_k); + run_close(&run); return err; } @@ -2675,7 +2597,8 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, down_write(&ni->file.run_lock); run_truncate_around(run, le64_to_cpu(attr->nres.svcn)); frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT); - err = attr_is_frame_compressed(ni, attr, frame, &clst_data); + err = attr_is_frame_compressed(ni, attr, frame, &clst_data, + run); up_write(&ni->file.run_lock); if (err) goto out1; @@ -3455,3 +3378,75 @@ out: return 0; } + +/* + * ni_set_compress + * + * Helper for 'ntfs_fileattr_set'. + * Changes compression for empty files and directories only. + */ +int ni_set_compress(struct inode *inode, bool compr) +{ + int err; + struct ntfs_inode *ni = ntfs_i(inode); + struct ATTR_STD_INFO *std; + const char *bad_inode; + + if (is_compressed(ni) == !!compr) + return 0; + + if (is_sparsed(ni)) { + /* sparse and compress not compatible. */ + return -EOPNOTSUPP; + } + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) { + /*Skip other inodes. (symlink,fifo,...) 
*/ + return -EOPNOTSUPP; + } + + bad_inode = NULL; + + ni_lock(ni); + + std = ni_std(ni); + if (!std) { + bad_inode = "no std"; + goto out; + } + + if (S_ISREG(inode->i_mode)) { + err = attr_set_compress(ni, compr); + if (err) { + if (err == -ENOENT) { + /* Fix on the fly? */ + /* Each file must contain data attribute. */ + bad_inode = "no data attribute"; + } + goto out; + } + } + + ni->std_fa = std->fa; + if (compr) + std->fa |= FILE_ATTRIBUTE_COMPRESSED; + else + std->fa &= ~FILE_ATTRIBUTE_COMPRESSED; + + if (ni->std_fa != std->fa) { + ni->std_fa = std->fa; + ni->mi.dirty = true; + } + /* update duplicate information and directory entries in ni_write_inode.*/ + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + err = 0; + +out: + ni_unlock(ni); + if (bad_inode) { + ntfs_bad_inode(inode, bad_inode); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index c64dd114ac65..d0d530f4e2b9 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -609,14 +609,29 @@ static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head) *head = cpu_to_le16(index); } +/* + * Enumerate restart table. + * + * @t - table to enumerate. + * @c - current enumerated element. + * + * enumeration starts with @c == NULL + * returns next element or NULL + */ static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c) { __le32 *e; u32 bprt; - u16 rsize = t ? le16_to_cpu(t->size) : 0; + u16 rsize; + + if (!t) + return NULL; + + rsize = le16_to_cpu(t->size); if (!c) { - if (!t || !t->total) + /* start enumeration. */ + if (!t->total) return NULL; e = Add2Ptr(t, sizeof(struct RESTART_TABLE)); } else { diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 0fa636038b4e..03471bc9371c 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -2699,4 +2699,4 @@ unlock_out: out: __putname(uni); return err; -}
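ni_set_compress() above is what ultimately backs the FS_COMPR_FL bit that ntfs_fileattr_set() now accepts, so compression can be toggled from userspace with the ordinary inode-flag ioctls (the same path chattr uses). Per the comment added in ntfs_fileattr_set(), the state may only be changed for empty files and for directories. A rough usage example; the mount point and file name are made up:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	int flags;
	/* hypothetical path on an ntfs3 mount; the file must be empty */
	int fd = open("/mnt/ntfs/newfile", O_RDONLY | O_CREAT, 0644);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		return 1;

	flags |= FS_COMPR_FL;			/* request NTFS compression */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");	/* e.g. not empty, or sparse */

	close(fd);
	return 0;
}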
\ No newline at end of file +} diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index f672072e6bd4..be04d2845bb7 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -536,11 +536,15 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, if (inode->i_state & I_NEW) inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { - /* Inode overlaps? */ - _ntfs_bad_inode(inode); + /* + * Sequence number is not expected. + * Looks like inode was reused but caller uses the old reference + */ + iput(inode); + inode = ERR_PTR(-ESTALE); } - if (IS_ERR(inode) && name) + if (IS_ERR(inode)) ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR); return inode; @@ -605,7 +609,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bytes = ((u64)len << cluster_bits) - off; - if (lcn == SPARSE_LCN) { + if (lcn >= sbi->used.bitmap.nbits) { + /* This case includes resident/compressed/sparse. */ if (!create) { if (bh->b_size > bytes) bh->b_size = bytes; @@ -1672,7 +1677,10 @@ out6: attr = ni_find_attr(ni, NULL, NULL, ATTR_EA, NULL, 0, NULL, NULL); if (attr && attr->non_res) { /* Delete ATTR_EA, if non-resident. */ - attr_set_size(ni, ATTR_EA, NULL, 0, NULL, 0, NULL, false, NULL); + struct runs_tree run; + run_init(&run); + attr_set_size(ni, ATTR_EA, NULL, 0, &run, 0, NULL, false, NULL); + run_close(&run); } if (rp_inserted) @@ -2076,7 +2084,7 @@ static const char *ntfs_get_link(struct dentry *de, struct inode *inode, // clang-format off const struct inode_operations ntfs_link_inode_operations = { .get_link = ntfs_get_link, - .setattr = ntfs3_setattr, + .setattr = ntfs_setattr, .listxattr = ntfs_listxattr, }; diff --git a/fs/ntfs3/lib/lzx_decompress.c b/fs/ntfs3/lib/lzx_decompress.c index 6b16f07073c1..4d5701024f83 100644 --- a/fs/ntfs3/lib/lzx_decompress.c +++ b/fs/ntfs3/lib/lzx_decompress.c @@ -512,8 +512,7 @@ static int lzx_decompress_block(const struct lzx_decompressor *d, * the same code. (For R0, the swap is a no-op.) */ match_offset = recent_offsets[offset_slot]; - recent_offsets[offset_slot] = recent_offsets[0]; - recent_offsets[0] = match_offset; + swap(recent_offsets[offset_slot], recent_offsets[0]); } else { /* Explicit offset */ diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index 4aae598d6d88..fdc9b2ebf341 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -236,6 +236,9 @@ static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr, /* Do decompression until pointers are inside range. 
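The check added in decompress_chunk() just below refuses to emit more than one chunk's worth of output, so a corrupted compressed stream cannot run the write cursor past the destination buffer. A standalone sketch of that guard shape (CHUNK_SIZE is illustrative and stands in for the driver's LZNT_CHUNK_SIZE):

#include <stddef.h>
#include <string.h>

#define CHUNK_SIZE 0x1000	/* illustrative decompressed chunk size */

/* Copy n bytes into the output chunk, failing instead of overflowing it. */
static int emit_bytes(unsigned char *out, size_t *out_pos,
		      const unsigned char *src, size_t n)
{
	if (*out_pos > CHUNK_SIZE || n > CHUNK_SIZE - *out_pos)
		return -1;	/* corrupted stream */
	memcpy(out + *out_pos, src, n);
	*out_pos += n;
	return 0;
}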
*/ while (up < unc_end && cmpr < cmpr_end) { + // return err if more than LZNT_CHUNK_SIZE bytes are written + if (up - unc > LZNT_CHUNK_SIZE) + return -EINVAL; /* Correct index */ while (unc + s_max_off[index] < up) index += 1; diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index f16d318c4372..abf7e81584a9 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -81,7 +81,7 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, if (err < 0) inode = ERR_PTR(err); else { - ni_lock(ni); + ni_lock_dir(ni); inode = dir_search_u(dir, uni, NULL); ni_unlock(ni); } @@ -395,7 +395,7 @@ static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name) /* * Try slow way with current upcase table */ - uni = __getname(); + uni = kmem_cache_alloc(names_cachep, GFP_NOWAIT); if (!uni) return -ENOMEM; @@ -417,7 +417,7 @@ static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name) err = 0; out: - __putname(uni); + kmem_cache_free(names_cachep, uni); return err; } @@ -503,7 +503,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .rename = ntfs_rename, .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, - .setattr = ntfs3_setattr, + .setattr = ntfs_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, .fiemap = ntfs_fiemap, @@ -512,7 +512,7 @@ const struct inode_operations ntfs_dir_inode_operations = { }; const struct inode_operations ntfs_special_inode_operations = { - .setattr = ntfs3_setattr, + .setattr = ntfs_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, .get_acl = ntfs_get_acl, diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 584f814715f4..cd8e8374bb5a 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -334,7 +334,7 @@ struct mft_inode { /* Nested class for ntfs_inode::ni_lock. 
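The enum below is fed to mutex_lock_nested() so lockdep can tell the legitimate parent/child orderings of ni_lock apart; starting the list at 1 presumably keeps subclass 0 (what a plain mutex_lock() reports) from aliasing a named class. A generic sketch of the pattern, with invented names rather than the driver's actual lock sites:

#include <linux/mutex.h>

enum demo_lock_class {		/* illustrative names only */
	DEMO_MUTEX_PARENT = 1,
	DEMO_MUTEX_CHILD,
};

/* Take two related locks in a fixed order, telling lockdep which is which. */
static void lock_parent_then_child(struct mutex *parent, struct mutex *child)
{
	mutex_lock_nested(parent, DEMO_MUTEX_PARENT);
	mutex_lock_nested(child, DEMO_MUTEX_CHILD);
	/* ... protected work ... */
	mutex_unlock(child);
	mutex_unlock(parent);
}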
*/ enum ntfs_inode_mutex_lock_class { - NTFS_INODE_MUTEX_DIRTY, + NTFS_INODE_MUTEX_DIRTY = 1, NTFS_INODE_MUTEX_SECURITY, NTFS_INODE_MUTEX_OBJID, NTFS_INODE_MUTEX_REPARSE, @@ -446,13 +446,15 @@ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, u64 frame, u64 frames, u8 frame_bits, u32 *ondisk_size, u64 *vbo_data); int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, - CLST frame, CLST *clst_data); + CLST frame, CLST *clst_data, + struct runs_tree *run); int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid); int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); +int attr_set_compress(struct ntfs_inode *ni, bool compr); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); @@ -471,8 +473,6 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, struct ATTR_LIST_ENTRY **new_le); bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); -bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, - const __le16 *name, u8 name_len, const struct MFT_REF *ref); int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { @@ -502,8 +502,8 @@ int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); -int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr); +int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -588,6 +588,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool *is_bad); bool ni_is_dirty(struct inode *inode); +int ni_set_compress(struct inode *inode, bool compr); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 6c76503edc20..61d53d39f3b9 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -212,7 +212,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || - !IS_ALIGNED(off, 4)) { + !IS_ALIGNED(off, 8)) { return NULL; } @@ -223,29 +223,24 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) prev_type = 0; attr = Add2Ptr(rec, off); } else { - /* Check if input attr inside record. */ + /* + * We don't need to check previous attr here. There is + * a bounds checking in the previous round. + */ off = PtrOffset(rec, attr); - if (off >= used) - return NULL; asize = le32_to_cpu(attr->size); - if (asize < SIZEOF_RESIDENT) { - /* Impossible 'cause we should not return such attribute. */ - return NULL; - } - - /* Overflow check. */ - if (off + asize < off) - return NULL; prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } - asize = le32_to_cpu(attr->size); - - /* Can we use the first field (attr->type). 
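The record.c hunk here tightens mi_enum_attr() so every attribute field is proven to lie inside the record before it is read: offsets and sizes must be 8-byte aligned, and off + asize must neither overflow nor pass the used length. A simplified standalone model of that walk-and-validate pattern (the record layout is invented for the example):

#include <stdint.h>
#include <string.h>

struct rec_hdr {
	uint32_t type;
	uint32_t size;		/* total record size, 8-byte aligned */
};

/*
 * Return the record at *off and advance *off, or NULL if any bound or
 * alignment check fails.
 */
static const struct rec_hdr *next_record(const uint8_t *buf, uint32_t used,
					 uint32_t *off)
{
	struct rec_hdr hdr;
	uint32_t start = *off;

	/* The whole header must fit before hdr.size can be trusted. */
	if ((start & 7) || start + sizeof(hdr) > used)
		return NULL;
	memcpy(&hdr, buf + start, sizeof(hdr));

	/* Reject unaligned, undersized, overflowing or overrunning sizes. */
	if ((hdr.size & 7) || hdr.size < sizeof(hdr) ||
	    start + hdr.size < start || start + hdr.size > used)
		return NULL;

	*off = start + hdr.size;
	return (const struct rec_hdr *)(buf + start);
}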
*/ + /* + * Can we use the first fields: + * attr->type, + * attr->size + */ if (off + 8 > used) { static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); return NULL; @@ -265,10 +260,19 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (t32 < prev_type) return NULL; + asize = le32_to_cpu(attr->size); + + if (!IS_ALIGNED(asize, 8)) + return NULL; + /* Check overflow and boundary. */ if (off + asize < off || off + asize > used) return NULL; + /* Can we use the field attr->non_res. */ + if (off + 9 > used) + return NULL; + /* Check size of attribute. */ if (!attr->non_res) { /* Check resident fields. */ @@ -293,6 +297,10 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (attr->non_res != 1) return NULL; + /* Can we use memory including attr->nres.valid_size? */ + if (asize < SIZEOF_NONRESIDENT) + return NULL; + t16 = le16_to_cpu(attr->nres.run_off); if (t16 > asize) return NULL; @@ -319,7 +327,8 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (!attr->nres.svcn && is_attr_ext(attr)) { /* First segment of sparse/compressed attribute */ - if (asize + 8 < SIZEOF_NONRESIDENT_EX) + /* Can we use memory including attr->nres.total_size? */ + if (asize < SIZEOF_NONRESIDENT_EX) return NULL; tot_size = le64_to_cpu(attr->nres.total_size); @@ -329,10 +338,10 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (tot_size > alloc_size) return NULL; } else { - if (asize + 8 < SIZEOF_NONRESIDENT) + if (attr->nres.c_unit) return NULL; - if (attr->nres.c_unit) + if (alloc_size > mi->sbi->volume.size) return NULL; } diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index cb8cf0161177..6e86d66197ef 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -959,7 +959,7 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, * Large positive number requires to store 5 bytes * e.g.: 05 FF 7E FF FF 00 00 00 */ - if (size_size > 8) + if (size_size > sizeof(len)) return -EINVAL; len = run_unpack_s64(run_buf, size_size, 0); @@ -971,7 +971,7 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, if (!offset_size) lcn = SPARSE_LCN64; - else if (offset_size <= 8) { + else if (offset_size <= sizeof(s64)) { s64 dlcn; /* Initial value of dlcn is -1 or 0. */ @@ -984,8 +984,10 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, return -EINVAL; lcn = prev_lcn + dlcn; prev_lcn = lcn; - } else + } else { + /* The size of 'dlcn' can't be > 8. */ return -EINVAL; + } next_vcn = vcn64 + len; /* Check boundary. */ @@ -1053,8 +1055,8 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, { int ret, err; CLST next_vcn, lcn, len; - size_t index; - bool ok; + size_t index, done; + bool ok, zone; struct wnd_bitmap *wnd; ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size); @@ -1085,8 +1087,9 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, continue; down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + zone = max(wnd->zone_bit, lcn) < min(wnd->zone_end, lcn + len); /* Check for free blocks. */ - ok = wnd_is_used(wnd, lcn, len); + ok = !zone && wnd_is_used(wnd, lcn, len); up_read(&wnd->rw_lock); if (ok) continue; @@ -1094,14 +1097,33 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, /* Looks like volume is corrupted. */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); - if (down_write_trylock(&wnd->rw_lock)) { - /* Mark all zero bits as used in range [lcn, lcn+len). 
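Earlier in this hunk run_unpack() caps the on-disk field widths at sizeof(len) and sizeof(s64): the run length is an unsigned cluster count, while the LCN delta is a little-endian, sign-extended value of at most eight bytes. A standalone decoder for the signed case, for illustration only (run_unpack_s64() is the real implementation):

#include <stddef.h>
#include <stdint.h>

/* Decode an n-byte (1..8) little-endian, sign-extended integer. */
static int64_t unpack_s64(const uint8_t *p, size_t n)
{
	uint64_t v = 0;
	size_t i;

	for (i = 0; i < n; i++)
		v |= (uint64_t)p[i] << (8 * i);

	if (n && n < 8 && (p[n - 1] & 0x80))
		v |= ~0ull << (8 * n);		/* sign-extend the top bytes */

	return (int64_t)v;
}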
*/ - size_t done; - err = wnd_set_used_safe(wnd, lcn, len, &done); - up_write(&wnd->rw_lock); - if (err) - return err; + if (!down_write_trylock(&wnd->rw_lock)) + continue; + + if (zone) { + /* + * Range [lcn, lcn + len) intersects with zone. + * To avoid complex with zone just turn it off. + */ + wnd_zone_set(wnd, 0, 0); + } + + /* Mark all zero bits as used in range [lcn, lcn+len). */ + err = wnd_set_used_safe(wnd, lcn, len, &done); + if (zone) { + /* Restore zone. Lock mft run. */ + struct rw_semaphore *lock = + is_mounted(sbi) ? &sbi->mft.ni->file.run_lock : + NULL; + if (lock) + down_read(lock); + ntfs_refresh_zone(sbi); + if (lock) + up_read(lock); } + up_write(&wnd->rw_lock); + if (err) + return err; } return ret; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index a8758b85803f..6a0f6b0a3ab2 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -90,7 +90,7 @@ void ntfs_printk(const struct super_block *sb, const char *fmt, ...) level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); + printk("%c%cntfs3(%s): %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); va_end(args); } @@ -124,10 +124,15 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) struct dentry *de = d_find_alias(inode); if (de) { + int len; spin_lock(&de->d_lock); - snprintf(name, sizeof(s_name_buf), " \"%s\"", - de->d_name.name); + len = snprintf(name, sizeof(s_name_buf), " \"%s\"", + de->d_name.name); spin_unlock(&de->d_lock); + if (len <= 0) + name[0] = 0; + else if (len >= sizeof(s_name_buf)) + name[sizeof(s_name_buf) - 1] = 0; } else { name[0] = 0; } @@ -140,7 +145,7 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, + printk("%c%cntfs3(%s): ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, sb->s_id, inode->i_ino, name ? 
name : "", &vaf); va_end(args); @@ -259,23 +264,23 @@ enum Opt { // clang-format off static const struct fs_parameter_spec ntfs_fs_parameters[] = { - fsparam_uid("uid", Opt_uid), - fsparam_gid("gid", Opt_gid), - fsparam_u32oct("umask", Opt_umask), - fsparam_u32oct("dmask", Opt_dmask), - fsparam_u32oct("fmask", Opt_fmask), - fsparam_flag_no("sys_immutable", Opt_immutable), - fsparam_flag_no("discard", Opt_discard), - fsparam_flag_no("force", Opt_force), - fsparam_flag_no("sparse", Opt_sparse), - fsparam_flag_no("hidden", Opt_nohidden), - fsparam_flag_no("hide_dot_files", Opt_hide_dot_files), - fsparam_flag_no("windows_names", Opt_windows_names), - fsparam_flag_no("showmeta", Opt_showmeta), - fsparam_flag_no("acl", Opt_acl), - fsparam_string("iocharset", Opt_iocharset), - fsparam_flag_no("prealloc", Opt_prealloc), - fsparam_flag_no("case", Opt_nocase), + fsparam_uid("uid", Opt_uid), + fsparam_gid("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag("sys_immutable", Opt_immutable), + fsparam_flag("discard", Opt_discard), + fsparam_flag("force", Opt_force), + fsparam_flag("sparse", Opt_sparse), + fsparam_flag("nohidden", Opt_nohidden), + fsparam_flag("hide_dot_files", Opt_hide_dot_files), + fsparam_flag("windows_names", Opt_windows_names), + fsparam_flag("showmeta", Opt_showmeta), + fsparam_flag("acl", Opt_acl), + fsparam_string("iocharset", Opt_iocharset), + fsparam_flag("prealloc", Opt_prealloc), + fsparam_flag("nocase", Opt_nocase), {} }; // clang-format on @@ -345,28 +350,28 @@ static int ntfs_fs_parse_param(struct fs_context *fc, opts->fmask = 1; break; case Opt_immutable: - opts->sys_immutable = result.negated ? 0 : 1; + opts->sys_immutable = 1; break; case Opt_discard: - opts->discard = result.negated ? 0 : 1; + opts->discard = 1; break; case Opt_force: - opts->force = result.negated ? 0 : 1; + opts->force = 1; break; case Opt_sparse: - opts->sparse = result.negated ? 0 : 1; + opts->sparse = 1; break; case Opt_nohidden: - opts->nohidden = result.negated ? 1 : 0; + opts->nohidden = 1; break; case Opt_hide_dot_files: - opts->hide_dot_files = result.negated ? 0 : 1; + opts->hide_dot_files = 1; break; case Opt_windows_names: - opts->windows_names = result.negated ? 0 : 1; + opts->windows_names = 1; break; case Opt_showmeta: - opts->showmeta = result.negated ? 0 : 1; + opts->showmeta = 1; break; case Opt_acl: if (!result.negated) @@ -385,10 +390,10 @@ static int ntfs_fs_parse_param(struct fs_context *fc, param->string = NULL; break; case Opt_prealloc: - opts->prealloc = result.negated ? 0 : 1; + opts->prealloc = 1; break; case Opt_nocase: - opts->nocase = result.negated ? 1 : 0; + opts->nocase = 1; break; default: /* Should not be here unless we forget add case. */ @@ -1491,11 +1496,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef __BIG_ENDIAN { - const __le16 *src = sbi->upcase; u16 *dst = sbi->upcase; for (i = 0; i < 0x10000; i++) - *dst++ = le16_to_cpu(*src++); + __swab16s(dst++); } #endif diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 0703e1ae32b2..e0055dcf8fe3 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -705,7 +705,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, #endif /* - * ntfs_acl_chmod - Helper for ntfs3_setattr(). + * ntfs_acl_chmod - Helper for ntfs_setattr(). 
*/ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ea9127ba3208..395e23920632 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4767,7 +4767,7 @@ bail: } /* - * Allcate and add clusters into the extent b-tree. + * Allocate and add clusters into the extent b-tree. * The new clusters(clusters_to_add) will be inserted at logical_offset. * The extent b-tree's root is specified by et, and * it is not limited to the file storage. Any extent tree can use this diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 45db1781ea73..1d1b4b7edba0 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -70,6 +70,8 @@ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_NUM_LOCKS }; +#define ocfs2_iocb_init_rw_locked(iocb) \ + (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4b9f45d7049e..4200a0341343 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1765,42 +1765,41 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, long fd; int sectsize; char *p = (char *)page; - struct fd f; ssize_t ret = -EINVAL; int live_threshold; if (reg->hr_bdev_file) - goto out; + return -EINVAL; /* We can't heartbeat without having had our node number * configured yet. */ if (o2nm_this_node() == O2NM_MAX_NODES) - goto out; + return -EINVAL; fd = simple_strtol(p, &p, 0); if (!p || (*p && (*p != '\n'))) - goto out; + return -EINVAL; if (fd < 0 || fd >= INT_MAX) - goto out; + return -EINVAL; - f = fdget(fd); - if (fd_file(f) == NULL) - goto out; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EINVAL; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out2; + return -EINVAL; if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode)) - goto out2; + return -EINVAL; reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev, BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); if (IS_ERR(reg->hr_bdev_file)) { ret = PTR_ERR(reg->hr_bdev_file); reg->hr_bdev_file = NULL; - goto out2; + return ret; } sectsize = bdev_logical_block_size(reg_bdev(reg)); @@ -1906,9 +1905,6 @@ out3: fput(reg->hr_bdev_file); reg->hr_bdev_file = NULL; } -out2: - fdput(f); -out: return ret; } diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 15d0ed9c13e5..8bf17231d7b7 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -60,7 +60,7 @@ static void o2quo_fence_self(void) switch (o2nm_single_cluster->cl_fence_method) { case O2NM_FENCE_PANIC: panic("*** ocfs2 is very sorry to be fencing this system by " - "panicing ***\n"); + "panicking ***\n"); break; default: WARN_ON(o2nm_single_cluster->cl_fence_method >= diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 213206ebdd58..7799f4d16ce9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1065,26 +1065,39 @@ int ocfs2_find_entry(const char *name, int namelen, { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; + int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); + if (unlikely(i_size_read(dir) <= 0)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. 
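With the added size checks ocfs2_find_entry() can now report corruption instead of pretending a name is absent, so callers must distinguish "found" (0), "absent" (-ENOENT) and genuine errors such as -EFSCORRUPTED; the ocfs2_check_dir_for_entry() hunk further down does exactly that mapping. A sketch of the caller-side contract, mirroring the diff's own adjustment and using the same lookup-result type:

/* Illustrative wrapper only. */
static int dir_has_entry(struct inode *dir, const char *name, int namelen,
			 struct ocfs2_dir_lookup_result *lookup)
{
	int ret = ocfs2_find_entry(name, namelen, dir, lookup);

	if (ret == 0)
		return -EEXIST;		/* the name already exists */
	if (ret == -ENOENT)
		return 0;		/* safe to create */
	return ret;			/* -EFSCORRUPTED or another failure */
}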
*/ - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); - else + } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; - return 0; +out: + return ret; } /* @@ -2010,6 +2023,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name + * Return -EFSCORRUPTED if found corruption * * Callers should have i_rwsem + a cluster lock on dir */ @@ -2023,9 +2037,12 @@ int ocfs2_check_dir_for_entry(struct inode *dir, trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = ocfs2_find_entry(name, namelen, dir, &lookup); + if (ret == 0) { ret = -EEXIST; mlog_errno(ret); + } else if (ret == -ENOENT) { + ret = 0; } ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index bae60ca2672a..847a52dcbe7d 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -62,8 +62,6 @@ enum dlm_status { DLM_MAXSTATS, /* 41: upper limit for return code validation */ }; -/* for pretty-printing dlm_status error messages */ -const char *dlm_errmsg(enum dlm_status err); /* for pretty-printing dlm_status error names */ const char *dlm_errname(enum dlm_status err); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index be5e9ed7da8d..e9ef4e2b0e75 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -164,59 +164,6 @@ static const char *dlm_errnames[] = { [DLM_MAXSTATS] = "DLM_MAXSTATS", }; -static const char *dlm_errmsgs[] = { - [DLM_NORMAL] = "request in progress", - [DLM_GRANTED] = "request granted", - [DLM_DENIED] = "request denied", - [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", - [DLM_WORKING] = "async request in progress", - [DLM_BLOCKED] = "lock request blocked", - [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", - [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", - [DLM_SYSERR] = "system error", - [DLM_NOSUPPORT] = "unsupported", - [DLM_CANCELGRANT] = "can't cancel convert: already granted", - [DLM_IVLOCKID] = "bad lockid", - [DLM_SYNC] = "synchronous request granted", - [DLM_BADTYPE] = "bad resource type", - [DLM_BADRESOURCE] = "bad resource handle", - [DLM_MAXHANDLES] = "no more resource handles", - [DLM_NOCLINFO] = "can't contact cluster manager", - [DLM_NOLOCKMGR] = "can't contact lock manager", - [DLM_NOPURGED] = "can't contact purge daemon", - [DLM_BADARGS] = "bad api args", - [DLM_VOID] = "no status", - [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", - [DLM_IVBUFLEN] = "invalid resource name length", - [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", - [DLM_BADPARAM] = "invalid lock mode specified", - [DLM_VALNOTVALID] = "value block has been invalidated", - [DLM_REJECTED] = "request rejected, unrecognized client", - [DLM_ABORT] = "blocked lock request cancelled", - [DLM_CANCEL] = "conversion request cancelled", - [DLM_IVRESHANDLE] = "invalid resource handle", - [DLM_DEADLOCK] = "deadlock recovery refused this request", - [DLM_DENIED_NOASTS] = "failed to allocate AST", - [DLM_FORWARD] = "request must wait for primary's 
response", - [DLM_TIMEOUT] = "timeout value for lock has expired", - [DLM_IVGROUPID] = "invalid group specification", - [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", - [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", - [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", - [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", - [DLM_RECOVERING] = "lock resource being recovered", - [DLM_MIGRATING] = "lock resource being migrated", - [DLM_MAXSTATS] = "invalid error number", -}; - -const char *dlm_errmsg(enum dlm_status err) -{ - if (err >= DLM_MAXSTATS || err < 0) - return dlm_errmsgs[DLM_MAXSTATS]; - return dlm_errmsgs[err]; -} -EXPORT_SYMBOL_GPL(dlm_errmsg); - const char *dlm_errname(enum dlm_status err) { if (err >= DLM_MAXSTATS || err < 0) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 60df52e4c1f8..764ecbd5ad41 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3110,6 +3110,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) struct ocfs2_lock_res *iter = v; struct ocfs2_lock_res *dummy = &priv->p_iter_res; + (*pos)++; spin_lock(&ocfs2_dlm_tracking_lock); iter = ocfs2_dlm_next_res(iter, priv); list_del_init(&dummy->l_debug_list); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 96b684763b39..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ad131a2fc58e..957ced628eb1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1129,9 +1129,12 @@ int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_ocfs2_setattr(inode, dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, dentry->d_name.len, dentry->d_name.name, - attr->ia_valid, attr->ia_mode, - from_kuid(&init_user_ns, attr->ia_uid), - from_kgid(&init_user_ns, attr->ia_gid)); + attr->ia_valid, + attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0, + attr->ia_valid & ATTR_UID ? + from_kuid(&init_user_ns, attr->ia_uid) : 0, + attr->ia_valid & ATTR_GID ? + from_kgid(&init_user_ns, attr->ia_gid) : 0); /* ensuring we don't even attempt to truncate a symlink */ if (S_ISLNK(inode->i_mode)) @@ -1784,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { @@ -2387,6 +2398,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } else inode_lock(inode); + ocfs2_iocb_init_rw_locked(iocb); + /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2533,6 +2546,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, if (!direct_io && nowait) return -EOPNOTSUPP; + ocfs2_iocb_init_rw_locked(iocb); + /* * buffered reads protect themselves in ->read_folio(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. 
@@ -2801,6 +2816,7 @@ const struct file_operations ocfs2_fops = { .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; WRAP_DIR_ITER(ocfs2_readdir) // FIXME! @@ -2817,6 +2833,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 8ac42ea81a17..d1aa04a5af1b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -971,9 +971,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < - left) { - if (bit_off == start) { + while (1) { + bit_off = ocfs2_find_next_zero_bit(bitmap, left, start); + if ((bit_off < left) && (bit_off == start)) { count++; start++; continue; @@ -998,29 +998,12 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, } } + if (bit_off >= left) + break; count = 1; start = bit_off + 1; } - /* clear the contiguous bits until the end boundary */ - if (count) { - blkno = la_start_blk + - ocfs2_clusters_to_blocks(osb->sb, - start - count); - - trace_ocfs2_sync_local_to_main_free( - count, start - count, - (unsigned long long)la_start_blk, - (unsigned long long)blkno); - - status = ocfs2_release_clusters(handle, - main_bm_inode, - main_bm_bh, blkno, - count); - if (status < 0) - mlog_errno(status); - } - bail: if (status) mlog_errno(status); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 59c92353151a..5550f8afa438 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -200,8 +200,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); status = dquot_initialize(inode); - if (status) + if (status) { + iput(inode); return ERR_PTR(status); + } return inode; } diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index ebb5c99f490e..788a8de922a4 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -97,7 +97,6 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); int ocfs2_global_read_info(struct super_block *sb, int type); int ocfs2_global_write_info(struct super_block *sb, int type); -int ocfs2_global_read_dquot(struct dquot *dquot); int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); static inline int ocfs2_sync_dquot(struct dquot *dquot) { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2b0daced98eb..3404e7a30c33 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -893,7 +893,7 @@ static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid) int status = 0; trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type); - if (!sb_has_quota_loaded(sb, type)) { + if (!sb_has_quota_active(sb, type)) { status = -ESRCH; goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 73d3367c533b..2956d888c131 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -867,6 +867,7 @@ out: brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); + info->dqi_priv = NULL; return status; } diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index c4a4016d3866..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -574,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + 
ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3d404624bb96..c79b4291777f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd0a05365e79..73a6f6fd8a8e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } diff --git a/fs/open.c b/fs/open.c index acaeb3e25c88..ffcfef67ac86 100644 --- a/fs/open.c +++ b/fs/open.c @@ -187,19 +187,13 @@ long do_ftruncate(struct file *file, loff_t length, int small) long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { - struct fd f; - int error; - if (length < 0) return -EINVAL; - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - error = do_ftruncate(fd_file(f), length, small); - - fdput(f); - return error; + return do_ftruncate(fd_file(f), length, small); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) @@ -349,14 +343,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fallocate(fd_file(f), mode, offset, len); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) @@ -410,7 +402,6 @@ static bool access_need_override_creds(int flags) static const struct cred *access_override_creds(void) { - const struct cred *old_cred; struct cred *override_cred; override_cred = prepare_creds(); @@ -455,13 +446,7 @@ static const struct cred *access_override_creds(void) * freeing. 
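The ocfs2_verify_volume() change above validates s_blocksize_bits itself (only 9 through 12 are accepted) before it is ever used as a shift amount, so a garbage on-disk value can no longer feed an oversized shift. The shape of that test as a standalone sketch:

#include <stdbool.h>
#include <stdint.h>

/* Accept only 512, 1024, 2048 or 4096 byte blocks, and require consistency
 * with the block size the superblock was actually read with. */
static bool blocksize_bits_valid(uint32_t bits, uint32_t blksz)
{
	if (bits < 9 || bits > 12)
		return false;
	return (1u << bits) == blksz;
}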
*/ override_cred->non_rcu = 1; - - old_cred = override_creds(override_cred); - - /* override_cred() gets its own ref */ - put_cred(override_cred); - - return old_cred; + return override_creds(override_cred); } static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) @@ -531,7 +516,7 @@ out_path_release: } out: if (old_cred) - revert_creds(old_cred); + put_cred(revert_creds(old_cred)); return res; } @@ -580,23 +565,18 @@ out: SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct fd f = fdget_raw(fd); + CLASS(fd_raw, f)(fd); int error; - error = -EBADF; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; - error = -ENOTDIR; if (!d_can_lookup(fd_file(f)->f_path.dentry)) - goto out_putf; + return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); -out_putf: - fdput(f); -out: return error; } @@ -671,14 +651,12 @@ int vfs_fchmod(struct file *file, umode_t mode) SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct fd f = fdget(fd); - int err = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - err = vfs_fchmod(fd_file(f), mode); - fdput(f); - } - return err; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, @@ -865,14 +843,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group) int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fchown(fd_file(f), user, group); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) @@ -946,6 +922,10 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; + error = fsnotify_open_perm(f); + if (error) + goto cleanup_all; + error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; @@ -1457,6 +1437,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) @@ -1574,23 +1556,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -/** - * sys_close_range() - Close all file descriptors in a given range. - * - * @fd: starting file descriptor to close - * @max_fd: last file descriptor to close - * @flags: reserved for future extensions - * - * This closes a range of file descriptors. All file descriptors - * from @fd up to and including @max_fd are closed. - * Currently, errors to close a given file descriptor are ignored. - */ -SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, - unsigned int, flags) -{ - return __close_range(fd, max_fd, flags); -} - /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. 
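The open.c conversions above replace fdget()/fdput() pairs with CLASS(fd, f)(fd), which drops the file reference automatically when f goes out of scope; that is what makes the bare early returns safe. A userspace sketch of the same scope-based cleanup idea using the compiler's cleanup attribute (GCC/Clang); the AUTO_FD macro name is invented for the example:

#include <fcntl.h>
#include <unistd.h>

static void close_fd(int *fd)
{
	if (*fd >= 0)
		close(*fd);
}

#define AUTO_FD __attribute__((cleanup(close_fd))) int

int read_first_byte(const char *path)
{
	AUTO_FD fd = open(path, O_RDONLY);
	char c;

	if (fd < 0)
		return -1;		/* nothing to clean up */
	if (read(fd, &c, 1) != 1)
		return -1;		/* fd closed automatically here */
	return (unsigned char)c;	/* ...and here */
}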
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2ed6ad641a20..0c28e5fa3407 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,7 +16,6 @@ #include <linux/sched/signal.h> #include <linux/cred.h> #include <linux/namei.h> -#include <linux/fdtable.h> #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" @@ -416,13 +415,13 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; int buflen = MAX_HANDLE_SZ; - uuid_t *uuid = &real->d_sb->s_uuid; + uuid_t *uuid = &realinode->i_sb->s_uuid; int err; /* Make sure the real fid stays 32bit aligned */ @@ -439,13 +438,13 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); + fh_type = exportfs_encode_inode_fh(realinode, (void *)fh->fb.fid, + &dwords, NULL, 0); buflen = (dwords << 2); err = -EIO; - if (WARN_ON(fh_type < 0) || - WARN_ON(buflen > MAX_HANDLE_SZ) || - WARN_ON(fh_type == FILEID_INVALID)) + if (fh_type < 0 || fh_type == FILEID_INVALID || + WARN_ON(buflen > MAX_HANDLE_SZ)) goto out_err; fh->fb.version = OVL_FH_VERSION; @@ -481,7 +480,7 @@ struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) if (!ovl_can_decode_fh(origin->d_sb)) return NULL; - return ovl_encode_real_fh(ofs, origin, false); + return ovl_encode_real_fh(ofs, d_inode(origin), false); } int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, @@ -506,7 +505,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, upper, true); + fh = ovl_encode_real_fh(ofs, d_inode(upper), true); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -1260,7 +1259,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) dput(parent); dput(next); } - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ab65e98a1def..c9993ff66fc2 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -553,15 +553,17 @@ out_cleanup: goto out_dput; } -static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode, - umode_t mode, const struct cred *old_cred) +static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry, + struct inode *inode, + umode_t mode, + const struct cred *old_cred) { int err; struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return ERR_PTR(-ENOMEM); override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; @@ -569,19 +571,26 @@ static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode, old_cred, override_cred); if (err) { put_cred(override_cred); - return err; + return ERR_PTR(err); } - put_cred(override_creds(override_cred)); - put_cred(override_cred); - return 0; + /* + * Caller is going to match this with revert_creds() and drop + * reference on the returned creds. + * We must be called with creator creds already, otherwise we risk + * leaking creds. 
+ */ + old_cred = override_creds(override_cred); + WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb)); + + return override_cred; } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr, bool origin) { int err; - const struct cred *old_cred; + const struct cred *old_cred, *new_cred = NULL; struct dentry *parent = dentry->d_parent; old_cred = ovl_override_creds(dentry->d_sb); @@ -610,9 +619,13 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, * create a new inode, so just use the ovl mounter's * fs{u,g}id. */ - err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred); - if (err) + new_cred = ovl_setup_cred_for_create(dentry, inode, attr->mode, + old_cred); + err = PTR_ERR(new_cred); + if (IS_ERR(new_cred)) { + new_cred = NULL; goto out_revert_creds; + } } if (!ovl_dentry_is_whiteout(dentry)) @@ -621,7 +634,8 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, err = ovl_create_over_whiteout(dentry, inode, attr); out_revert_creds: - revert_creds(old_cred); + ovl_revert_creds(old_cred); + put_cred(new_cred); return err; } @@ -702,7 +716,7 @@ static int ovl_set_link_redirect(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); err = ovl_set_redirect(dentry, false); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -912,7 +926,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) err = ovl_remove_upper(dentry, is_dir, &list); else err = ovl_remove_and_whiteout(dentry, &list); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (!err) { if (is_dir) clear_nlink(dentry->d_inode); @@ -1292,7 +1306,7 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (update_nlink) ovl_nlink_end(new); else @@ -1306,18 +1320,22 @@ out: static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, struct inode *inode, umode_t mode) { - const struct cred *old_cred; + const struct cred *old_cred, *new_cred = NULL; struct path realparentpath; struct file *realfile; + struct ovl_file *of; struct dentry *newdentry; /* It's okay to set O_NOATIME, since the owner will be current fsuid */ int flags = file->f_flags | OVL_OPEN_FLAGS; int err; old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); - if (err) + new_cred = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); + err = PTR_ERR(new_cred); + if (IS_ERR(new_cred)) { + new_cred = NULL; goto out_revert_creds; + } ovl_path_upper(dentry->d_parent, &realparentpath); realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, @@ -1327,17 +1345,25 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, if (err) goto out_revert_creds; + of = ovl_file_alloc(realfile); + if (!of) { + fput(realfile); + err = -ENOMEM; + goto out_revert_creds; + } + /* ovl_instantiate() consumes the newdentry reference on success */ newdentry = dget(realfile->f_path.dentry); err = ovl_instantiate(dentry, inode, newdentry, false, file); if (!err) { - file->private_data = realfile; + file->private_data = of; } else { dput(newdentry); - fput(realfile); + ovl_file_free(of); } out_revert_creds: - revert_creds(old_cred); + ovl_revert_creds(old_cred); + put_cred(new_cred); return err; } @@ -1389,7 +1415,7 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir, put_realfile: /* Without FMODE_OPENED ->release() won't be called on @file */ if 
(!(file->f_mode & FMODE_OPENED)) - fput(file->private_data); + ovl_file_free(file->private_data); put_inode: iput(inode); drop_write: diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 5868cb222955..444aeeccb6da 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -176,35 +176,37 @@ static int ovl_connect_layer(struct dentry *dentry) * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ -static int ovl_check_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct inode *inode) { - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; + struct dentry *dentry; + int err; /* No upper layer? */ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ - if (!ovl_dentry_upper(dentry) && !decodable) + if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ - if (!ovl_dentry_lower(dentry)) + if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (dentry == dentry->d_sb->s_root) + if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ - if (ovl_dentry_upper(dentry) && decodable && - !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + if (ovl_inode_upper(inode) && decodable && + !ovl_test_flag(OVL_INDEX, inode)) return 0; /* @@ -213,14 +215,23 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && decodable) - return ovl_connect_layer(dentry); + if (!decodable || !S_ISDIR(inode->i_mode)) + return 1; + + dentry = d_find_any_alias(inode); + if (!dentry) + return -ENOENT; + + err = ovl_connect_layer(dentry); + dput(dentry); + if (err < 0) + return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } -static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; @@ -231,13 +242,13 @@ static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. */ - err = enc_lower = ovl_check_encode_origin(dentry); + err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(ofs, enc_lower ? 
ovl_inode_lower(inode) : + ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -251,8 +262,8 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i)\n", - dentry, err); + pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + inode->i_ino, err); goto out; } @@ -260,19 +271,13 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); - struct dentry *dentry; int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; - dentry = d_find_any_alias(inode); - if (!dentry) - return FILEID_INVALID; - - bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); - dput(dentry); + bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 4504493b20be..969b458100fe 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -51,7 +51,7 @@ static struct file *ovl_open_realfile(const struct file *file, realfile = backing_file_open(&file->f_path, flags, realpath, current_cred()); } - revert_creds(old_cred); + ovl_revert_creds(old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, @@ -89,56 +89,110 @@ static int ovl_change_flags(struct file *file, unsigned int flags) return 0; } -static int ovl_real_fdget_meta(const struct file *file, struct fd *real, - bool allow_meta) +struct ovl_file { + struct file *realfile; + struct file *upperfile; +}; + +struct ovl_file *ovl_file_alloc(struct file *realfile) { - struct dentry *dentry = file_dentry(file); - struct file *realfile = file->private_data; - struct path realpath; - int err; + struct ovl_file *of = kzalloc(sizeof(struct ovl_file), GFP_KERNEL); - real->word = (unsigned long)realfile; + if (unlikely(!of)) + return NULL; - if (allow_meta) { - ovl_path_real(dentry, &realpath); - } else { - /* lazy lookup and verify of lowerdata */ - err = ovl_verify_lowerdata(dentry); - if (err) - return err; + of->realfile = realfile; + return of; +} - ovl_path_realdata(dentry, &realpath); - } - if (!realpath.dentry) - return -EIO; +void ovl_file_free(struct ovl_file *of) +{ + fput(of->realfile); + if (of->upperfile) + fput(of->upperfile); + kfree(of); +} - /* Has it been copied up since we'd opened it? */ - if (unlikely(file_inode(realfile) != d_inode(realpath.dentry))) { - struct file *f = ovl_open_realfile(file, &realpath); - if (IS_ERR(f)) - return PTR_ERR(f); - real->word = (unsigned long)f | FDPUT_FPUT; - return 0; +static bool ovl_is_real_file(const struct file *realfile, + const struct path *realpath) +{ + return file_inode(realfile) == d_inode(realpath->dentry); +} + +static struct file *ovl_real_file_path(const struct file *file, + struct path *realpath) +{ + struct ovl_file *of = file->private_data; + struct file *realfile = of->realfile; + + if (WARN_ON_ONCE(!realpath->dentry)) + return ERR_PTR(-EIO); + + /* + * If the realfile that we want is not where the data used to be at + * open time, either we'd been copied up, or it's an fsync of a + * metacopied file. We need the upperfile either way, so see if it + * is already opened and if it is not then open and store it. 
+ */ + if (unlikely(!ovl_is_real_file(realfile, realpath))) { + struct file *upperfile = READ_ONCE(of->upperfile); + struct file *old; + + if (!upperfile) { /* Nobody opened upperfile yet */ + upperfile = ovl_open_realfile(file, realpath); + if (IS_ERR(upperfile)) + return upperfile; + + /* Store the upperfile for later */ + old = cmpxchg_release(&of->upperfile, NULL, upperfile); + if (old) { /* Someone opened upperfile before us */ + fput(upperfile); + upperfile = old; + } + } + /* + * Stored file must be from the right inode, unless someone's + * been corrupting the upper layer. + */ + if (WARN_ON_ONCE(!ovl_is_real_file(upperfile, realpath))) + return ERR_PTR(-EIO); + + realfile = upperfile; } /* Did the flags change since open? */ - if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS)) - return ovl_change_flags(realfile, file->f_flags); + if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS)) { + int err = ovl_change_flags(realfile, file->f_flags); - return 0; + if (err) + return ERR_PTR(err); + } + + return realfile; } -static int ovl_real_fdget(const struct file *file, struct fd *real) +static struct file *ovl_real_file(const struct file *file) { - if (d_is_dir(file_dentry(file))) { + struct dentry *dentry = file_dentry(file); + struct path realpath; + int err; + + if (d_is_dir(dentry)) { struct file *f = ovl_dir_real_file(file, false); - if (IS_ERR(f)) - return PTR_ERR(f); - real->word = (unsigned long)f; - return 0; + + if (WARN_ON_ONCE(!f)) + return ERR_PTR(-EIO); + return f; } - return ovl_real_fdget_meta(file, real, false); + /* lazy lookup and verify of lowerdata */ + err = ovl_verify_lowerdata(dentry); + if (err) + return ERR_PTR(err); + + ovl_path_realdata(dentry, &realpath); + + return ovl_real_file_path(file, &realpath); } static int ovl_open(struct inode *inode, struct file *file) @@ -146,6 +200,7 @@ static int ovl_open(struct inode *inode, struct file *file) struct dentry *dentry = file_dentry(file); struct file *realfile; struct path realpath; + struct ovl_file *of; int err; /* lazy lookup and verify lowerdata */ @@ -168,22 +223,27 @@ static int ovl_open(struct inode *inode, struct file *file) if (IS_ERR(realfile)) return PTR_ERR(realfile); - file->private_data = realfile; + of = ovl_file_alloc(realfile); + if (!of) { + fput(realfile); + return -ENOMEM; + } + + file->private_data = of; return 0; } static int ovl_release(struct inode *inode, struct file *file) { - fput(file->private_data); - + ovl_file_free(file->private_data); return 0; } static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); - struct fd real; + struct file *realfile; const struct cred *old_cred; loff_t ret; @@ -199,9 +259,9 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, 0, 0); } - ret = ovl_real_fdget(file, &real); - if (ret) - return ret; + realfile = ovl_real_file(file); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); /* * Overlay file f_pos is the master copy that is preserved @@ -211,17 +271,15 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) * files, so we use the real file to perform seeks. 
*/ ovl_inode_lock(inode); - fd_file(real)->f_pos = file->f_pos; + realfile->f_pos = file->f_pos; old_cred = ovl_override_creds(inode->i_sb); - ret = vfs_llseek(fd_file(real), offset, whence); - revert_creds(old_cred); + ret = vfs_llseek(realfile, offset, whence); + ovl_revert_creds(old_cred); - file->f_pos = fd_file(real)->f_pos; + file->f_pos = realfile->f_pos; ovl_inode_unlock(inode); - fdput(real); - return ret; } @@ -231,6 +289,11 @@ static void ovl_file_modified(struct file *file) ovl_copyattr(file_inode(file)); } +static void ovl_file_end_write(struct kiocb *iocb, ssize_t ret) +{ + ovl_file_modified(iocb->ki_filp); +} + static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; @@ -262,39 +325,33 @@ static void ovl_file_accessed(struct file *file) static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - struct fd real; - ssize_t ret; + struct file *realfile; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), - .user_file = file, .accessed = ovl_file_accessed, }; if (!iov_iter_count(iter)) return 0; - ret = ovl_real_fdget(file, &real); - if (ret) - return ret; - - ret = backing_file_read_iter(fd_file(real), iter, iocb, iocb->ki_flags, - &ctx); - fdput(real); + realfile = ovl_real_file(file); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); - return ret; + return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, + &ctx); } static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct fd real; + struct file *realfile; ssize_t ret; int ifl = iocb->ki_flags; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), - .user_file = file, - .end_write = ovl_file_modified, + .end_write = ovl_file_end_write, }; if (!iov_iter_count(iter)) @@ -304,8 +361,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) /* Update mode */ ovl_copyattr(inode); - ret = ovl_real_fdget(file, &real); - if (ret) + realfile = ovl_real_file(file); + ret = PTR_ERR(realfile); + if (IS_ERR(realfile)) goto out_unlock; if (!ovl_should_sync(OVL_FS(inode->i_sb))) @@ -316,8 +374,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. 
*/ ifl &= ~IOCB_DIO_CALLER_COMP; - ret = backing_file_write_iter(fd_file(real), iter, iocb, ifl, &ctx); - fdput(real); + ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx); out_unlock: inode_unlock(inode); @@ -329,20 +386,22 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - struct fd real; + struct file *realfile; ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(in)->i_sb), - .user_file = in, .accessed = ovl_file_accessed, }; + struct kiocb iocb; - ret = ovl_real_fdget(in, &real); - if (ret) - return ret; + realfile = ovl_real_file(in); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); - ret = backing_file_splice_read(fd_file(real), ppos, pipe, len, flags, &ctx); - fdput(real); + init_sync_kiocb(&iocb, in); + iocb.ki_pos = *ppos; + ret = backing_file_splice_read(realfile, &iocb, pipe, len, flags, &ctx); + *ppos = iocb.ki_pos; return ret; } @@ -350,7 +409,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, /* * Calling iter_file_splice_write() directly from overlay's f_op may deadlock * due to lock order inversion between pipe->mutex in iter_file_splice_write() - * and file_start_write(fd_file(real)) in ovl_write_iter(). + * and file_start_write(realfile) in ovl_write_iter(). * * So do everything ovl_write_iter() does and call iter_file_splice_write() on * the real file. @@ -358,25 +417,28 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - struct fd real; + struct file *realfile; struct inode *inode = file_inode(out); ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), - .user_file = out, - .end_write = ovl_file_modified, + .end_write = ovl_file_end_write, }; + struct kiocb iocb; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = ovl_real_fdget(out, &real); - if (ret) + realfile = ovl_real_file(out); + ret = PTR_ERR(realfile); + if (IS_ERR(realfile)) goto out_unlock; - ret = backing_file_splice_write(pipe, fd_file(real), ppos, len, flags, &ctx); - fdput(real); + init_sync_kiocb(&iocb, out); + iocb.ki_pos = *ppos; + ret = backing_file_splice_write(pipe, realfile, &iocb, len, flags, &ctx); + *ppos = iocb.ki_pos; out_unlock: inode_unlock(inode); @@ -386,7 +448,10 @@ out_unlock: static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct fd real; + struct dentry *dentry = file_dentry(file); + enum ovl_path_type type; + struct path upperpath; + struct file *upperfile; const struct cred *old_cred; int ret; @@ -394,38 +459,38 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret <= 0) return ret; - ret = ovl_real_fdget_meta(file, &real, !datasync); - if (ret) - return ret; - /* Don't sync lower file for fear of receiving EROFS error */ - if (file_inode(fd_file(real)) == ovl_inode_upper(file_inode(file))) { - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fsync_range(fd_file(real), start, end, datasync); - revert_creds(old_cred); - } + type = ovl_path_type(dentry); + if (!OVL_TYPE_UPPER(type) || (datasync && OVL_TYPE_MERGE(type))) + return 0; - fdput(real); + ovl_path_upper(dentry, &upperpath); + upperfile = ovl_real_file_path(file, &upperpath); + if (IS_ERR(upperfile)) + return PTR_ERR(upperfile); + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fsync_range(upperfile, start, 
end, datasync); + ovl_revert_creds(old_cred); return ret; } static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { - struct file *realfile = file->private_data; + struct ovl_file *of = file->private_data; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), - .user_file = file, .accessed = ovl_file_accessed, }; - return backing_file_mmap(realfile, vma, &ctx); + return backing_file_mmap(of->realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - struct fd real; + struct file *realfile; const struct cred *old_cred; int ret; @@ -436,19 +501,18 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len if (ret) goto out_unlock; - ret = ovl_real_fdget(file, &real); - if (ret) + realfile = ovl_real_file(file); + ret = PTR_ERR(realfile); + if (IS_ERR(realfile)) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fallocate(fd_file(real), mode, offset, len); - revert_creds(old_cred); + ret = vfs_fallocate(realfile, mode, offset, len); + ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file); - fdput(real); - out_unlock: inode_unlock(inode); @@ -457,19 +521,17 @@ out_unlock: static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { - struct fd real; + struct file *realfile; const struct cred *old_cred; int ret; - ret = ovl_real_fdget(file, &real); - if (ret) - return ret; + realfile = ovl_real_file(file); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fadvise(fd_file(real), offset, len, advice); - revert_creds(old_cred); - - fdput(real); + ret = vfs_fadvise(realfile, offset, len, advice); + ovl_revert_creds(old_cred); return ret; } @@ -485,7 +547,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); - struct fd real_in, real_out; + struct file *realfile_in, *realfile_out; const struct cred *old_cred; loff_t ret; @@ -498,42 +560,39 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, goto out_unlock; } - ret = ovl_real_fdget(file_out, &real_out); - if (ret) + realfile_out = ovl_real_file(file_out); + ret = PTR_ERR(realfile_out); + if (IS_ERR(realfile_out)) goto out_unlock; - ret = ovl_real_fdget(file_in, &real_in); - if (ret) { - fdput(real_out); + realfile_in = ovl_real_file(file_in); + ret = PTR_ERR(realfile_in); + if (IS_ERR(realfile_in)) goto out_unlock; - } old_cred = ovl_override_creds(file_inode(file_out)->i_sb); switch (op) { case OVL_COPY: - ret = vfs_copy_file_range(fd_file(real_in), pos_in, - fd_file(real_out), pos_out, len, flags); + ret = vfs_copy_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); break; case OVL_CLONE: - ret = vfs_clone_file_range(fd_file(real_in), pos_in, - fd_file(real_out), pos_out, len, flags); + ret = vfs_clone_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); break; case OVL_DEDUPE: - ret = vfs_dedupe_file_range_one(fd_file(real_in), pos_in, - fd_file(real_out), pos_out, len, + ret = vfs_dedupe_file_range_one(realfile_in, pos_in, + realfile_out, pos_out, len, flags); break; } - revert_creds(old_cred); + ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file_out); - fdput(real_in); - fdput(real_out); - out_unlock: inode_unlock(inode_out); @@ -577,20 +636,19 @@ static loff_t 
ovl_remap_file_range(struct file *file_in, loff_t pos_in, static int ovl_flush(struct file *file, fl_owner_t id) { - struct fd real; + struct file *realfile; const struct cred *old_cred; - int err; + int err = 0; - err = ovl_real_fdget(file, &real); - if (err) - return err; + realfile = ovl_real_file(file); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); - if (fd_file(real)->f_op->flush) { + if (realfile->f_op->flush) { old_cred = ovl_override_creds(file_inode(file)->i_sb); - err = fd_file(real)->f_op->flush(fd_file(real), id); - revert_creds(old_cred); + err = realfile->f_op->flush(realfile, id); + ovl_revert_creds(old_cred); } - fdput(real); return err; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 35fd3e3e1778..6f0e15f86c21 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -80,7 +80,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = ovl_do_notify_change(ofs, upperdentry, attr); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); @@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_do_getattr(&realpath, stat, request_mask, flags); + err = vfs_getattr_nosec(&realpath, stat, request_mask, flags); if (err) goto out; @@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = ovl_do_getattr(&realpath, &lowerstat, lowermask, - flags); + err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask, + flags); if (err) goto out; @@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = ovl_do_getattr(&realpath, &lowerdatastat, - lowermask, flags); + err = vfs_getattr_nosec(&realpath, &lowerdatastat, + lowermask, flags); if (err) goto out; } else { @@ -280,7 +280,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, stat->nlink = dentry->d_inode->i_nlink; out: - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -317,7 +317,7 @@ int ovl_permission(struct mnt_idmap *idmap, mask |= MAY_READ; } err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -334,7 +334,7 @@ static const char *ovl_get_link(struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); p = vfs_get_link(ovl_dentry_real(dentry), done); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return p; } @@ -469,7 +469,7 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, old_cred = ovl_override_creds(inode->i_sb); acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); - revert_creds(old_cred); + ovl_revert_creds(old_cred); } return acl; @@ -498,7 +498,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, old_cred = ovl_override_creds(dentry->d_sb); real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out; @@ -523,7 +523,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else err = ovl_do_remove_acl(ofs, 
realdentry, acl_name); - revert_creds(old_cred); + ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ @@ -600,7 +600,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, old_cred = ovl_override_creds(inode->i_sb); err = realinode->i_op->fiemap(realinode, fieinfo, start, len); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -616,8 +616,13 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f struct file *file; unsigned int cmd; int err; + unsigned int flags; + + flags = O_RDONLY; + if (force_o_largefile()) + flags |= O_LARGEFILE; - file = dentry_open(realpath, O_RDONLY, current_cred()); + file = dentry_open(realpath, flags, current_cred()); if (IS_ERR(file)) return PTR_ERR(file); @@ -671,7 +676,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, err = ovl_set_protattr(inode, upperpath.dentry, fa); if (!err) err = ovl_real_fileattr_set(&upperpath, fa); - revert_creds(old_cred); + ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* @@ -733,7 +738,7 @@ int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) old_cred = ovl_override_creds(inode->i_sb); err = ovl_real_fileattr_get(&realpath, fa); ovl_fileattr_prot_flags(inode, fa); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 5764f91d283e..cea820cb3b55 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -542,7 +542,7 @@ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, real, is_upper); + fh = ovl_encode_real_fh(ofs, d_inode(real), is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -738,7 +738,7 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(ofs, origin, false); + fh = ovl_encode_real_fh(ofs, d_inode(origin), false); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -961,7 +961,7 @@ static int ovl_maybe_validate_verity(struct dentry *dentry) if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); - revert_creds(old_cred); + ovl_revert_creds(old_cred); } ovl_inode_unlock(inode); @@ -995,7 +995,7 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); err = ovl_lookup_data_layers(dentry, redirect, &datapath); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (err) goto out_err; @@ -1342,7 +1342,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (origin_path) { dput(origin_path->dentry); kfree(origin_path); @@ -1366,7 +1366,7 @@ out_put_upper: kfree(upperredirect); out: kfree(d.redirect); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return ERR_PTR(err); } @@ -1423,7 +1423,7 @@ bool ovl_lower_positive(struct dentry *dentry) dput(this); } } - revert_creds(old_cred); + ovl_revert_creds(old_cred); return positive; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0bfe35da4b7b..0021e2025020 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } -static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - if (flags & AT_GETATTR_NOSEC) - 
return vfs_getattr_nosec(path, stat, request_mask, flags); - return vfs_getattr(path, stat, request_mask, flags); -} - /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); @@ -429,6 +421,7 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); +void ovl_revert_creds(const struct cred *old_cred); static inline const struct cred *ovl_creds(struct super_block *sb) { @@ -862,6 +855,9 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); +struct ovl_file; +struct ovl_file *ovl_file_alloc(struct file *realfile); +void ovl_file_free(struct ovl_file *of); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); @@ -869,7 +865,7 @@ int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new); int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct inode *realinode, bool is_upper); struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index e42546c6c5df..1115c22deca0 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void) const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), - fsparam_string("lowerdir+", Opt_lowerdir_add), - fsparam_string("datadir+", Opt_datadir_add), - fsparam_string("upperdir", Opt_upperdir), - fsparam_string("workdir", Opt_workdir), + fsparam_file_or_string("lowerdir+", Opt_lowerdir_add), + fsparam_file_or_string("datadir+", Opt_datadir_add), + fsparam_file_or_string("upperdir", Opt_upperdir), + fsparam_file_or_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), @@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) +static inline bool is_upper_layer(enum ovl_opt layer) +{ + return layer == Opt_upperdir || layer == Opt_workdir; +} + +/* Handle non-file descriptor-based layer options that require path lookup. 
*/ +static inline int ovl_kern_path(const char *layer_name, struct path *layer_path, + enum ovl_opt layer) { - char *name = kstrdup(layer_name, GFP_KERNEL); - bool upper = (layer == Opt_upperdir || layer == Opt_workdir); - struct path path; int err; + switch (layer) { + case Opt_upperdir: + fallthrough; + case Opt_workdir: + fallthrough; + case Opt_lowerdir: + err = ovl_mount_dir(layer_name, layer_path); + break; + case Opt_lowerdir_add: + fallthrough; + case Opt_datadir_add: + err = ovl_mount_dir_noesc(layer_name, layer_path); + break; + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } + + return err; +} + +static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name, + struct path *layer_path, enum ovl_opt layer) +{ + char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL); + bool upper; + int err = 0; + if (!name) return -ENOMEM; - if (upper || layer == Opt_lowerdir) - err = ovl_mount_dir(name, &path); - else - err = ovl_mount_dir_noesc(name, &path); + upper = is_upper_layer(layer); + err = ovl_mount_dir_check(fc, layer_path, layer, name, upper); if (err) - goto out_free; - - err = ovl_mount_dir_check(fc, &path, layer, name, upper); - if (err) - goto out_put; + return err; if (!upper) { err = ovl_ctx_realloc_lower(fc); if (err) - goto out_put; + return err; } /* Store the user provided path string in ctx to show in mountinfo */ - ovl_add_layer(fc, layer, &path, &name); + ovl_add_layer(fc, layer, layer_path, &name); + return err; +} + +static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, + enum ovl_opt layer) +{ + struct path layer_path __free(path_put) = {}; + int err = 0; + + switch (param->type) { + case fs_value_is_string: + err = ovl_kern_path(param->string, &layer_path, layer); + if (err) + return err; + err = ovl_do_parse_layer(fc, param->string, &layer_path, layer); + break; + case fs_value_is_file: { + char *buf __free(kfree); + char *layer_name; + + buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + if (!buf) + return -ENOMEM; + + layer_path = param->file->f_path; + path_get(&layer_path); + + layer_name = d_path(&layer_path, buf, PATH_MAX); + if (IS_ERR(layer_name)) + return PTR_ERR(layer_name); + + err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer); + break; + } + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } -out_put: - path_put(&path); -out_free: - kfree(name); return err; } @@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; for (nr = 0; nr < nr_lower; nr++) { - err = ovl_parse_layer(fc, iter, Opt_lowerdir); + struct path path __free(path_put) = {}; + + err = ovl_kern_path(iter, &path, Opt_lowerdir); + if (err) + goto out_err; + + err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir); if (err) goto out_err; @@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param->string, opt); + err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0ca8af060b0c..881ec5592da5 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -290,7 +290,7 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data } inode_unlock(dir->d_inode); } - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -808,7 +808,7 @@ static int ovl_iterate(struct file *file, struct dir_context 
*ctx) } err = 0; out: - revert_creds(old_cred); + ovl_revert_creds(old_cred); return err; } @@ -860,7 +860,7 @@ static struct file *ovl_dir_open_realfile(const struct file *file, old_cred = ovl_override_creds(file_inode(file)->i_sb); res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return res; } @@ -987,7 +987,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (err) return err; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index edc9216f6e27..0819c739cc2f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -68,6 +68,11 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } +void ovl_revert_creds(const struct cred *old_cred) +{ + revert_creds(old_cred); +} + /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -197,6 +202,9 @@ void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, bool ovl_dentry_weird(struct dentry *dentry) { + if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry)) + return true; + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT | DCACHE_OP_HASH | @@ -1178,7 +1186,7 @@ int ovl_nlink_start(struct dentry *dentry) * value relative to the upper inode nlink in an upper inode xattr. */ err = ovl_set_nlink_upper(dentry); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (err) goto out_drop_write; @@ -1203,7 +1211,7 @@ void ovl_nlink_end(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); ovl_cleanup_index(dentry); - revert_creds(old_cred); + ovl_revert_creds(old_cred); } ovl_inode_unlock(inode); diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c index 383978e4663c..88055deca936 100644 --- a/fs/overlayfs/xattrs.c +++ b/fs/overlayfs/xattrs.c @@ -47,7 +47,7 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (err < 0) goto out; } @@ -72,7 +72,7 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char WARN_ON(flags != XATTR_REPLACE); err = ovl_do_removexattr(ofs, realdentry, name); } - revert_creds(old_cred); + ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ @@ -91,7 +91,7 @@ static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); - revert_creds(old_cred); + ovl_revert_creds(old_cred); return res; } @@ -121,7 +121,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) old_cred = ovl_override_creds(dentry->d_sb); res = vfs_listxattr(realdentry, list, size); - revert_creds(old_cred); + ovl_revert_creds(old_cred); if (res <= 0 || size == 0) return res; @@ -268,4 +268,3 @@ const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs) return ofs->config.userxattr ? 
ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; } - diff --git a/fs/pidfs.c b/fs/pidfs.c index 80675b6bf884..049352f973de 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> +#include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> @@ -22,6 +24,97 @@ #include "internal.h" #include "mount.h" +static struct rb_root pidfs_ino_tree = RB_ROOT; + +#if BITS_PER_LONG == 32 +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number is the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 0. */ +static inline u32 pidfs_gen(u64 ino) +{ + return 0; +} +#endif + +static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); + struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); + u64 pid_ino_a = pid_a->ino; + u64 pid_ino_b = pid_b->ino; + + if (pid_ino_a < pid_ino_b) + return -1; + if (pid_ino_a > pid_ino_b) + return 1; + return 0; +} + +void pidfs_add_pid(struct pid *pid) +{ + static u64 pidfs_ino_nr = 2; + + /* + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bits. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 2 + * so inode numbering starts at 2 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. + * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. 
+ */ + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr += 2; + + pid->ino = pidfs_ino_nr; + pid->stashed = NULL; + pidfs_ino_nr++; + + write_seqcount_begin(&pidmap_lock_seq); + rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); + write_seqcount_end(&pidmap_lock_seq); +} + +void pidfs_remove_pid(struct pid *pid) +{ + write_seqcount_begin(&pidmap_lock_seq); + rb_erase(&pid->pidfs_node, &pidfs_ino_tree); + write_seqcount_end(&pidmap_lock_seq); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -114,6 +207,102 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } +static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +{ + struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + size_t usize = _IOC_SIZE(cmd); + struct pidfd_info kinfo = {}; + struct user_namespace *user_ns; + const struct cred *c; + __u64 mask; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif + + if (!uinfo) + return -EINVAL; + if (usize < PIDFD_INFO_SIZE_VER0) + return -EINVAL; /* First version, no smaller struct possible */ + + if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) + return -EFAULT; + + c = get_task_cred(task); + if (!c) + return -ESRCH; + + /* Unconditionally return identifiers and credentials, the rest only on request */ + + user_ns = current_user_ns(); + kinfo.ruid = from_kuid_munged(user_ns, c->uid); + kinfo.rgid = from_kgid_munged(user_ns, c->gid); + kinfo.euid = from_kuid_munged(user_ns, c->euid); + kinfo.egid = from_kgid_munged(user_ns, c->egid); + kinfo.suid = from_kuid_munged(user_ns, c->suid); + kinfo.sgid = from_kgid_munged(user_ns, c->sgid); + kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); + kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); + kinfo.mask |= PIDFD_INFO_CREDS; + put_cred(c); + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); +#endif + + /* + * Copy pid/tgid last, to reduce the chances the information might be + * stale. Note that it is not possible to ensure it will be valid as the + * task might return as soon as the copy_to_user finishes, but that's ok + * and userspace expects that might happen and can act accordingly, so + * this is just best-effort. What we can do however is checking that all + * the fields are set correctly, or return ESRCH to avoid providing + * incomplete information. */ + + kinfo.ppid = task_ppid_nr_ns(task, NULL); + kinfo.tgid = task_tgid_vnr(task); + kinfo.pid = task_pid_vnr(task); + kinfo.mask |= PIDFD_INFO_PID; + + if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + return -ESRCH; + + /* + * If userspace and the kernel have the same struct size it can just + * be copied. If userspace provides an older struct, only the bits that + * userspace knows about will be copied. If userspace provides a new + * struct, only the bits that the kernel knows about will be copied. 
+ */ + if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) + return -EFAULT; + + return 0; +} + +static bool pidfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + case PIDFD_GET_CGROUP_NAMESPACE: + case PIDFD_GET_INFO: + case PIDFD_GET_IPC_NAMESPACE: + case PIDFD_GET_MNT_NAMESPACE: + case PIDFD_GET_NET_NAMESPACE: + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_TIME_NAMESPACE: + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_UTS_NAMESPACE: + case PIDFD_GET_USER_NAMESPACE: + case PIDFD_GET_PID_NAMESPACE: + return true; + } + + return false; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -122,13 +311,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; - if (arg) - return -EINVAL; + if (!pidfs_ioctl_valid(cmd)) + return -ENOIOCTLCMD; + + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; + /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ + if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) + return pidfd_info(task, cmd, arg); + + if (arg) + return -EINVAL; + scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) @@ -238,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pid->ino; - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. 
Let's reject it completely until we have a clean @@ -323,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -341,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } -static const struct dentry_operations pidfs_dentry_operations = { +const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + const struct pid *pid = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = pid->ino; + return FILEID_KERNFS; +} + +static int pidfs_ino_find(const void *key, const struct rb_node *node) +{ + const u64 pid_ino = *(u64 *)key; + const struct pid *pid = rb_entry(node, struct pid, pidfs_node); + + if (pid_ino < pid->ino) + return -1; + if (pid_ino > pid->ino) + return 1; + return 0; +} + +/* Find a struct pid based on the inode number. */ +static struct pid *pidfs_ino_get_pid(u64 ino) +{ + struct pid *pid; + struct rb_node *node; + unsigned int seq; + + guard(rcu)(); + do { + seq = read_seqcount_begin(&pidmap_lock_seq); + node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); + if (node) + break; + } while (read_seqcount_retry(&pidmap_lock_seq, seq)); + + if (!node) + return NULL; + + pid = rb_entry(node, struct pid, pidfs_node); + + /* Within our pid namespace hierarchy? */ + if (pid_vnr(pid) == 0) + return NULL; + + return get_pid(pid); +} + +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + int ret; + u64 pid_ino; + struct path path; + struct pid *pid; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + pid_ino = *(u64 *)fid; + break; + default: + return NULL; + } + + pid = pidfs_ino_get_pid(pid_ino); + if (!pid) + return NULL; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); + if (ret < 0) + return ERR_PTR(ret); + + mntput(path.mnt); + return path.dentry; +} + +/* + * Make sure that we reject any nonsensical flags that users pass via + * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and + * PIDFD_NONBLOCK as O_NONBLOCK. + */ +#define VALID_FILE_HANDLE_OPEN_FLAGS \ + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) + +static int pidfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) + return -EINVAL; + + /* + * pidfd_ino_get_pid() will verify that the struct pid is part + * of the caller's pid namespace hierarchy. No further + * permission checks are needed. + */ + return 0; +} + +static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +{ + /* + * Clear O_LARGEFILE as open_by_handle_at() forces it and raise + * O_RDWR as pidfds always are. 
+ */ + oflags &= ~O_LARGEFILE; + return dentry_open(path, oflags | O_RDWR, current_cred()); +} + +static const struct export_operations pidfs_export_operations = { + .encode_fh = pidfs_encode_fh, + .fh_to_dentry = pidfs_fh_to_dentry, + .open = pidfs_export_open, + .permission = pidfs_export_permission, +}; + static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. - */ - return pidfs_inum(data, &inode->i_ino); + inode->i_ino = pidfs_ino(pid->ino); + inode->i_generation = pidfs_gen(pid->ino); + return 0; } static void pidfs_put_data(void *data) @@ -382,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &pidfs_sops; + ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; diff --git a/fs/pipe.c b/fs/pipe.c index 12b22c2723b7..82fede0f2111 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - bool was_full, wake_next_reader = false; + bool wake_writer = false, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ @@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) mutex_lock(&pipe->mutex); /* - * We only wake up writers if the pipe was full when we started - * reading in order to avoid unnecessary wakeups. + * We only wake up writers if the pipe was full when we started reading + * and it is no longer full after reading to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); @@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) + if (!buf->len) { + wake_writer |= pipe_full(head, tail, pipe->max_usage); tail = pipe_update_tail(pipe, buf, tail); + } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. 
*/ - if (unlikely(was_full)) + if (unlikely(wake_writer)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - mutex_lock(&pipe->mutex); - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); + wake_writer = false; wake_next_reader = true; + mutex_lock(&pipe->mutex); } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); - if (was_full) + if (wake_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); diff --git a/fs/pnode.c b/fs/pnode.c index a799e0315cc9..ef048f008bdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list) continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* - * We have come accross an partially unmounted - * mount in list that has not been visited yet. - * Remember it has been visited and continue - * about our merry way. + * We have come across a partially unmounted + * mount in a list that has not been visited + * yet. Remember it has been visited and + * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. */ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/proc/array.c b/fs/proc/array.c index 34a47fb0c57f..d6a0369caa93 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -109,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) else if (p->flags & PF_KTHREAD) get_kthread_comm(tcomm, sizeof(tcomm), p); else - __get_task_comm(tcomm, sizeof(tcomm), p); + get_task_comm(tcomm, p); if (escape) seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); @@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. 
*/ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52..0edf14a9840e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> @@ -832,19 +831,21 @@ static const struct file_operations proc_single_file_operations = { struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); - struct mm_struct *mm = ERR_PTR(-ESRCH); + struct mm_struct *mm; - if (task) { - mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); - put_task_struct(task); + if (!task) + return ERR_PTR(-ESRCH); - if (!IS_ERR_OR_NULL(mm)) { - /* ensure this mm_struct can't be freed */ - mmgrab(mm); - /* but do not pin its memory */ - mmput(mm); - } - } + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); + put_task_struct(task); + + if (IS_ERR(mm)) + return mm == ERR_PTR(-ESRCH) ? NULL : mm; + + /* ensure this mm_struct can't be freed */ + mmgrab(mm); + /* but do not pin its memory */ + mmput(mm); return mm; } @@ -2208,7 +2209,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) + if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { @@ -2553,8 +2554,8 @@ static int show_timer(struct seq_file *m, void *v) seq_printf(m, "ID: %d\n", timer->it_id); seq_printf(m, "signal: %d/%px\n", - timer->sigq->info.si_signo, - timer->sigq->info.si_value.sival_ptr); + timer->sigq.info.si_signo, + timer->sigq.info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 1f54a54bfb91..24baf23e864f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -77,7 +77,7 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) return single_open(file, seq_show, inode); } -/** +/* * Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure * that the current task has PTRACE_MODE_READ in addition to the normal * POSIX-like checks. 
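The proc_mem_open() rewrite above keeps the mmgrab()/mmput() pairing, which pins the mm_struct itself while letting its address space be torn down, and now checks only IS_ERR(), mapping -ESRCH back to NULL for its callers. A hedged kernel-style sketch of that reference pattern:

#include <linux/err.h>
#include <linux/sched/mm.h>

/* sketch, not the exact helper: pin the mm_struct, not its memory */
static struct mm_struct *grab_task_mm(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm = mm_access(task, mode);

        if (IS_ERR(mm))
                return mm == ERR_PTR(-ESRCH) ? NULL : mm;

        mmgrab(mm);     /* mm_struct stays allocated ...              */
        mmput(mm);      /* ... but its pages are not pinned           */
        return mm;      /* caller eventually pairs this with mmdrop() */
}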
@@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, fd); - rcu_read_unlock(); + file = fget_task(task, fd); if (file) { *mode = file->f_mode; fput(file); @@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - rcu_read_lock(); for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = task_lookup_next_fdget_rcu(p, &fd); + f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; - rcu_read_unlock(); fput(f); data.fd = fd; @@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out; + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); out: put_task_struct(p); return 0; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 87e4d6282025..1695509370b8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -102,7 +102,7 @@ struct proc_inode { union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; - struct ctl_table *sysctl_entry; + const struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index cb0edc7cbf09..714a22ded8a8 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -11,13 +11,13 @@ */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { - return (*pos <= nr_irqs) ? pos : NULL; + return *pos <= irq_get_nr_irqs() ? pos : NULL; } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos > nr_irqs) + if (*pos > irq_get_nr_irqs()) return NULL; return pos; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7d0acdad74e2..1cb33771bf9f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -50,8 +50,26 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif +#ifndef kc_xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr +static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys) +{ + return __va(phys); +} +#endif +#ifndef kc_unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr +static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt) +{ +} +#endif + static LIST_HEAD(kclist_head); -static DECLARE_RWSEM(kclist_lock); +static int kcore_nphdr; +static size_t kcore_phdrs_len; +static size_t kcore_notes_len; +static size_t kcore_data_offset; +DEFINE_STATIC_PERCPU_RWSEM(kclist_lock); static int kcore_need_update = 1; /* @@ -87,33 +105,32 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, list_add_tail(&new->list, &kclist_head); } -static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, - size_t *data_offset) +static void update_kcore_size(void) { size_t try, size; struct kcore_list *m; - *nphdr = 1; /* PT_NOTE */ + kcore_nphdr = 1; /* PT_NOTE */ size = 0; list_for_each_entry(m, &kclist_head, list) { try = kc_vaddr_to_offset((size_t)m->addr + m->size); if (try > size) size = try; - *nphdr = *nphdr + 1; + kcore_nphdr++; } - *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + - VMCOREINFO_NOTE_NAME_BYTES + - ALIGN(sizeof(struct elf_prstatus), 4) + - 
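fs/proc/kcore.c above replaces the kclist_lock rwsem with a per-CPU rwsem and caches the ELF header sizes in globals, so the hot read path no longer recomputes them or bounces a shared lock cacheline. A minimal kernel-style sketch of the percpu-rwsem API for a read-mostly list (names are illustrative):

#include <linux/list.h>
#include <linux/percpu-rwsem.h>

static LIST_HEAD(example_list);
DEFINE_STATIC_PERCPU_RWSEM(example_lock);

static void example_reader(void)
{
        percpu_down_read(&example_lock);        /* cheap, scales across CPUs */
        /* walk example_list */
        percpu_up_read(&example_lock);
}

static void example_writer(struct list_head *new)
{
        percpu_down_write(&example_lock);       /* expensive: waits out all readers */
        list_add_tail(new, &example_list);
        percpu_up_write(&example_lock);
}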
ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4) + - ALIGN(vmcoreinfo_size, 4)); - *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + - *notes_len); - return *data_offset + size; + kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); + kcore_notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len + + kcore_notes_len); + proc_root_kcore->size = kcore_data_offset + size; } #ifdef CONFIG_HIGHMEM @@ -256,12 +273,10 @@ static int kcore_update_ram(void) { LIST_HEAD(list); LIST_HEAD(garbage); - int nphdr; - size_t phdrs_len, notes_len, data_offset; struct kcore_list *tmp, *pos; int ret = 0; - down_write(&kclist_lock); + percpu_down_write(&kclist_lock); if (!xchg(&kcore_need_update, 0)) goto out; @@ -279,11 +294,10 @@ static int kcore_update_ram(void) } list_splice_tail(&list, &kclist_head); - proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, - &data_offset); + update_kcore_size(); out: - up_write(&kclist_lock); + percpu_up_write(&kclist_lock); list_for_each_entry_safe(pos, tmp, &garbage, list) { list_del(&pos->list); kfree(pos); @@ -312,27 +326,24 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; char *buf = file->private_data; loff_t *fpos = &iocb->ki_pos; - size_t phdrs_offset, notes_offset, data_offset; + size_t phdrs_offset, notes_offset; size_t page_offline_frozen = 1; - size_t phdrs_len, notes_len; struct kcore_list *m; size_t tsz; - int nphdr; unsigned long start; size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; int ret = 0; - down_read(&kclist_lock); + percpu_down_read(&kclist_lock); /* * Don't race against drivers that set PageOffline() and expect no * further page access. */ page_offline_freeze(); - get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); phdrs_offset = sizeof(struct elfhdr); - notes_offset = phdrs_offset + phdrs_len; + notes_offset = phdrs_offset + kcore_phdrs_len; /* ELF file header. */ if (buflen && *fpos < sizeof(struct elfhdr)) { @@ -354,7 +365,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) .e_flags = ELF_CORE_EFLAGS, .e_ehsize = sizeof(struct elfhdr), .e_phentsize = sizeof(struct elf_phdr), - .e_phnum = nphdr, + .e_phnum = kcore_nphdr, }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); @@ -368,10 +379,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } /* ELF program headers. 
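With those sizes cached, the /proc/kcore layout is simply the ELF header, then the program headers, then the notes, then page-aligned data. A small userspace sketch of how the offsets relate (the sizes here are illustrative 64-bit values, not taken from a live kernel):

#include <stdio.h>

#define PAGE_ALIGN(x) (((x) + 4095u) & ~4095u)

int main(void)
{
        unsigned int ehdr_len  = 64;            /* sizeof(Elf64_Ehdr) */
        unsigned int nphdr     = 1 + 3;         /* PT_NOTE + one PT_LOAD per kclist entry */
        unsigned int phdrs_len = nphdr * 56;    /* sizeof(Elf64_Phdr) */
        unsigned int notes_len = 4096;          /* placeholder for the computed note size */

        unsigned int phdrs_offset = ehdr_len;
        unsigned int notes_offset = phdrs_offset + phdrs_len;
        unsigned int data_offset  = PAGE_ALIGN(ehdr_len + phdrs_len + notes_len);

        printf("phdrs@%u notes@%u data@%u\n", phdrs_offset, notes_offset, data_offset);
        return 0;
}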
*/ - if (buflen && *fpos < phdrs_offset + phdrs_len) { + if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) { struct elf_phdr *phdrs, *phdr; - phdrs = kzalloc(phdrs_len, GFP_KERNEL); + phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL); if (!phdrs) { ret = -ENOMEM; goto out; @@ -379,13 +390,14 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) phdrs[0].p_type = PT_NOTE; phdrs[0].p_offset = notes_offset; - phdrs[0].p_filesz = notes_len; + phdrs[0].p_filesz = kcore_notes_len; phdr = &phdrs[1]; list_for_each_entry(m, &kclist_head, list) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + + kcore_data_offset; phdr->p_vaddr = (size_t)m->addr; if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); @@ -398,7 +410,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) phdr++; } - tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); + tsz = min_t(size_t, buflen, + phdrs_offset + kcore_phdrs_len - *fpos); if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, iter) != tsz) { kfree(phdrs); @@ -412,7 +425,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } /* ELF note segment. */ - if (buflen && *fpos < notes_offset + notes_len) { + if (buflen && *fpos < notes_offset + kcore_notes_len) { struct elf_prstatus prstatus = {}; struct elf_prpsinfo prpsinfo = { .pr_sname = 'R', @@ -424,7 +437,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) strscpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); - notes = kzalloc(notes_len, GFP_KERNEL); + notes = kzalloc(kcore_notes_len, GFP_KERNEL); if (!notes) { ret = -ENOMEM; goto out; @@ -445,9 +458,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) */ append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - min(vmcoreinfo_size, notes_len - i)); + min(vmcoreinfo_size, kcore_notes_len - i)); - tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); + tsz = min_t(size_t, buflen, + notes_offset + kcore_notes_len - *fpos); if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; @@ -463,7 +477,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. */ - start = kc_offset_to_vaddr(*fpos - data_offset); + start = kc_offset_to_vaddr(*fpos - kcore_data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; @@ -471,19 +485,21 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) while (buflen) { struct page *page; unsigned long pfn; + phys_addr_t phys; + void *__start; /* * If this is the first iteration or the address is not within * the previous entry, search for a matching entry. 
*/ if (!m || start < m->addr || start >= m->addr + m->size) { - struct kcore_list *iter; + struct kcore_list *pos; m = NULL; - list_for_each_entry(iter, &kclist_head, list) { - if (start >= iter->addr && - start < iter->addr + iter->size) { - m = iter; + list_for_each_entry(pos, &kclist_head, list) { + if (start >= pos->addr && + start < pos->addr + pos->size) { + m = pos; break; } } @@ -537,7 +553,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } break; case KCORE_RAM: - pfn = __pa(start) >> PAGE_SHIFT; + phys = __pa(start); + pfn = phys >> PAGE_SHIFT; page = pfn_to_online_page(pfn); /* @@ -557,17 +574,33 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: + if (m->type == KCORE_RAM) { + __start = kc_xlate_dev_mem_ptr(phys); + if (!__start) { + ret = -ENOMEM; + if (iov_iter_zero(tsz, iter) != tsz) + ret = -EFAULT; + goto out; + } + } else { + __start = (void *)start; + } + /* * Sadly we must use a bounce buffer here to be able to * make use of copy_from_kernel_nofault(), as these * memory regions might not always be mapped on all * architectures. */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + ret = copy_from_kernel_nofault(buf, __start, tsz); + if (m->type == KCORE_RAM) + kc_unxlate_dev_mem_ptr(phys, __start); + if (ret) { if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } + ret = 0; /* * We know the bounce buffer is safe to copy from, so * use _copy_to_iter() directly. @@ -593,7 +626,7 @@ skip: out: page_offline_thaw(); - up_read(&kclist_lock); + percpu_up_read(&kclist_lock); if (ret) return ret; return orig_buflen - buflen; @@ -630,6 +663,7 @@ static int release_kcore(struct inode *inode, struct file *file) } static const struct proc_ops kcore_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, .proc_release = release_kcore, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 245171d9164b..8ba9b1472390 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -91,7 +91,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", - (unsigned long)atomic_read(&zswap_stored_pages) << + (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a..c610224faf10 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d11ebc055ce0..27a283d85a6e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -17,6 +17,7 @@ #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> +#include <linux/lockdep.h> #include "internal.h" #define list_for_each_table_entry(entry, header) \ @@ -33,7 +34,7 @@ static const struct inode_operations proc_sys_dir_operations; * Support for permanently empty directories. * Must be non-empty to avoid sharing an address with other tables. 
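For KCORE_RAM the read path above now brackets the copy with kc_xlate_dev_mem_ptr()/kc_unxlate_dev_mem_ptr() (a plain __va() unless the architecture overrides it) and still bounces through copy_from_kernel_nofault(), so an unmapped range turns into zero-fill rather than an oops. A hedged sketch of that bracket using the helpers introduced above:

/* sketch of the KCORE_RAM copy step; tsz is at most one page */
static int kcore_copy_ram(char *bounce, phys_addr_t phys, size_t tsz)
{
        void *virt = kc_xlate_dev_mem_ptr(phys);
        int ret;

        if (!virt)
                return -ENOMEM;
        /* -EFAULT here means "not mapped"; the caller zero-fills instead */
        ret = copy_from_kernel_nofault(bounce, virt, tsz);
        kc_unxlate_dev_mem_ptr(phys, virt);
        return ret;
}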
*/ -static struct ctl_table sysctl_mount_point[] = { +static const struct ctl_table sysctl_mount_point[] = { { } }; @@ -67,7 +68,7 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } -static struct ctl_table root_table[] = { +static const struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, @@ -88,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry); + const struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -109,14 +110,15 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) return cmp; } -/* Called under sysctl_lock */ -static struct ctl_table *find_entry(struct ctl_table_header **phead, +static const struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; + lockdep_assert_held(&sysctl_lock); + while (node) { struct ctl_node *ctl_node; @@ -141,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead, return NULL; } -static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; @@ -151,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) while (*p) { struct ctl_table_header *parent_head; - struct ctl_table *parent_entry; + const struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; @@ -180,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) return 0; } -static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; @@ -189,7 +191,7 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, - struct ctl_node *node, struct ctl_table *table, size_t table_size) + struct ctl_node *node, const struct ctl_table *table, size_t table_size) { head->ctl_table = table; head->ctl_table_size = table_size; @@ -204,7 +206,7 @@ static void init_header(struct ctl_table_header *head, head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { - struct ctl_table *entry; + const struct ctl_table *entry; list_for_each_table_entry(entry, head) { node->header = head; @@ -217,7 +219,7 @@ static void init_header(struct ctl_table_header *head, static void erase_header(struct ctl_table_header *head) { - struct ctl_table *entry; + const struct ctl_table *entry; list_for_each_table_entry(entry, head) erase_entry(head, entry); @@ -225,7 +227,7 @@ static void erase_header(struct ctl_table_header *head) static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { - struct ctl_table *entry; + const struct ctl_table *entry; struct ctl_table_header *dir_h = &dir->header; int err; @@ -263,18 +265,20 @@ 
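Several of the proc_sysctl.c changes above turn "called under sysctl_lock" comments into lockdep_assert_held() calls, so the locking contract is verified at runtime on lockdep-enabled kernels instead of living only in a comment. A minimal sketch of the pattern:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static int example_count;

/* caller must hold example_lock; lockdep now checks it */
static void example_bump(void)
{
        lockdep_assert_held(&example_lock);
        example_count++;
}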
fail_links: return err; } -/* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (unlikely(p->unregistering)) return 0; p->used++; return 1; } -/* called under sysctl_lock */ static void unuse_table(struct ctl_table_header *p) { + lockdep_assert_held(&sysctl_lock); + if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); @@ -285,9 +289,11 @@ static void proc_sys_invalidate_dcache(struct ctl_table_header *head) proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } -/* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { + /* will reacquire if has to wait */ + lockdep_assert_held(&sysctl_lock); + /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock @@ -344,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root) return set; } -static struct ctl_table *lookup_entry(struct ctl_table_header **phead, - struct ctl_dir *dir, - const char *name, int namelen) +static const struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); @@ -374,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node) } static void first_entry(struct ctl_dir *dir, - struct ctl_table_header **phead, struct ctl_table **pentry) + struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = NULL; - struct ctl_table *entry = NULL; + const struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); @@ -391,10 +397,10 @@ static void first_entry(struct ctl_dir *dir, *pentry = entry; } -static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry) { struct ctl_table_header *head = *phead; - struct ctl_table *entry = *pentry; + const struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); @@ -427,7 +433,7 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; @@ -441,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i } static struct inode *proc_sys_make_inode(struct super_block *sb, - struct ctl_table_header *head, struct ctl_table *table) + struct ctl_table_header *head, const struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; @@ -512,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; - struct ctl_table *p; + const struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; @@ -550,7 +556,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); - struct 
ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; @@ -624,7 +630,7 @@ static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) @@ -642,7 +648,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; @@ -673,7 +679,7 @@ out: static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; @@ -698,11 +704,11 @@ static bool proc_sys_fill_cache(struct file *file, res = d_splice_alias(inode, child); d_lookup_done(child); if (unlikely(res)) { - if (IS_ERR(res)) { - dput(child); - return false; - } dput(child); + + if (IS_ERR(res)) + return false; + child = res; } } @@ -717,7 +723,7 @@ static bool proc_sys_fill_cache(struct file *file, static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, - struct ctl_table *table) + const struct ctl_table *table) { bool ret = true; @@ -735,7 +741,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, struct ctl_table *table, +static int scan(struct ctl_table_header *head, const struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { @@ -759,7 +765,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; - struct ctl_table *entry; + const struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; @@ -792,7 +798,7 @@ static int proc_sys_permission(struct mnt_idmap *idmap, * are _NOT_ writeable, capabilities or not. 
*/ struct ctl_table_header *head; - struct ctl_table *table; + const struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ @@ -836,7 +842,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap, { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; + const struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); @@ -935,7 +941,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; - struct ctl_table *entry; + const struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) @@ -1046,12 +1052,12 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry) + const struct ctl_table **pentry) { struct ctl_table_header *head; + const struct ctl_table *entry; struct ctl_table_root *root; struct ctl_table_set *set; - struct ctl_table *entry; struct ctl_dir *dir; int ret; @@ -1078,7 +1084,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, return ret; } -static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; @@ -1094,7 +1100,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) return -EINVAL; } -static int sysctl_check_table_array(const char *path, struct ctl_table *table) +static int sysctl_check_table_array(const char *path, const struct ctl_table *table) { unsigned int extra; int err = 0; @@ -1133,7 +1139,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) static int sysctl_check_table(const char *path, struct ctl_table_header *header) { - struct ctl_table *entry; + const struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { if (!entry->procname) @@ -1169,8 +1175,9 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header) static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { - struct ctl_table *link_table, *entry, *link; + struct ctl_table *link_table, *link; struct ctl_table_header *links; + const struct ctl_table *entry; struct ctl_node *node; char *link_name; int name_bytes; @@ -1215,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir, struct ctl_table_root *link_root) { struct ctl_table_header *tmp_head; - struct ctl_table *entry, *link; + const struct ctl_table *entry, *link; if (header->ctl_table_size == 0 || sysctl_is_perm_empty_ctl_header(header)) @@ -1358,7 +1365,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table, size_t table_size) + const char *path, const struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; @@ -1419,7 +1426,7 @@ fail: * * See __register_sysctl_table for more details. 
*/ -struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, +struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, @@ -1448,7 +1455,7 @@ EXPORT_SYMBOL(register_sysctl_sz); * * Context: if your base directory does not exist it will be created for you. */ -void __init __register_sysctl_init(const char *path, struct ctl_table *table, +void __init __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size) { struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); @@ -1466,7 +1473,7 @@ static void put_links(struct ctl_table_header *header) struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; - struct ctl_table *entry; + const struct ctl_table *entry; if (header->set == root_set) return; @@ -1477,7 +1484,7 @@ static void put_links(struct ctl_table_header *header) list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; - struct ctl_table *link; + const struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index f4616083faef..04bb29721419 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) - seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10); seq_putc(p, '\n'); } return 0; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index da60956b2915..8b444e862319 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -76,7 +76,7 @@ static void show_all_irqs(struct seq_file *p) seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } - show_irq_gap(p, nr_irqs - next); + show_irq_gap(p, irq_get_nr_irqs() - next); } static int show_stat(struct seq_file *p, void *v) @@ -196,7 +196,7 @@ static int stat_open(struct inode *inode, struct file *file) unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ - size += 2 * nr_irqs; + size += 2 * irq_get_nr_irqs(); return single_open_size(file, show_stat, NULL, size); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..f02cd362309a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -909,8 +909,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. + * + * The length of the second argument of mnemonics[] + * needs to be 3 instead of previously set 2 + * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) + * to avoid spurious + * -Werror=unterminated-string-initialization warning + * with GCC 15 */ - static const char mnemonics[BITS_PER_LONG][2] = { + static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. 
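Since register_sysctl_sz() and friends now take const struct ctl_table, a sysctl table can be declared const and placed in read-only data. A hedged sketch of registering such a table (the path, names and handler are examples, not from this diff):

#include <linux/init.h>
#include <linux/sysctl.h>

static int example_value;

static const struct ctl_table example_table[] = {
        {
                .procname       = "example_value",
                .data           = &example_value,
                .maxlen         = sizeof(example_value),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
};

static int __init example_sysctl_init(void)
{
        /* register_sysctl() is a wrapper passing ARRAY_SIZE(example_table) */
        return register_sysctl("kernel", example_table) ? 0 : -ENOMEM;
}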
*/ @@ -971,7 +978,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) @@ -987,11 +994,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; - if (vma->vm_flags & (1UL << i)) { - seq_putc(m, mnemonics[i][0]); - seq_putc(m, mnemonics[i][1]); - seq_putc(m, ' '); - } + if (vma->vm_flags & (1UL << i)) + seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } @@ -1806,7 +1810,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, } for (; addr != end; addr += PAGE_SIZE, idx++) { - unsigned long cur_flags = flags; + u64 cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && @@ -2661,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg, return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, - arg->vec_len * sizeof(struct page_region))) + size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b52d85f8ad59..658bf199d424 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -404,6 +404,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) if (!iov_iter_count(iter)) return acc; } + + cond_resched(); } return acc; @@ -414,6 +416,34 @@ static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) return __read_vmcore(iter, &iocb->ki_pos); } +/** + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @size: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *vmcore_alloc_buf(size_t size) +{ +#ifdef CONFIG_MMU + return vmalloc_user(size); +#else + return vzalloc(size); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU + /* * The vmcore fault handler uses the page cache and fills data using the * standard __read_vmcore() function. @@ -461,33 +491,6 @@ static const struct vm_operations_struct vmcore_mmap_ops = { .fault = mmap_vmcore_fault, }; -/** - * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @size: size of buffer - * - * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap - * the buffer to user-space by means of remap_vmalloc_range(). - * - * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is - * disabled and there's no need to allow users to mmap the buffer. 
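pagemap_scan_get_args() above guards the access_ok() length with size_mul(), which saturates to SIZE_MAX instead of wrapping when the user-supplied vec_len is large. A standalone sketch of the idea, with a local stand-in for the kernel helper from <linux/overflow.h>:

#include <stdint.h>
#include <stdio.h>

/* stand-in for the kernel's size_mul(): saturate on overflow */
static size_t size_mul(size_t a, size_t b)
{
        size_t r = a * b;

        return (b && r / b != a) ? SIZE_MAX : r;
}

int main(void)
{
        size_t vec_len = SIZE_MAX / 4;  /* hostile, user-controlled count */
        size_t elem    = 16;            /* roughly sizeof(struct page_region) */

        /* plain multiplication would wrap to a small number; this saturates */
        printf("bytes = %zu\n", size_mul(vec_len, elem));
        return 0;
}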
- */ -static inline char *vmcore_alloc_buf(size_t size) -{ -#ifdef CONFIG_MMU - return vmalloc_user(size); -#else - return vzalloc(size); -#endif -} - -/* - * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is - * essential for mmap_vmcore() in order to map physically - * non-contiguous objects (ELF header, ELF note segment and memory - * regions in the 1st kernel pointed to by PT_LOAD entries) into - * virtually contiguous user-space in ELF layout. - */ -#ifdef CONFIG_MMU /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 4311fcbc84f2..bc68b4de5287 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -901,7 +901,7 @@ MODULE_DEVICE_TABLE(of, dt_match); static struct platform_driver ramoops_driver = { .probe = ramoops_probe, - .remove_new = ramoops_remove, + .remove = ramoops_remove, .driver = { .name = "ramoops", .of_match_table = dt_match, diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85925ec0051a..3310d1ad4d0e 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -179,8 +179,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) */ static const char *qnx6_checkroot(struct super_block *s) { - static char match_root[2][3] = {".\0\0", "..\0"}; - int i, error = 0; + int error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; @@ -189,11 +188,9 @@ static const char *qnx6_checkroot(struct super_block *s) if (IS_ERR(folio)) return "error reading root directory"; dir_entry = kmap_local_folio(folio, 0); - for (i = 0; i < 2; i++) { - /* maximum 3 bytes - due to match_root limitation */ - if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) - error = 1; - } + if (memcmp(dir_entry[0].de_fname, ".", 2) || + memcmp(dir_entry[1].de_fname, "..", 3)) + error = 1; folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 4c925e55dbcd..818083a36bef 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -9,14 +9,13 @@ config QUOTA help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the - ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems. - Note that gfs2 and xfs use their own quota system. - Ext3, ext4 and reiserfs also support journaled quotas for which - you don't need to run quotacheck(8) after an unclean shutdown. - For further details, read the Quota mini-HOWTO, available from - <https://www.tldp.org/docs.html#howto>, or the documentation provided - with the quota tools. Probably the quota support is only useful for - multi user systems. If unsure, say N. + ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2 + and xfs use their own quota system. Ext3 and ext4 also support + journaled quotas for which you don't need to run quotacheck(8) after + an unclean shutdown. For further details, read the Quota mini-HOWTO, + available from <https://www.tldp.org/docs.html#howto>, or the + documentation provided with the quota tools. Probably the quota + support is only useful for multi user systems. If unsure, say N. 
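qnx6_checkroot() above now compares the first two directory entries with memcmp() against "." and ".."; the lengths 2 and 3 deliberately include the terminating NUL, so a "." entry cannot match "..". A tiny userspace illustration of why the extra byte matters:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char fname0[32] = ".";    /* zero-padded, like de_fname on disk */
        const char fname1[32] = "..";

        int bad = memcmp(fname0, ".", 2) || memcmp(fname1, "..", 3);

        printf("root directory %s\n", bad ? "corrupt" : "ok");
        return 0;
}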
config QUOTA_NETLINK_INTERFACE bool "Report quota messages through netlink interface" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b40410cd39af..f9578918cfb2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -80,7 +80,6 @@ #include <linux/quotaops.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> -#include "../internal.h" /* ugh */ #include <linux/uaccess.h> @@ -689,6 +688,8 @@ int dquot_writeback_dquots(struct super_block *sb, int type) WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); + flush_delayed_work("a_release_work); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 290157bc7bec..7c2b75a44485 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -976,21 +976,19 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, struct super_block *sb; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; - struct fd f; + CLASS(fd_raw, f)(fd); int ret; - f = fdget_raw(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = -EINVAL; if (type >= MAXQUOTAS) - goto out; + return -EINVAL; if (quotactl_cmd_write(cmds)) { ret = mnt_want_write(fd_file(f)->f_path.mnt); if (ret) - goto out; + return ret; } sb = fd_file(f)->f_path.mnt->mnt_sb; @@ -1008,7 +1006,5 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, if (quotactl_cmd_write(cmds)) mnt_drop_write(fd_file(f)->f_path.mnt); -out: - fdput(f); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index 64dc24afdb3a..a6133241dfb8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; - struct fd f = fdget_pos(fd); - if (!fd_file(f)) + CLASS(fd_pos, f)(fd); + if (fd_empty(f)) return -EBADF; retval = -EINVAL; @@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput_pos(f); return retval; } @@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned int, whence) { int retval; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); loff_t offset; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - retval = -EINVAL; if (whence > SEEK_MAX) - goto out_putf; + return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); @@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } -out_putf: - fdput_pos(f); return retval; } #endif @@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file) ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } return ret; } @@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if 
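Most of the read_write.c conversions below (and the quota and readdir.c ones around them) replace fdget()/fdput() pairs with the scope-based CLASS(fd, ...) guards, so the file reference is dropped automatically when the variable leaves scope and the unwind labels disappear. A hedged kernel-style sketch of the pattern:

#include <linux/file.h>

/* sketch of the scope-based fd guard used throughout this series */
static long example_fd_op(unsigned int fd)
{
        CLASS(fd, f)(fd);               /* the reference is put at scope exit */

        if (fd_empty(f))
                return -EBADF;          /* plain return, no cleanup label */

        if (!(fd_file(f)->f_mode & FMODE_READ))
                return -EINVAL;

        /* ... operate on fd_file(f) ... */
        return 0;
}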
(!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } return ret; @@ -751,21 +745,17 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { - struct fd f; - ssize_t ret = -EBADF; - if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { - ret = -ESPIPE; - if (fd_file(f)->f_mode & FMODE_PREAD) - ret = vfs_read(fd_file(f), buf, count, &pos); - fdput(f); - } + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - return ret; + if (fd_file(f)->f_mode & FMODE_PREAD) + return vfs_read(fd_file(f), buf, count, &pos); + + return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, @@ -785,21 +775,17 @@ COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { - struct fd f; - ssize_t ret = -EBADF; - if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { - ret = -ESPIPE; - if (fd_file(f)->f_mode & FMODE_PWRITE) - ret = vfs_write(fd_file(f), buf, count, &pos); - fdput(f); - } + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; - return ret; + if (fd_file(f)->f_mode & FMODE_PWRITE) + return vfs_write(fd_file(f), buf, count, &pos); + + return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, @@ -1075,10 +1061,10 @@ out: static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -1087,7 +1073,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } if (ret > 0) @@ -1099,10 +1084,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; - if (fd_file(f)) { + if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; @@ -1111,7 +1096,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; - fdput_pos(f); } if (ret > 0) @@ -1129,18 +1113,16 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { - struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { + CLASS(fd, f)(fd); + if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); - fdput(f); } if (ret > 0) @@ -1152,18 +1134,16 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { - struct fd f; ssize_t ret = -EBADF; if (pos 
< 0) return -EINVAL; - f = fdget(fd); - if (fd_file(f)) { + CLASS(fd, f)(fd); + if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); - fdput(f); } if (ret > 0) @@ -1315,7 +1295,6 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { - struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; @@ -1326,35 +1305,32 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, /* * Get input file, and verify that it is ok.. */ - retval = -EBADF; - in = fdget(in_fd); - if (!fd_file(in)) - goto out; + CLASS(fd, in)(in_fd); + if (fd_empty(in)) + return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) - goto fput_in; - retval = -ESPIPE; + return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) - goto fput_in; + return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) - goto fput_in; + return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ - retval = -EBADF; - out = fdget(out_fd); - if (!fd_file(out)) - goto fput_in; + CLASS(fd, out)(out_fd); + if (fd_empty(out)) + return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) - goto fput_out; + return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; @@ -1363,9 +1339,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { - retval = -EOVERFLOW; if (pos >= max) - goto fput_out; + return -EOVERFLOW; count = max - pos; } @@ -1384,7 +1359,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) - goto fput_out; + return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { @@ -1410,12 +1385,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, inc_syscw(current); if (pos > max) retval = -EOVERFLOW; - -fput_out: - fdput(out); -fput_in: - fdput(in); -out: return retval; } @@ -1671,36 +1640,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, { loff_t pos_in; loff_t pos_out; - struct fd f_in; - struct fd f_out; ssize_t ret = -EBADF; - f_in = fdget(fd_in); - if (!fd_file(f_in)) - goto out2; + CLASS(fd, f_in)(fd_in); + if (fd_empty(f_in)) + return -EBADF; - f_out = fdget(fd_out); - if (!fd_file(f_out)) - goto out1; + CLASS(fd, f_out)(fd_out); + if (fd_empty(f_out)) + return -EBADF; - ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } - ret = -EINVAL; if (flags != 0) - goto out; + return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); @@ -1722,12 +1687,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, fd_file(f_out)->f_pos = pos_out; } } - -out: - fdput(f_out); -out1: - fdput(f_in); -out2: return ret; } @@ -1830,18 +1789,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool 
generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; - if (!IS_ALIGNED(pos, len)) - return false; + if (!IS_ALIGNED(iocb->ki_pos, len)) + return -EINVAL; - return true; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; } +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); diff --git a/fs/readdir.c b/fs/readdir.c index 6d29cab8576e..0038efda417b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, .dirent = dirent }; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; - fdput_pos(f); return error; } @@ -309,7 +308,7 @@ efault: SYSCALL_DEFINE3(getdents, unsigned int, fd, struct linux_dirent __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, .count = count, @@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } @@ -392,7 +389,7 @@ efault: SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct linux_dirent64 __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } @@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; - struct fd f = fdget_pos(fd); + CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, .dirent = dirent }; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); if (buf.result) error = buf.result; - fdput_pos(f); return error; } @@ -560,7 +554,7 @@ efault: COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { - struct fd f; + CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, @@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = iterate_dir(fd_file(f), &buf.ctx); @@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fdput_pos(f); return error; } #endif diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig deleted file mode 100644 index 0e6fe26458fe..000000000000 --- a/fs/reiserfs/Kconfig +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config 
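generic_atomic_write_valid() above now returns an errno (and additionally requires IOCB_DIRECT) instead of a bool, so callers can report exactly why an atomic write was rejected. A hedged sketch of a caller under that signature; the surrounding write path is assumed, not shown:

/* illustrative caller, not taken from this diff */
static ssize_t example_atomic_write(struct kiocb *iocb, struct iov_iter *from)
{
        if (iocb->ki_flags & IOCB_ATOMIC) {
                int ret = generic_atomic_write_valid(iocb, from);

                if (ret)
                        return ret;     /* -EINVAL or -EOPNOTSUPP, not just "false" */
        }
        /* ... continue with the normal direct I/O write ... */
        return 0;
}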
REISERFS_FS - tristate "Reiserfs support (deprecated)" - select BUFFER_HEAD - select CRC32 - select LEGACY_DIRECT_IO - help - Reiserfs is deprecated and scheduled to be removed from the kernel - in 2025. If you are still using it, please migrate to another - filesystem or tell us your usecase for reiserfs. - - Reiserfs stores not just filenames but the files themselves in a - balanced tree. Uses journalling. - - Balanced trees are more efficient than traditional file system - architectural foundations. - - In general, ReiserFS is as fast as ext2, but is very efficient with - large directories and small files. Additional patches are needed - for NFS and quotas, please see - <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links. - - It is more easily extended to have features currently found in - database and keyword search systems than block allocation based file - systems are. The next version will be so extended, and will support - plugins consistent with our motto ``It takes more than a license to - make source code open.'' - - Read <https://reiser4.wiki.kernel.org/index.php/Main_Page> - to learn more about reiserfs. - - Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. - - If you like it, you can pay us to add new features to it that you - need, buy a support contract, or pay us to port it to another OS. - -config REISERFS_CHECK - bool "Enable reiserfs debug mode" - depends on REISERFS_FS - help - If you set this to Y, then ReiserFS will perform every check it can - possibly imagine of its internal consistency throughout its - operation. It will also go substantially slower. More than once we - have forgotten that this was on, and then gone despondent over the - latest benchmarks.:-) Use of this option allows our team to go all - out in checking for consistency when debugging without fear of its - effect on end users. If you are on the verge of sending in a bug - report, say Y and you might get a useful error message. Almost - everyone should say N. - -config REISERFS_PROC_INFO - bool "Stats in /proc/fs/reiserfs" - depends on REISERFS_FS && PROC_FS - help - Create under /proc/fs/reiserfs a hierarchy of files, displaying - various ReiserFS statistics and internal data at the expense of - making your kernel or module slightly larger (+8 KB). This also - increases the amount of kernel memory required for each mount. - Almost everyone but ReiserFS developers and people fine-tuning - reiserfs or tracing problems should say N. - -config REISERFS_FS_XATTR - bool "ReiserFS extended attributes" - depends on REISERFS_FS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page for details). - - If unsure, say N. - -config REISERFS_FS_POSIX_ACL - bool "ReiserFS POSIX Access Control Lists" - depends on REISERFS_FS_XATTR - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - If you don't know what Access Control Lists are, say N - -config REISERFS_FS_SECURITY - bool "ReiserFS Security Labels" - depends on REISERFS_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ReiserFS filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. 
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile deleted file mode 100644 index bd29c58ccbd8..000000000000 --- a/fs/reiserfs/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for the linux reiser-filesystem routines. -# - -obj-$(CONFIG_REISERFS_FS) += reiserfs.o - -reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ - super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ - hashes.o tail_conversion.o journal.o resize.o \ - item_ops.o ioctl.o xattr.o lock.o - -ifeq ($(CONFIG_REISERFS_PROC_INFO),y) -reiserfs-objs += procfs.o -endif - -ifeq ($(CONFIG_REISERFS_FS_XATTR),y) -reiserfs-objs += xattr_user.o xattr_trusted.o -endif - -ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) -reiserfs-objs += xattr_security.o -endif - -ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) -reiserfs-objs += xattr_acl.o -endif - -TAGS: - etags *.c diff --git a/fs/reiserfs/README b/fs/reiserfs/README deleted file mode 100644 index 11e9ecf24b63..000000000000 --- a/fs/reiserfs/README +++ /dev/null @@ -1,151 +0,0 @@ -[LICENSING] - -ReiserFS is hereby licensed under the GNU General -Public License version 2. - -Source code files that contain the phrase "licensing governed by -reiserfs/README" are "governed files" throughout this file. Governed -files are licensed under the GPL. The portions of them owned by Hans -Reiser, or authorized to be licensed by him, have been in the past, -and likely will be in the future, licensed to other parties under -other licenses. If you add your code to governed files, and don't -want it to be owned by Hans Reiser, put your copyright label on that -code so the poor blight and his customers can keep things straight. -All portions of governed files not labeled otherwise are owned by Hans -Reiser, and by adding your code to it, widely distributing it to -others or sending us a patch, and leaving the sentence in stating that -licensing is governed by the statement in this file, you accept this. -It will be a kindness if you identify whether Hans Reiser is allowed -to license code labeled as owned by you on your behalf other than -under the GPL, because he wants to know if it is okay to do so and put -a check in the mail to you (for non-trivial improvements) when he -makes his next sale. He makes no guarantees as to the amount if any, -though he feels motivated to motivate contributors, and you can surely -discuss this with him before or after contributing. You have the -right to decline to allow him to license your code contribution other -than under the GPL. - -Further licensing options are available for commercial and/or other -interests directly from Hans Reiser: hans@reiser.to. If you interpret -the GPL as not allowing those additional licensing options, you read -it wrongly, and Richard Stallman agrees with me, when carefully read -you can see that those restrictions on additional terms do not apply -to the owner of the copyright, and my interpretation of this shall -govern for this license. - -Finally, nothing in this license shall be interpreted to allow you to -fail to fairly credit me, or to remove my credits, without my -permission, unless you are an end user not redistributing to others. -If you have doubts about how to properly do that, or about what is -fair, ask. (Last I spoke with him Richard was contemplating how best -to address the fair crediting issue in the next GPL version.) 
- -[END LICENSING] - -Reiserfs is a file system based on balanced tree algorithms, which is -described at https://reiser4.wiki.kernel.org/index.php/Main_Page - -Stop reading here. Go there, then return. - -Send bug reports to yura@namesys.botik.ru. - -mkreiserfs and other utilities are in reiserfs/utils, or wherever your -Linux provider put them. There is some disagreement about how useful -it is for users to get their fsck and mkreiserfs out of sync with the -version of reiserfs that is in their kernel, with many important -distributors wanting them out of sync.:-) Please try to remember to -recompile and reinstall fsck and mkreiserfs with every update of -reiserfs, this is a common source of confusion. Note that some of the -utilities cannot be compiled without accessing the balancing code -which is in the kernel code, and relocating the utilities may require -you to specify where that code can be found. - -Yes, if you update your reiserfs kernel module you do have to -recompile your kernel, most of the time. The errors you get will be -quite cryptic if your forget to do so. - -Real users, as opposed to folks who want to hack and then understand -what went wrong, will want REISERFS_CHECK off. - -Hideous Commercial Pitch: Spread your development costs across other OS -vendors. Select from the best in the world, not the best in your -building, by buying from third party OS component suppliers. Leverage -the software component development power of the internet. Be the most -aggressive in taking advantage of the commercial possibilities of -decentralized internet development, and add value through your branded -integration that you sell as an operating system. Let your competitors -be the ones to compete against the entire internet by themselves. Be -hip, get with the new economic trend, before your competitors do. Send -email to hans@reiser.to. - -To understand the code, after reading the website, start reading the -code by reading reiserfs_fs.h first. - -Hans Reiser was the project initiator, primary architect, source of all -funding for the first 5.5 years, and one of the programmers. He owns -the copyright. - -Vladimir Saveljev was one of the programmers, and he worked long hours -writing the cleanest code. He always made the effort to be the best he -could be, and to make his code the best that it could be. What resulted -was quite remarkable. I don't think that money can ever motivate someone -to work the way he did, he is one of the most selfless men I know. - -Yura helps with benchmarking, coding hashes, and block pre-allocation -code. - -Anatoly Pinchuk is a former member of our team who worked closely with -Vladimir throughout the project's development. He wrote a quite -substantial portion of the total code. He realized that there was a -space problem with packing tails of files for files larger than a node -that start on a node aligned boundary (there are reasons to want to node -align files), and he invented and implemented indirect items and -unformatted nodes as the solution. - -Konstantin Shvachko was taking part in the early days. - -Mikhail Gilula was a brilliant innovator that has shown much generosity. - -Grigory Zaigralin was an extremely effective system administrator for -our group. - -Igor Krasheninnikov was wonderful at hardware procurement, repair, and -network installation. - -Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a -textbook he got the algorithm from in the code. 
Note that his analysis -of how we could use the hashing code in making 32 bit NFS cookies work -was probably more important than the actual algorithm. Colin Plumb also -contributed to it. - -Chris Mason dived right into our code, and in just a few months produced -the journaling code that dramatically increased the value of ReiserFS. -He is just an amazing programmer. - -Igor Zagorovsky is writing much of the new item handler and extent code -for our next major release. - -Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the -resizer, and is hard at work on implementing allocate on flush. SGI -implemented allocate on flush before us for XFS, and generously took -the time to convince me we should do it also. They are great people, -and a great company. - -Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. - -Vitaly Fertman is doing fsck. - -Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably -the endian safe patches which allow ReiserFS to run on any platform -supported by the Linux kernel. - -SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the -Alpha PC Company made it possible for me to not have a day job -anymore, and to dramatically increase our staffing. Ecila funded -hypertext feature development, MP3.com funded journaling, SuSE funded -core development, IntegratedLinux.com funded squid web cache -appliances, bigstorage.com funded HSM, and the alpha PC company funded -the alpha port. Many of these tasks were helped by sponsors other -than the ones just named. SuSE has helped in much more than just -funding.... - diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h deleted file mode 100644 index 2571b1a8be84..000000000000 --- a/fs/reiserfs/acl.h +++ /dev/null @@ -1,78 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/init.h> -#include <linux/posix_acl.h> - -#define REISERFS_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} reiserfs_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} reiserfs_acl_entry_short; - -typedef struct { - __le32 a_version; -} reiserfs_acl_header; - -static inline size_t reiserfs_acl_size(int count) -{ - if (count <= 4) { - return sizeof(reiserfs_acl_header) + - count * sizeof(reiserfs_acl_entry_short); - } else { - return sizeof(reiserfs_acl_header) + - 4 * sizeof(reiserfs_acl_entry_short) + - (count - 4) * sizeof(reiserfs_acl_entry); - } -} - -static inline int reiserfs_acl_count(size_t size) -{ - ssize_t s; - size -= sizeof(reiserfs_acl_header); - s = size - 4 * sizeof(reiserfs_acl_entry_short); - if (s < 0) { - if (size % sizeof(reiserfs_acl_entry_short)) - return -1; - return size / sizeof(reiserfs_acl_entry_short); - } else { - if (s % sizeof(reiserfs_acl_entry)) - return -1; - return s / sizeof(reiserfs_acl_entry) + 4; - } -} - -#ifdef CONFIG_REISERFS_FS_POSIX_ACL -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); -int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, - struct posix_acl *acl, int type); -int reiserfs_acl_chmod(struct dentry *dentry); -int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, - struct inode *dir, struct dentry *dentry, - struct inode *inode); -int reiserfs_cache_default_acl(struct inode *dir); - -#else - -#define reiserfs_cache_default_acl(inode) 0 -#define reiserfs_get_acl NULL -#define reiserfs_set_acl NULL - -static inline int reiserfs_acl_chmod(struct dentry *dentry) -{ - return 0; -} - -static inline int 
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, - const struct inode *dir, struct dentry *dentry, - struct inode *inode) -{ - return 0; -} -#endif diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c deleted file mode 100644 index bf708ac287b4..000000000000 --- a/fs/reiserfs/bitmap.c +++ /dev/null @@ -1,1476 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ -/* Reiserfs block (de)allocator, bitmap-based. */ - -#include <linux/time.h> -#include "reiserfs.h" -#include <linux/errno.h> -#include <linux/buffer_head.h> -#include <linux/kernel.h> -#include <linux/pagemap.h> -#include <linux/vmalloc.h> -#include <linux/quotaops.h> -#include <linux/seq_file.h> - -#define PREALLOCATION_SIZE 9 - -/* different reiserfs block allocator options */ - -#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) - -#define _ALLOC_concentrating_formatted_nodes 0 -#define _ALLOC_displacing_large_files 1 -#define _ALLOC_displacing_new_packing_localities 2 -#define _ALLOC_old_hashed_relocation 3 -#define _ALLOC_new_hashed_relocation 4 -#define _ALLOC_skip_busy 5 -#define _ALLOC_displace_based_on_dirid 6 -#define _ALLOC_hashed_formatted_nodes 7 -#define _ALLOC_old_way 8 -#define _ALLOC_hundredth_slices 9 -#define _ALLOC_dirid_groups 10 -#define _ALLOC_oid_groups 11 -#define _ALLOC_packing_groups 12 - -#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) -#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) -#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) - -#define SET_OPTION(optname) \ - do { \ - reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ - set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ - } while(0) -#define TEST_OPTION(optname, s) \ - test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) - -static inline void get_bit_address(struct super_block *s, - b_blocknr_t block, - unsigned int *bmap_nr, - unsigned int *offset) -{ - /* - * It is in the bitmap block number equal to the block - * number divided by the number of bits in a block. - */ - *bmap_nr = block >> (s->s_blocksize_bits + 3); - /* Within that bitmap block it is located at bit offset *offset. */ - *offset = block & ((s->s_blocksize << 3) - 1); -} - -int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) -{ - unsigned int bmap, offset; - unsigned int bmap_count = reiserfs_bmap_count(s); - - if (block == 0 || block >= SB_BLOCK_COUNT(s)) { - reiserfs_error(s, "vs-4010", - "block number is out of range %lu (%u)", - block, SB_BLOCK_COUNT(s)); - return 0; - } - - get_bit_address(s, block, &bmap, &offset); - - /* - * Old format filesystem? Unlikely, but the bitmaps are all - * up front so we need to account for it. 
- */ - if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &REISERFS_SB(s)->s_properties))) { - b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; - if (block >= bmap1 && - block <= bmap1 + bmap_count) { - reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " - "can't be freed or reused", - block, bmap_count); - return 0; - } - } else { - if (offset == 0) { - reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " - "can't be freed or reused", - block, bmap_count); - return 0; - } - } - - if (bmap >= bmap_count) { - reiserfs_error(s, "vs-4030", "bitmap for requested block " - "is out of range: block=%lu, bitmap_nr=%u", - block, bmap); - return 0; - } - - if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { - reiserfs_error(s, "vs-4050", "this is root block (%u), " - "it must be busy", SB_ROOT_BLOCK(s)); - return 0; - } - - return 1; -} - -/* - * Searches in journal structures for a given block number (bmap, off). - * If block is found in reiserfs journal it suggests next free block - * candidate to test. - */ -static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, - int off, int *next) -{ - b_blocknr_t tmp; - - if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { - if (tmp) { /* hint supplied */ - *next = tmp; - PROC_INFO_INC(s, scan_bitmap.in_journal_hint); - } else { - (*next) = off + 1; /* inc offset to avoid looping. */ - PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); - } - PROC_INFO_INC(s, scan_bitmap.retry); - return 1; - } - return 0; -} - -/* - * Searches for a window of zero bits with given minimum and maximum - * lengths in one bitmap block - */ -static int scan_bitmap_block(struct reiserfs_transaction_handle *th, - unsigned int bmap_n, int *beg, int boundary, - int min, int max, int unfm) -{ - struct super_block *s = th->t_super; - struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; - struct buffer_head *bh; - int end, next; - int org = *beg; - - BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " - "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); - PROC_INFO_INC(s, scan_bitmap.bmap); - - if (!bi) { - reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " - "for bitmap %d", bmap_n); - return 0; - } - - bh = reiserfs_read_bitmap_block(s, bmap_n); - if (bh == NULL) - return 0; - - while (1) { -cont: - if (bi->free_count < min) { - brelse(bh); - return 0; /* No free blocks in this bitmap */ - } - - /* search for a first zero bit -- beginning of a window */ - *beg = reiserfs_find_next_zero_le_bit - ((unsigned long *)(bh->b_data), boundary, *beg); - - /* - * search for a zero bit fails or the rest of bitmap block - * cannot contain a zero window of minimum size - */ - if (*beg + min > boundary) { - brelse(bh); - return 0; - } - - if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) - continue; - /* first zero bit found; we check next bits */ - for (end = *beg + 1;; end++) { - if (end >= *beg + max || end >= boundary - || reiserfs_test_le_bit(end, bh->b_data)) { - next = end; - break; - } - - /* - * finding the other end of zero bit window requires - * looking into journal structures (in case of - * searching for free blocks for unformatted nodes) - */ - if (unfm && is_block_in_journal(s, bmap_n, end, &next)) - break; - } - - /* - * now (*beg) points to beginning of zero bits window, - * (end) points to one bit after the window end - */ - - /* found window of proper size */ - if (end - *beg >= min) { - int i; - reiserfs_prepare_for_journal(s, bh, 1); - /* - * try to set all blocks used checking are - * they 
still free - */ - for (i = *beg; i < end; i++) { - /* Don't check in journal again. */ - if (reiserfs_test_and_set_le_bit - (i, bh->b_data)) { - /* - * bit was set by another process while - * we slept in prepare_for_journal() - */ - PROC_INFO_INC(s, scan_bitmap.stolen); - - /* - * we can continue with smaller set - * of allocated blocks, if length of - * this set is more or equal to `min' - */ - if (i >= *beg + min) { - end = i; - break; - } - - /* - * otherwise we clear all bit - * were set ... - */ - while (--i >= *beg) - reiserfs_clear_le_bit - (i, bh->b_data); - reiserfs_restore_prepared_buffer(s, bh); - *beg = org; - - /* - * Search again in current block - * from beginning - */ - goto cont; - } - } - bi->free_count -= (end - *beg); - journal_mark_dirty(th, bh); - brelse(bh); - - /* free block count calculation */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), - 1); - PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); - journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); - - return end - (*beg); - } else { - *beg = next; - } - } -} - -static int bmap_hash_id(struct super_block *s, u32 id) -{ - char *hash_in = NULL; - unsigned long hash; - unsigned bm; - - if (id <= 2) { - bm = 1; - } else { - hash_in = (char *)(&id); - hash = keyed_hash(hash_in, 4); - bm = hash % reiserfs_bmap_count(s); - if (!bm) - bm = 1; - } - /* this can only be true when SB_BMAP_NR = 1 */ - if (bm >= reiserfs_bmap_count(s)) - bm = 0; - return bm; -} - -/* - * hashes the id and then returns > 0 if the block group for the - * corresponding hash is full - */ -static inline int block_group_used(struct super_block *s, u32 id) -{ - int bm = bmap_hash_id(s, id); - struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; - - /* - * If we don't have cached information on this bitmap block, we're - * going to have to load it later anyway. Loading it here allows us - * to make a better decision. This favors long-term performance gain - * with a better on-disk layout vs. a short term gain of skipping the - * read and potentially having a bad placement. - */ - if (info->free_count == UINT_MAX) { - struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); - brelse(bh); - } - - if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) { - return 0; - } - return 1; -} - -/* - * the packing is returned in disk byte order - */ -__le32 reiserfs_choose_packing(struct inode * dir) -{ - __le32 packing; - if (TEST_OPTION(packing_groups, dir->i_sb)) { - u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); - /* - * some versions of reiserfsck expect packing locality 1 to be - * special - */ - if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) - packing = INODE_PKEY(dir)->k_objectid; - else - packing = INODE_PKEY(dir)->k_dir_id; - } else - packing = INODE_PKEY(dir)->k_objectid; - return packing; -} - -/* - * Tries to find contiguous zero bit window (given size) in given region of - * bitmap and place new blocks there. Returns number of allocated blocks. 
- */ -static int scan_bitmap(struct reiserfs_transaction_handle *th, - b_blocknr_t * start, b_blocknr_t finish, - int min, int max, int unfm, sector_t file_block) -{ - int nr_allocated = 0; - struct super_block *s = th->t_super; - unsigned int bm, off; - unsigned int end_bm, end_off; - unsigned int off_max = s->s_blocksize << 3; - - BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, scan_bitmap.call); - - /* No point in looking for more free blocks */ - if (SB_FREE_BLOCKS(s) <= 0) - return 0; - - get_bit_address(s, *start, &bm, &off); - get_bit_address(s, finish, &end_bm, &end_off); - if (bm > reiserfs_bmap_count(s)) - return 0; - if (end_bm > reiserfs_bmap_count(s)) - end_bm = reiserfs_bmap_count(s); - - /* - * When the bitmap is more than 10% free, anyone can allocate. - * When it's less than 10% free, only files that already use the - * bitmap are allowed. Once we pass 80% full, this restriction - * is lifted. - * - * We do this so that files that grow later still have space close to - * their original allocation. This improves locality, and presumably - * performance as a result. - * - * This is only an allocation policy and does not make up for getting a - * bad hint. Decent hinting must be implemented for this to work well. - */ - if (TEST_OPTION(skip_busy, s) - && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { - for (; bm < end_bm; bm++, off = 0) { - if ((off && (!unfm || (file_block != 0))) - || SB_AP_BITMAP(s)[bm].free_count > - (s->s_blocksize << 3) / 10) - nr_allocated = - scan_bitmap_block(th, bm, &off, off_max, - min, max, unfm); - if (nr_allocated) - goto ret; - } - /* we know from above that start is a reasonable number */ - get_bit_address(s, *start, &bm, &off); - } - - for (; bm < end_bm; bm++, off = 0) { - nr_allocated = - scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); - if (nr_allocated) - goto ret; - } - - nr_allocated = - scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); - -ret: - *start = bm * off_max + off; - return nr_allocated; - -} - -static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block, - int for_unformatted) -{ - struct super_block *s = th->t_super; - struct reiserfs_super_block *rs; - struct buffer_head *sbh, *bmbh; - struct reiserfs_bitmap_info *apbi; - unsigned int nr, offset; - - BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, free_block); - rs = SB_DISK_SUPER_BLOCK(s); - sbh = SB_BUFFER_WITH_SB(s); - apbi = SB_AP_BITMAP(s); - - get_bit_address(s, block, &nr, &offset); - - if (nr >= reiserfs_bmap_count(s)) { - reiserfs_error(s, "vs-4075", "block %lu is out of range", - block); - return; - } - - bmbh = reiserfs_read_bitmap_block(s, nr); - if (!bmbh) - return; - - reiserfs_prepare_for_journal(s, bmbh, 1); - - /* clear bit for the given block in bit map */ - if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { - reiserfs_error(s, "vs-4080", - "block %lu: bit already cleared", block); - } - apbi[nr].free_count++; - journal_mark_dirty(th, bmbh); - brelse(bmbh); - - reiserfs_prepare_for_journal(s, sbh, 1); - /* update super block */ - set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); - - journal_mark_dirty(th, sbh); - if (for_unformatted) { - int depth = reiserfs_write_unlock_nested(s); - dquot_free_block_nodirty(inode, 1); - reiserfs_write_lock_nested(s, depth); - } -} - -void reiserfs_free_block(struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block, - int for_unformatted) -{ - struct super_block *s = th->t_super; - - BUG_ON(!th->t_trans_id); - RFALSE(!s, 
"vs-4061: trying to free block on nonexistent device"); - if (!is_reusable(s, block, 1)) - return; - - if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { - reiserfs_error(th->t_super, "bitmap-4072", - "Trying to free block outside file system " - "boundaries (%lu > %lu)", - block, sb_block_count(REISERFS_SB(s)->s_rs)); - return; - } - /* mark it before we clear it, just in case */ - journal_mark_freed(th, s, block); - _reiserfs_free_block(th, inode, block, for_unformatted); -} - -/* preallocated blocks don't need to be run through journal_mark_freed */ -static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block) -{ - BUG_ON(!th->t_trans_id); - RFALSE(!th->t_super, - "vs-4060: trying to free block on nonexistent device"); - if (!is_reusable(th->t_super, block, 1)) - return; - _reiserfs_free_block(th, inode, block, 1); -} - -static void __discard_prealloc(struct reiserfs_transaction_handle *th, - struct reiserfs_inode_info *ei) -{ - unsigned long save = ei->i_prealloc_block; - int dirty = 0; - struct inode *inode = &ei->vfs_inode; - - BUG_ON(!th->t_trans_id); -#ifdef CONFIG_REISERFS_CHECK - if (ei->i_prealloc_count < 0) - reiserfs_error(th->t_super, "zam-4001", - "inode has negative prealloc blocks count."); -#endif - while (ei->i_prealloc_count > 0) { - b_blocknr_t block_to_free; - - /* - * reiserfs_free_prealloc_block can drop the write lock, - * which could allow another caller to free the same block. - * We can protect against it by modifying the prealloc - * state before calling it. - */ - block_to_free = ei->i_prealloc_block++; - ei->i_prealloc_count--; - reiserfs_free_prealloc_block(th, inode, block_to_free); - dirty = 1; - } - if (dirty) - reiserfs_update_sd(th, inode); - ei->i_prealloc_block = save; - list_del_init(&ei->i_prealloc_list); -} - -/* FIXME: It should be inline function */ -void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, - struct inode *inode) -{ - struct reiserfs_inode_info *ei = REISERFS_I(inode); - - BUG_ON(!th->t_trans_id); - if (ei->i_prealloc_count) - __discard_prealloc(th, ei); -} - -void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) -{ - struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; - - BUG_ON(!th->t_trans_id); - while (!list_empty(plist)) { - struct reiserfs_inode_info *ei; - ei = list_entry(plist->next, struct reiserfs_inode_info, - i_prealloc_list); -#ifdef CONFIG_REISERFS_CHECK - if (!ei->i_prealloc_count) { - reiserfs_error(th->t_super, "zam-4001", - "inode is in prealloc list but has " - "no preallocated blocks."); - } -#endif - __discard_prealloc(th, ei); - } -} - -void reiserfs_init_alloc_options(struct super_block *s) -{ - set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); - set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); - set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); -} - -/* block allocator related options are parsed here */ -int reiserfs_parse_alloc_options(struct super_block *s, char *options) -{ - char *this_char, *value; - - /* clear default settings */ - REISERFS_SB(s)->s_alloc_options.bits = 0; - - while ((this_char = strsep(&options, ":")) != NULL) { - if ((value = strchr(this_char, '=')) != NULL) - *value++ = 0; - - if (!strcmp(this_char, "concentrating_formatted_nodes")) { - int temp; - SET_OPTION(concentrating_formatted_nodes); - temp = (value - && *value) ? 
simple_strtoul(value, &value, - 0) : 10; - if (temp <= 0 || temp > 100) { - REISERFS_SB(s)->s_alloc_options.border = 10; - } else { - REISERFS_SB(s)->s_alloc_options.border = - 100 / temp; - } - continue; - } - if (!strcmp(this_char, "displacing_large_files")) { - SET_OPTION(displacing_large_files); - REISERFS_SB(s)->s_alloc_options.large_file_size = - (value - && *value) ? simple_strtoul(value, &value, 0) : 16; - continue; - } - if (!strcmp(this_char, "displacing_new_packing_localities")) { - SET_OPTION(displacing_new_packing_localities); - continue; - } - - if (!strcmp(this_char, "old_hashed_relocation")) { - SET_OPTION(old_hashed_relocation); - continue; - } - - if (!strcmp(this_char, "new_hashed_relocation")) { - SET_OPTION(new_hashed_relocation); - continue; - } - - if (!strcmp(this_char, "dirid_groups")) { - SET_OPTION(dirid_groups); - continue; - } - if (!strcmp(this_char, "oid_groups")) { - SET_OPTION(oid_groups); - continue; - } - if (!strcmp(this_char, "packing_groups")) { - SET_OPTION(packing_groups); - continue; - } - if (!strcmp(this_char, "hashed_formatted_nodes")) { - SET_OPTION(hashed_formatted_nodes); - continue; - } - - if (!strcmp(this_char, "skip_busy")) { - SET_OPTION(skip_busy); - continue; - } - - if (!strcmp(this_char, "hundredth_slices")) { - SET_OPTION(hundredth_slices); - continue; - } - - if (!strcmp(this_char, "old_way")) { - SET_OPTION(old_way); - continue; - } - - if (!strcmp(this_char, "displace_based_on_dirid")) { - SET_OPTION(displace_based_on_dirid); - continue; - } - - if (!strcmp(this_char, "preallocmin")) { - REISERFS_SB(s)->s_alloc_options.preallocmin = - (value - && *value) ? simple_strtoul(value, &value, 0) : 4; - continue; - } - - if (!strcmp(this_char, "preallocsize")) { - REISERFS_SB(s)->s_alloc_options.preallocsize = - (value - && *value) ? 
simple_strtoul(value, &value, - 0) : - PREALLOCATION_SIZE; - continue; - } - - reiserfs_warning(s, "zam-4001", "unknown option - %s", - this_char); - return 1; - } - - reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); - return 0; -} - -static void print_sep(struct seq_file *seq, int *first) -{ - if (!*first) - seq_puts(seq, ":"); - else - *first = 0; -} - -void show_alloc_options(struct seq_file *seq, struct super_block *s) -{ - int first = 1; - - if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | - (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) - return; - - seq_puts(seq, ",alloc="); - - if (TEST_OPTION(concentrating_formatted_nodes, s)) { - print_sep(seq, &first); - if (REISERFS_SB(s)->s_alloc_options.border != 10) { - seq_printf(seq, "concentrating_formatted_nodes=%d", - 100 / REISERFS_SB(s)->s_alloc_options.border); - } else - seq_puts(seq, "concentrating_formatted_nodes"); - } - if (TEST_OPTION(displacing_large_files, s)) { - print_sep(seq, &first); - if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { - seq_printf(seq, "displacing_large_files=%lu", - REISERFS_SB(s)->s_alloc_options.large_file_size); - } else - seq_puts(seq, "displacing_large_files"); - } - if (TEST_OPTION(displacing_new_packing_localities, s)) { - print_sep(seq, &first); - seq_puts(seq, "displacing_new_packing_localities"); - } - if (TEST_OPTION(old_hashed_relocation, s)) { - print_sep(seq, &first); - seq_puts(seq, "old_hashed_relocation"); - } - if (TEST_OPTION(new_hashed_relocation, s)) { - print_sep(seq, &first); - seq_puts(seq, "new_hashed_relocation"); - } - if (TEST_OPTION(dirid_groups, s)) { - print_sep(seq, &first); - seq_puts(seq, "dirid_groups"); - } - if (TEST_OPTION(oid_groups, s)) { - print_sep(seq, &first); - seq_puts(seq, "oid_groups"); - } - if (TEST_OPTION(packing_groups, s)) { - print_sep(seq, &first); - seq_puts(seq, "packing_groups"); - } - if (TEST_OPTION(hashed_formatted_nodes, s)) { - print_sep(seq, &first); - seq_puts(seq, "hashed_formatted_nodes"); - } - if (TEST_OPTION(skip_busy, s)) { - print_sep(seq, &first); - seq_puts(seq, "skip_busy"); - } - if (TEST_OPTION(hundredth_slices, s)) { - print_sep(seq, &first); - seq_puts(seq, "hundredth_slices"); - } - if (TEST_OPTION(old_way, s)) { - print_sep(seq, &first); - seq_puts(seq, "old_way"); - } - if (TEST_OPTION(displace_based_on_dirid, s)) { - print_sep(seq, &first); - seq_puts(seq, "displace_based_on_dirid"); - } - if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { - print_sep(seq, &first); - seq_printf(seq, "preallocmin=%d", - REISERFS_SB(s)->s_alloc_options.preallocmin); - } - if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { - print_sep(seq, &first); - seq_printf(seq, "preallocsize=%d", - REISERFS_SB(s)->s_alloc_options.preallocsize); - } -} - -static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) -{ - char *hash_in; - - if (hint->formatted_node) { - hash_in = (char *)&hint->key.k_dir_id; - } else { - if (!hint->inode) { - /*hint->search_start = hint->beg;*/ - hash_in = (char *)&hint->key.k_dir_id; - } else - if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); - else - hash_in = - (char *)(&INODE_PKEY(hint->inode)->k_objectid); - } - - hint->search_start = - hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); -} - -/* - * Relocation based on dirid, hashing them into a given bitmap block - * files. 
Formatted nodes are unaffected, a separate policy covers them - */ -static void dirid_groups(reiserfs_blocknr_hint_t * hint) -{ - unsigned long hash; - __u32 dirid = 0; - int bm = 0; - struct super_block *sb = hint->th->t_super; - - if (hint->inode) - dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - else if (hint->formatted_node) - dirid = hint->key.k_dir_id; - - if (dirid) { - bm = bmap_hash_id(sb, dirid); - hash = bm * (sb->s_blocksize << 3); - /* give a portion of the block group to metadata */ - if (hint->inode) - hash += sb->s_blocksize / 2; - hint->search_start = hash; - } -} - -/* - * Relocation based on oid, hashing them into a given bitmap block - * files. Formatted nodes are unaffected, a separate policy covers them - */ -static void oid_groups(reiserfs_blocknr_hint_t * hint) -{ - if (hint->inode) { - unsigned long hash; - __u32 oid; - __u32 dirid; - int bm; - - dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - - /* - * keep the root dir and it's first set of subdirs close to - * the start of the disk - */ - if (dirid <= 2) - hash = (hint->inode->i_sb->s_blocksize << 3); - else { - oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); - bm = bmap_hash_id(hint->inode->i_sb, oid); - hash = bm * (hint->inode->i_sb->s_blocksize << 3); - } - hint->search_start = hash; - } -} - -/* - * returns 1 if it finds an indirect item and gets valid hint info - * from it, otherwise 0 - */ -static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) -{ - struct treepath *path; - struct buffer_head *bh; - struct item_head *ih; - int pos_in_item; - __le32 *item; - int ret = 0; - - /* - * reiserfs code can call this function w/o pointer to path - * structure supplied; then we rely on supplied search_start - */ - if (!hint->path) - return 0; - - path = hint->path; - bh = get_last_bh(path); - RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); - ih = tp_item_head(path); - pos_in_item = path->pos_in_item; - item = tp_item_body(path); - - hint->search_start = bh->b_blocknr; - - /* - * for indirect item: go to left and look for the first non-hole entry - * in the indirect item - */ - if (!hint->formatted_node && is_indirect_le_ih(ih)) { - if (pos_in_item == I_UNFM_NUM(ih)) - pos_in_item--; - while (pos_in_item >= 0) { - int t = get_block_num(item, pos_in_item); - if (t) { - hint->search_start = t; - ret = 1; - break; - } - pos_in_item--; - } - } - - /* does result value fit into specified region? */ - return ret; -} - -/* - * should be, if formatted node, then try to put on first part of the device - * specified as number of percent with mount option device, else try to put - * on last of device. This is not to say it is good code to do so, - * but the effect should be measured. 
- */ -static inline void set_border_in_hint(struct super_block *s, - reiserfs_blocknr_hint_t * hint) -{ - b_blocknr_t border = - SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; - - if (hint->formatted_node) - hint->end = border - 1; - else - hint->beg = border; -} - -static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) -{ - if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hint->search_start = - hint->beg + - keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), - 4) % (hint->end - hint->beg); - else - hint->search_start = - hint->beg + - keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), - 4) % (hint->end - hint->beg); -} - -static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) -{ - char *hash_in; - - if (!hint->inode) - hash_in = (char *)&hint->key.k_dir_id; - else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); - else - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); - - hint->search_start = - hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); -} - -static inline int -this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * - hint) -{ - return hint->block == - REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; -} - -#ifdef DISPLACE_NEW_PACKING_LOCALITIES -static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) -{ - struct in_core_key *key = &hint->key; - - hint->th->displace_new_blocks = 0; - hint->search_start = - hint->beg + keyed_hash((char *)(&key->k_objectid), - 4) % (hint->end - hint->beg); -} -#endif - -static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) -{ - b_blocknr_t border; - u32 hash_in; - - if (hint->formatted_node || hint->inode == NULL) { - return 0; - } - - hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); - border = - hint->beg + (u32) keyed_hash(((char *)(&hash_in)), - 4) % (hint->end - hint->beg - 1); - if (border > hint->search_start) - hint->search_start = border; - - return 1; -} - -static inline int old_way(reiserfs_blocknr_hint_t * hint) -{ - b_blocknr_t border; - - if (hint->formatted_node || hint->inode == NULL) { - return 0; - } - - border = - hint->beg + - le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - - hint->beg); - if (border > hint->search_start) - hint->search_start = border; - - return 1; -} - -static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) -{ - struct in_core_key *key = &hint->key; - b_blocknr_t slice_start; - - slice_start = - (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); - if (slice_start > hint->search_start - || slice_start + (hint->end / 100) <= hint->search_start) { - hint->search_start = slice_start; - } -} - -static void determine_search_start(reiserfs_blocknr_hint_t * hint, - int amount_needed) -{ - struct super_block *s = hint->th->t_super; - int unfm_hint; - - hint->beg = 0; - hint->end = SB_BLOCK_COUNT(s) - 1; - - /* This is former border algorithm. Now with tunable border offset */ - if (concentrating_formatted_nodes(s)) - set_border_in_hint(s, hint); - -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* - * whenever we create a new directory, we displace it. 
At first - * we will hash for location, later we might look for a moderately - * empty place for it - */ - if (displacing_new_packing_localities(s) - && hint->th->displace_new_blocks) { - displace_new_packing_locality(hint); - - /* - * we do not continue determine_search_start, - * if new packing locality is being displaced - */ - return; - } -#endif - - /* - * all persons should feel encouraged to add more special cases - * here and test them - */ - - if (displacing_large_files(s) && !hint->formatted_node - && this_blocknr_allocation_would_make_it_a_large_file(hint)) { - displace_large_file(hint); - return; - } - - /* - * if none of our special cases is relevant, use the left - * neighbor in the tree order of the new node we are allocating for - */ - if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { - hash_formatted_node(hint); - return; - } - - unfm_hint = get_left_neighbor(hint); - - /* - * Mimic old block allocator behaviour, that is if VFS allowed for - * preallocation, new blocks are displaced based on directory ID. - * Also, if suggested search_start is less than last preallocated - * block, we start searching from it, assuming that HDD dataflow - * is faster in forward direction - */ - if (TEST_OPTION(old_way, s)) { - if (!hint->formatted_node) { - if (!reiserfs_hashed_relocation(s)) - old_way(hint); - else if (!reiserfs_no_unhashed_relocation(s)) - old_hashed_relocation(hint); - - if (hint->inode - && hint->search_start < - REISERFS_I(hint->inode)->i_prealloc_block) - hint->search_start = - REISERFS_I(hint->inode)->i_prealloc_block; - } - return; - } - - /* This is an approach proposed by Hans */ - if (TEST_OPTION(hundredth_slices, s) - && !(displacing_large_files(s) && !hint->formatted_node)) { - hundredth_slices(hint); - return; - } - - /* old_hashed_relocation only works on unformatted */ - if (!unfm_hint && !hint->formatted_node && - TEST_OPTION(old_hashed_relocation, s)) { - old_hashed_relocation(hint); - } - - /* new_hashed_relocation works with both formatted/unformatted nodes */ - if ((!unfm_hint || hint->formatted_node) && - TEST_OPTION(new_hashed_relocation, s)) { - new_hashed_relocation(hint); - } - - /* dirid grouping works only on unformatted nodes */ - if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { - dirid_groups(hint); - } -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { - dirid_groups(hint); - } -#endif - - /* oid grouping works only on unformatted nodes */ - if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { - oid_groups(hint); - } - return; -} - -static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) -{ - /* make minimum size a mount option and benchmark both ways */ - /* we preallocate blocks only for regular files, specific size */ - /* benchmark preallocating always and see what happens */ - - hint->prealloc_size = 0; - - if (!hint->formatted_node && hint->preallocate) { - if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode) - && hint->inode->i_size >= - REISERFS_SB(hint->th->t_super)->s_alloc_options. - preallocmin * hint->inode->i_sb->s_blocksize) - hint->prealloc_size = - REISERFS_SB(hint->th->t_super)->s_alloc_options. 
- preallocsize - 1; - } - return CARRY_ON; -} - -static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, - b_blocknr_t * new_blocknrs, - b_blocknr_t start, - b_blocknr_t finish, int min, - int amount_needed, - int prealloc_size) -{ - int rest = amount_needed; - int nr_allocated; - - while (rest > 0 && start <= finish) { - nr_allocated = scan_bitmap(hint->th, &start, finish, min, - rest + prealloc_size, - !hint->formatted_node, hint->block); - - if (nr_allocated == 0) /* no new blocks allocated, return */ - break; - - /* fill free_blocknrs array first */ - while (rest > 0 && nr_allocated > 0) { - *new_blocknrs++ = start++; - rest--; - nr_allocated--; - } - - /* do we have something to fill prealloc. array also ? */ - if (nr_allocated > 0) { - /* - * it means prealloc_size was greater that 0 and - * we do preallocation - */ - list_add(&REISERFS_I(hint->inode)->i_prealloc_list, - &SB_JOURNAL(hint->th->t_super)-> - j_prealloc_list); - REISERFS_I(hint->inode)->i_prealloc_block = start; - REISERFS_I(hint->inode)->i_prealloc_count = - nr_allocated; - break; - } - } - - return (amount_needed - rest); -} - -static inline int blocknrs_and_prealloc_arrays_from_search_start - (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, - int amount_needed) { - struct super_block *s = hint->th->t_super; - b_blocknr_t start = hint->search_start; - b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; - int passno = 0; - int nr_allocated = 0; - int depth; - - determine_prealloc_size(hint); - if (!hint->formatted_node) { - int quota_ret; -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(s, REISERFS_DEBUG_CODE, - "reiserquota: allocating %d blocks id=%u", - amount_needed, hint->inode->i_uid); -#endif - depth = reiserfs_write_unlock_nested(s); - quota_ret = - dquot_alloc_block_nodirty(hint->inode, amount_needed); - if (quota_ret) { /* Quota exceeded? 
*/ - reiserfs_write_lock_nested(s, depth); - return QUOTA_EXCEEDED; - } - if (hint->preallocate && hint->prealloc_size) { -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(s, REISERFS_DEBUG_CODE, - "reiserquota: allocating (prealloc) %d blocks id=%u", - hint->prealloc_size, hint->inode->i_uid); -#endif - quota_ret = dquot_prealloc_block_nodirty(hint->inode, - hint->prealloc_size); - if (quota_ret) - hint->preallocate = hint->prealloc_size = 0; - } - /* for unformatted nodes, force large allocations */ - reiserfs_write_lock_nested(s, depth); - } - - do { - switch (passno++) { - case 0: /* Search from hint->search_start to end of disk */ - start = hint->search_start; - finish = SB_BLOCK_COUNT(s) - 1; - break; - case 1: /* Search from hint->beg to hint->search_start */ - start = hint->beg; - finish = hint->search_start; - break; - case 2: /* Last chance: Search from 0 to hint->beg */ - start = 0; - finish = hint->beg; - break; - default: - /* We've tried searching everywhere, not enough space */ - /* Free the blocks */ - if (!hint->formatted_node) { -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(s, REISERFS_DEBUG_CODE, - "reiserquota: freeing (nospace) %d blocks id=%u", - amount_needed + - hint->prealloc_size - - nr_allocated, - hint->inode->i_uid); -#endif - /* Free not allocated blocks */ - depth = reiserfs_write_unlock_nested(s); - dquot_free_block_nodirty(hint->inode, - amount_needed + hint->prealloc_size - - nr_allocated); - reiserfs_write_lock_nested(s, depth); - } - while (nr_allocated--) - reiserfs_free_block(hint->th, hint->inode, - new_blocknrs[nr_allocated], - !hint->formatted_node); - - return NO_DISK_SPACE; - } - } while ((nr_allocated += allocate_without_wrapping_disk(hint, - new_blocknrs + - nr_allocated, - start, finish, - 1, - amount_needed - - nr_allocated, - hint-> - prealloc_size)) - < amount_needed); - if (!hint->formatted_node && - amount_needed + hint->prealloc_size > - nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { - /* Some of preallocation blocks were not allocated */ -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(s, REISERFS_DEBUG_CODE, - "reiserquota: freeing (failed prealloc) %d blocks id=%u", - amount_needed + hint->prealloc_size - - nr_allocated - - REISERFS_I(hint->inode)->i_prealloc_count, - hint->inode->i_uid); -#endif - - depth = reiserfs_write_unlock_nested(s); - dquot_free_block_nodirty(hint->inode, amount_needed + - hint->prealloc_size - nr_allocated - - REISERFS_I(hint->inode)-> - i_prealloc_count); - reiserfs_write_lock_nested(s, depth); - } - - return CARRY_ON; -} - -/* grab new blocknrs from preallocated list */ -/* return amount still needed after using them */ -static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, - b_blocknr_t * new_blocknrs, - int amount_needed) -{ - struct inode *inode = hint->inode; - - if (REISERFS_I(inode)->i_prealloc_count > 0) { - while (amount_needed) { - - *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; - REISERFS_I(inode)->i_prealloc_count--; - - amount_needed--; - - if (REISERFS_I(inode)->i_prealloc_count <= 0) { - list_del(&REISERFS_I(inode)->i_prealloc_list); - break; - } - } - } - /* return amount still needed after using preallocated blocks */ - return amount_needed; -} - -int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, - b_blocknr_t *new_blocknrs, - int amount_needed, - /* Amount of blocks we have already reserved */ - int reserved_by_us) -{ - int initial_amount_needed = amount_needed; - int ret; - struct super_block *s = hint->th->t_super; - - /* Check if there is enough 
space, taking into account reserved space */ - if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < - amount_needed - reserved_by_us) - return NO_DISK_SPACE; - /* should this be if !hint->inode && hint->preallocate? */ - /* do you mean hint->formatted_node can be removed ? - Zam */ - /* - * hint->formatted_node cannot be removed because we try to access - * inode information here, and there is often no inode associated with - * metadata allocations - green - */ - - if (!hint->formatted_node && hint->preallocate) { - amount_needed = use_preallocated_list_if_available - (hint, new_blocknrs, amount_needed); - - /* - * We have all the block numbers we need from the - * prealloc list - */ - if (amount_needed == 0) - return CARRY_ON; - new_blocknrs += (initial_amount_needed - amount_needed); - } - - /* find search start and save it in hint structure */ - determine_search_start(hint, amount_needed); - if (hint->search_start >= SB_BLOCK_COUNT(s)) - hint->search_start = SB_BLOCK_COUNT(s) - 1; - - /* allocation itself; fill new_blocknrs and preallocation arrays */ - ret = blocknrs_and_prealloc_arrays_from_search_start - (hint, new_blocknrs, amount_needed); - - /* - * We used prealloc. list to fill (partially) new_blocknrs array. - * If final allocation fails we need to return blocks back to - * prealloc. list or just free them. -- Zam (I chose second - * variant) - */ - if (ret != CARRY_ON) { - while (amount_needed++ < initial_amount_needed) { - reiserfs_free_block(hint->th, hint->inode, - *(--new_blocknrs), 1); - } - } - return ret; -} - -void reiserfs_cache_bitmap_metadata(struct super_block *sb, - struct buffer_head *bh, - struct reiserfs_bitmap_info *info) -{ - unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); - - /* The first bit must ALWAYS be 1 */ - if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) - reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " - "corrupted: first bit must be 1", bh->b_blocknr); - - info->free_count = 0; - - while (--cur >= (unsigned long *)bh->b_data) { - /* 0 and ~0 are special, we can optimize for them */ - if (*cur == 0) - info->free_count += BITS_PER_LONG; - else if (*cur != ~0L) /* A mix, investigate */ - info->free_count += BITS_PER_LONG - hweight_long(*cur); - } -} - -struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, - unsigned int bitmap) -{ - b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; - struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; - struct buffer_head *bh; - - /* - * Way old format filesystems had the bitmaps packed up front. - * I doubt there are any of these left, but just in case... 
- */ - if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &REISERFS_SB(sb)->s_properties))) - block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; - else if (bitmap == 0) - block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; - - bh = sb_bread(sb, block); - if (bh == NULL) - reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " - "reading failed", __func__, block); - else { - if (buffer_locked(bh)) { - int depth; - PROC_INFO_INC(sb, scan_bitmap.wait); - depth = reiserfs_write_unlock_nested(sb); - __wait_on_buffer(bh); - reiserfs_write_lock_nested(sb, depth); - } - BUG_ON(!buffer_uptodate(bh)); - BUG_ON(atomic_read(&bh->b_count) == 0); - - if (info->free_count == UINT_MAX) - reiserfs_cache_bitmap_metadata(sb, bh, info); - } - - return bh; -} - -int reiserfs_init_bitmap_cache(struct super_block *sb) -{ - struct reiserfs_bitmap_info *bitmap; - unsigned int bmap_nr = reiserfs_bmap_count(sb); - - bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap))); - if (bitmap == NULL) - return -ENOMEM; - - memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); - - SB_AP_BITMAP(sb) = bitmap; - - return 0; -} - -void reiserfs_free_bitmap_cache(struct super_block *sb) -{ - if (SB_AP_BITMAP(sb)) { - vfree(SB_AP_BITMAP(sb)); - SB_AP_BITMAP(sb) = NULL; - } -} diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c deleted file mode 100644 index 79ee2b436685..000000000000 --- a/fs/reiserfs/dir.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include "reiserfs.h" -#include <linux/stat.h> -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -extern const struct reiserfs_key MIN_KEY; - -static int reiserfs_readdir(struct file *, struct dir_context *); -static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, - int datasync); - -const struct file_operations reiserfs_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate_shared = reiserfs_readdir, - .fsync = reiserfs_dir_fsync, - .unlocked_ioctl = reiserfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = reiserfs_compat_ioctl, -#endif -}; - -static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = filp->f_mapping->host; - int err; - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - - inode_lock(inode); - reiserfs_write_lock(inode->i_sb); - err = reiserfs_commit_for_inode(inode); - reiserfs_write_unlock(inode->i_sb); - inode_unlock(inode); - if (err < 0) - return err; - return 0; -} - -#define store_ih(where,what) copy_item_head (where, what) - -static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) -{ - struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; - return (d_really_is_positive(privroot) && - deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); -} - -int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) -{ - - /* key of current position in the directory (key of directory entry) */ - struct cpu_key pos_key; - - INITIALIZE_PATH(path_to_entry); - struct buffer_head *bh; - int item_num, entry_num; - const struct reiserfs_key *rkey; - struct item_head *ih, tmp_ih; - int search_res; - char *local_buf; - loff_t next_pos; - char small_buf[32]; /* avoid kmalloc if we can */ - struct reiserfs_dir_entry de; - int ret = 0; - int depth; - - reiserfs_write_lock(inode->i_sb); - - 
reiserfs_check_lock_depth(inode->i_sb, "readdir"); - - /* - * form key for search the next directory entry using - * f_pos field of file structure - */ - make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); - next_pos = cpu_key_k_offset(&pos_key); - - path_to_entry.reada = PATH_READA; - while (1) { -research: - /* - * search the directory item, containing entry with - * specified key - */ - search_res = - search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, - &de); - if (search_res == IO_ERROR) { - /* - * FIXME: we could just skip part of directory - * which could not be read - */ - ret = -EIO; - goto out; - } - entry_num = de.de_entry_num; - bh = de.de_bh; - item_num = de.de_item_num; - ih = de.de_ih; - store_ih(&tmp_ih, ih); - - /* we must have found item, that is item of this directory, */ - RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key), - "vs-9000: found item %h does not match to dir we readdir %K", - ih, &pos_key); - RFALSE(item_num > B_NR_ITEMS(bh) - 1, - "vs-9005 item_num == %d, item amount == %d", - item_num, B_NR_ITEMS(bh)); - - /* - * and entry must be not more than number of entries - * in the item - */ - RFALSE(ih_entry_count(ih) < entry_num, - "vs-9010: entry number is too big %d (%d)", - entry_num, ih_entry_count(ih)); - - /* - * go through all entries in the directory item beginning - * from the entry, that has been found - */ - if (search_res == POSITION_FOUND - || entry_num < ih_entry_count(ih)) { - struct reiserfs_de_head *deh = - B_I_DEH(bh, ih) + entry_num; - - for (; entry_num < ih_entry_count(ih); - entry_num++, deh++) { - int d_reclen; - char *d_name; - ino_t d_ino; - loff_t cur_pos = deh_offset(deh); - - /* it is hidden entry */ - if (!de_visible(deh)) - continue; - d_reclen = entry_length(bh, ih, entry_num); - d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); - - if (d_reclen <= 0 || - d_name + d_reclen > bh->b_data + bh->b_size) { - /* - * There is corrupted data in entry, - * We'd better stop here - */ - pathrelse(&path_to_entry); - ret = -EIO; - goto out; - } - - if (!d_name[d_reclen - 1]) - d_reclen = strlen(d_name); - - /* too big to send back to VFS */ - if (d_reclen > - REISERFS_MAX_NAME(inode->i_sb-> - s_blocksize)) { - continue; - } - - /* Ignore the .reiserfs_priv entry */ - if (is_privroot_deh(inode, deh)) - continue; - - ctx->pos = deh_offset(deh); - d_ino = deh_objectid(deh); - if (d_reclen <= 32) { - local_buf = small_buf; - } else { - local_buf = kmalloc(d_reclen, - GFP_NOFS); - if (!local_buf) { - pathrelse(&path_to_entry); - ret = -ENOMEM; - goto out; - } - if (item_moved(&tmp_ih, &path_to_entry)) { - kfree(local_buf); - goto research; - } - } - - /* - * Note, that we copy name to user space via - * temporary buffer (local_buf) because - * filldir will block if user space buffer is - * swapped out. At that time entry can move to - * somewhere else - */ - memcpy(local_buf, d_name, d_reclen); - - /* - * Since filldir might sleep, we can release - * the write lock here for other waiters - */ - depth = reiserfs_write_unlock_nested(inode->i_sb); - if (!dir_emit - (ctx, local_buf, d_reclen, d_ino, - DT_UNKNOWN)) { - reiserfs_write_lock_nested(inode->i_sb, depth); - if (local_buf != small_buf) { - kfree(local_buf); - } - goto end; - } - reiserfs_write_lock_nested(inode->i_sb, depth); - if (local_buf != small_buf) { - kfree(local_buf); - } - - /* deh_offset(deh) may be invalid now. 
*/ - next_pos = cur_pos + 1; - - if (item_moved(&tmp_ih, &path_to_entry)) { - set_cpu_key_k_offset(&pos_key, - next_pos); - goto research; - } - } /* for */ - } - - /* end of directory has been reached */ - if (item_num != B_NR_ITEMS(bh) - 1) - goto end; - - /* - * item we went through is last item of node. Using right - * delimiting key check is it directory end - */ - rkey = get_rkey(&path_to_entry, inode->i_sb); - if (!comp_le_keys(rkey, &MIN_KEY)) { - /* - * set pos_key to key, that is the smallest and greater - * that key of the last entry in the item - */ - set_cpu_key_k_offset(&pos_key, next_pos); - continue; - } - - /* end of directory has been reached */ - if (COMP_SHORT_KEYS(rkey, &pos_key)) { - goto end; - } - - /* directory continues in the right neighboring block */ - set_cpu_key_k_offset(&pos_key, - le_key_k_offset(KEY_FORMAT_3_5, rkey)); - - } /* while */ - -end: - ctx->pos = next_pos; - pathrelse(&path_to_entry); - reiserfs_check_path(&path_to_entry); -out: - reiserfs_write_unlock(inode->i_sb); - return ret; -} - -static int reiserfs_readdir(struct file *file, struct dir_context *ctx) -{ - return reiserfs_readdir_inode(file_inode(file), ctx); -} - -/* - * compose directory item containing "." and ".." entries (entries are - * not aligned to 4 byte boundary) - */ -void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid) -{ - struct reiserfs_de_head *dot, *dotdot; - - memset(body, 0, EMPTY_DIR_SIZE_V1); - dot = (struct reiserfs_de_head *)body; - dotdot = dot + 1; - - /* direntry header of "." */ - put_deh_offset(dot, DOT_OFFSET); - /* these two are from make_le_item_head, and are LE */ - dot->deh_dir_id = dirid; - dot->deh_objectid = objid; - dot->deh_state = 0; /* Endian safe if 0 */ - put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen(".")); - mark_de_visible(dot); - - /* direntry header of ".." */ - put_deh_offset(dotdot, DOT_DOT_OFFSET); - /* key of ".." for the root directory */ - /* these two are from the inode, and are LE */ - dotdot->deh_dir_id = par_dirid; - dotdot->deh_objectid = par_objid; - dotdot->deh_state = 0; /* Endian safe if 0 */ - put_deh_location(dotdot, deh_location(dot) - strlen("..")); - mark_de_visible(dotdot); - - /* copy ".." and "." */ - memcpy(body + deh_location(dot), ".", 1); - memcpy(body + deh_location(dotdot), "..", 2); -} - -/* compose directory item containing "." and ".." entries */ -void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid) -{ - struct reiserfs_de_head *dot, *dotdot; - - memset(body, 0, EMPTY_DIR_SIZE); - dot = (struct reiserfs_de_head *)body; - dotdot = dot + 1; - - /* direntry header of "." */ - put_deh_offset(dot, DOT_OFFSET); - /* these two are from make_le_item_head, and are LE */ - dot->deh_dir_id = dirid; - dot->deh_objectid = objid; - dot->deh_state = 0; /* Endian safe if 0 */ - put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); - mark_de_visible(dot); - - /* direntry header of ".." */ - put_deh_offset(dotdot, DOT_DOT_OFFSET); - /* key of ".." for the root directory */ - /* these two are from the inode, and are LE */ - dotdot->deh_dir_id = par_dirid; - dotdot->deh_objectid = par_objid; - dotdot->deh_state = 0; /* Endian safe if 0 */ - put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen(".."))); - mark_de_visible(dotdot); - - /* copy ".." and "." 
*/ - memcpy(body + deh_location(dot), ".", 1); - memcpy(body + deh_location(dotdot), "..", 2); -} diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c deleted file mode 100644 index 5129efc6f2e6..000000000000 --- a/fs/reiserfs/do_balan.c +++ /dev/null @@ -1,1900 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -/* - * Now we have all buffers that must be used in balancing of the tree - * Further calculations can not cause schedule(), and thus the buffer - * tree will be stable until the balancing will be finished - * balance the tree according to the analysis made before, - * and using buffers obtained after all above. - */ - -#include <linux/uaccess.h> -#include <linux/time.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> -#include <linux/kernel.h> - -static inline void buffer_info_init_left(struct tree_balance *tb, - struct buffer_info *bi) -{ - bi->tb = tb; - bi->bi_bh = tb->L[0]; - bi->bi_parent = tb->FL[0]; - bi->bi_position = get_left_neighbor_position(tb, 0); -} - -static inline void buffer_info_init_right(struct tree_balance *tb, - struct buffer_info *bi) -{ - bi->tb = tb; - bi->bi_bh = tb->R[0]; - bi->bi_parent = tb->FR[0]; - bi->bi_position = get_right_neighbor_position(tb, 0); -} - -static inline void buffer_info_init_tbS0(struct tree_balance *tb, - struct buffer_info *bi) -{ - bi->tb = tb; - bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); - bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - bi->bi_position = PATH_H_POSITION(tb->tb_path, 1); -} - -static inline void buffer_info_init_bh(struct tree_balance *tb, - struct buffer_info *bi, - struct buffer_head *bh) -{ - bi->tb = tb; - bi->bi_bh = bh; - bi->bi_parent = NULL; - bi->bi_position = 0; -} - -inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, - struct buffer_head *bh, int flag) -{ - journal_mark_dirty(tb->transaction_handle, bh); -} - -#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty -#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty - -/* - * summary: - * if deleting something ( tb->insert_size[0] < 0 ) - * return(balance_leaf_when_delete()); (flag d handled here) - * else - * if lnum is larger than 0 we put items into the left node - * if rnum is larger than 0 we put items into the right node - * if snum1 is larger than 0 we put items into the new node s1 - * if snum2 is larger than 0 we put items into the new node s2 - * Note that all *num* count new items being created. 
- */ - -static void balance_leaf_when_delete_del(struct tree_balance *tb) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); - struct buffer_info bi; -#ifdef CONFIG_REISERFS_CHECK - struct item_head *ih = item_head(tbS0, item_pos); -#endif - - RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], - "vs-12013: mode Delete, insert size %d, ih to be deleted %h", - -tb->insert_size[0], ih); - - buffer_info_init_tbS0(tb, &bi); - leaf_delete_items(&bi, 0, item_pos, 1, -1); - - if (!item_pos && tb->CFL[0]) { - if (B_NR_ITEMS(tbS0)) { - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); - } else { - if (!PATH_H_POSITION(tb->tb_path, 1)) - replace_key(tb, tb->CFL[0], tb->lkey[0], - PATH_H_PPARENT(tb->tb_path, 0), 0); - } - } - - RFALSE(!item_pos && !tb->CFL[0], - "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], - tb->L[0]); -} - -/* cut item in S[0] */ -static void balance_leaf_when_delete_cut(struct tree_balance *tb) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); - struct item_head *ih = item_head(tbS0, item_pos); - int pos_in_item = tb->tb_path->pos_in_item; - struct buffer_info bi; - buffer_info_init_tbS0(tb, &bi); - - if (is_direntry_le_ih(ih)) { - /* - * UFS unlink semantics are such that you can only - * delete one directory entry at a time. - * - * when we cut a directory tb->insert_size[0] means - * number of entries to be cut (always 1) - */ - tb->insert_size[0] = -1; - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); - - RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], - "PAP-12030: can not change delimiting key. CFL[0]=%p", - tb->CFL[0]); - - if (!item_pos && !pos_in_item && tb->CFL[0]) - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); - } else { - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); - - RFALSE(!ih_item_len(ih), - "PAP-12035: cut must leave non-zero dynamic " - "length of item"); - } -} - -static int balance_leaf_when_delete_left(struct tree_balance *tb) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - - /* L[0] must be joined with S[0] */ - if (tb->lnum[0] == -1) { - /* R[0] must be also joined with S[0] */ - if (tb->rnum[0] == -1) { - if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { - /* - * all contents of all the - * 3 buffers will be in L[0] - */ - if (PATH_H_POSITION(tb->tb_path, 1) == 0 && - 1 < B_NR_ITEMS(tb->FR[0])) - replace_key(tb, tb->CFL[0], - tb->lkey[0], tb->FR[0], 1); - - leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1, - NULL); - leaf_move_items(LEAF_FROM_R_TO_L, tb, - B_NR_ITEMS(tb->R[0]), -1, - NULL); - - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, tb->R[0]); - - return 0; - } - - /* all contents of all the 3 buffers will be in R[0] */ - leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL); - leaf_move_items(LEAF_FROM_L_TO_R, tb, - B_NR_ITEMS(tb->L[0]), -1, NULL); - - /* right_delimiting_key is correct in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, tb->L[0]); - - return -1; - } - - RFALSE(tb->rnum[0] != 0, - "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); - /* all contents of L[0] and S[0] will be in L[0] */ - leaf_shift_left(tb, n, -1); - - reiserfs_invalidate_buffer(tb, tbS0); - - return 0; - } - - /* - * a part of contents of S[0] will be in L[0] and - * the rest part of S[0] will be in R[0] - */ - - 
RFALSE((tb->lnum[0] + tb->rnum[0] < n) || - (tb->lnum[0] + tb->rnum[0] > n + 1), - "PAP-12050: rnum(%d) and lnum(%d) and item " - "number(%d) in S[0] are not consistent", - tb->rnum[0], tb->lnum[0], n); - RFALSE((tb->lnum[0] + tb->rnum[0] == n) && - (tb->lbytes != -1 || tb->rbytes != -1), - "PAP-12055: bad rbytes (%d)/lbytes (%d) " - "parameters when items are not split", - tb->rbytes, tb->lbytes); - RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && - (tb->lbytes < 1 || tb->rbytes != -1), - "PAP-12060: bad rbytes (%d)/lbytes (%d) " - "parameters when items are split", - tb->rbytes, tb->lbytes); - - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - - reiserfs_invalidate_buffer(tb, tbS0); - - return 0; -} - -/* - * Balance leaf node in case of delete or cut: insert_size[0] < 0 - * - * lnum, rnum can have values >= -1 - * -1 means that the neighbor must be joined with S - * 0 means that nothing should be done with the neighbor - * >0 means to shift entirely or partly the specified number of items - * to the neighbor - */ -static int balance_leaf_when_delete(struct tree_balance *tb, int flag) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct buffer_info bi; - int n; - - RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, - "vs- 12000: level: wrong FR %z", tb->FR[0]); - RFALSE(tb->blknum[0] > 1, - "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); - RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), - "PAP-12010: tree can not be empty"); - - buffer_info_init_tbS0(tb, &bi); - - /* Delete or truncate the item */ - - BUG_ON(flag != M_DELETE && flag != M_CUT); - if (flag == M_DELETE) - balance_leaf_when_delete_del(tb); - else /* M_CUT */ - balance_leaf_when_delete_cut(tb); - - - /* - * the rule is that no shifting occurs unless by shifting - * a node can be freed - */ - n = B_NR_ITEMS(tbS0); - - - /* L[0] takes part in balancing */ - if (tb->lnum[0]) - return balance_leaf_when_delete_left(tb); - - if (tb->rnum[0] == -1) { - /* all contents of R[0] and S[0] will be in R[0] */ - leaf_shift_right(tb, n, -1); - reiserfs_invalidate_buffer(tb, tbS0); - return 0; - } - - RFALSE(tb->rnum[0], - "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); - return 0; -} - -static unsigned int balance_leaf_insert_left(struct tree_balance *tb, - struct item_head *const ih, - const char * const body) -{ - int ret; - struct buffer_info bi; - int n = B_NR_ITEMS(tb->L[0]); - unsigned body_shift_bytes = 0; - - if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { - /* part of new item falls into L[0] */ - int new_item_len, shift; - - ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); - - /* Calculate item length to insert to S[0] */ - new_item_len = ih_item_len(ih) - tb->lbytes; - - /* Calculate and check item length to insert to L[0] */ - put_ih_item_len(ih, ih_item_len(ih) - new_item_len); - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12080: there is nothing to insert into L[0]: " - "ih_item_len=%d", ih_item_len(ih)); - - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, - min_t(int, tb->zeroes_num, ih_item_len(ih))); - - /* - * Calculate key component, item length and body to - * insert into S[0] - */ - shift = 0; - if (is_indirect_le_ih(ih)) - shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - - add_le_ih_k_offset(ih, tb->lbytes << shift); - - put_ih_item_len(ih, new_item_len); - if (tb->lbytes > tb->zeroes_num) { - body_shift_bytes = 
tb->lbytes - tb->zeroes_num; - tb->zeroes_num = 0; - } else - tb->zeroes_num -= tb->lbytes; - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12085: there is nothing to insert into S[0]: " - "ih_item_len=%d", ih_item_len(ih)); - } else { - /* new item in whole falls into L[0] */ - /* Shift lnum[0]-1 items to L[0] */ - ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); - - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, - tb->zeroes_num); - tb->insert_size[0] = 0; - tb->zeroes_num = 0; - } - return body_shift_bytes; -} - -static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - int n = B_NR_ITEMS(tb->L[0]); - struct buffer_info bi; - - RFALSE(tb->zeroes_num, - "PAP-12090: invalid parameter in case of a directory"); - - /* directory item */ - if (tb->lbytes > tb->pos_in_item) { - /* new directory entry falls into L[0] */ - struct item_head *pasted; - int ret, l_pos_in_item = tb->pos_in_item; - - /* - * Shift lnum[0] - 1 items in whole. - * Shift lbytes - 1 entries from given directory item - */ - ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); - if (ret && !tb->item_pos) { - pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); - l_pos_in_item += ih_entry_count(pasted) - - (tb->lbytes - 1); - } - - /* Append given directory entry to directory item */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, - l_pos_in_item, tb->insert_size[0], - body, tb->zeroes_num); - - /* - * previous string prepared space for pasting new entry, - * following string pastes this entry - */ - - /* - * when we have merge directory item, pos_in_item - * has been changed too - */ - - /* paste new directory entry. 1 is entry number */ - leaf_paste_entries(&bi, n + tb->item_pos - ret, - l_pos_in_item, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - tb->insert_size[0] = 0; - } else { - /* new directory item doesn't fall into L[0] */ - /* - * Shift lnum[0]-1 items in whole. Shift lbytes - * directory entries from directory item number lnum[0] - */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - } - - /* Calculate new position to append in item body */ - tb->pos_in_item -= tb->lbytes; -} - -static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tb->L[0]); - struct buffer_info bi; - int body_shift_bytes = 0; - - if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { - balance_leaf_paste_left_shift_dirent(tb, ih, body); - return 0; - } - - RFALSE(tb->lbytes <= 0, - "PAP-12095: there is nothing to shift to L[0]. 
" - "lbytes=%d", tb->lbytes); - RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), - "PAP-12100: incorrect position to paste: " - "item_len=%d, pos_in_item=%d", - ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item); - - /* appended item will be in L[0] in whole */ - if (tb->lbytes >= tb->pos_in_item) { - struct item_head *tbS0_pos_ih, *tbL0_ih; - struct item_head *tbS0_0_ih; - struct reiserfs_key *left_delim_key; - int ret, l_n, version, temp_l; - - tbS0_pos_ih = item_head(tbS0, tb->item_pos); - tbS0_0_ih = item_head(tbS0, 0); - - /* - * this bytes number must be appended - * to the last item of L[h] - */ - l_n = tb->lbytes - tb->pos_in_item; - - /* Calculate new insert_size[0] */ - tb->insert_size[0] -= l_n; - - RFALSE(tb->insert_size[0] <= 0, - "PAP-12105: there is nothing to paste into " - "L[0]. insert_size=%d", tb->insert_size[0]); - - ret = leaf_shift_left(tb, tb->lnum[0], - ih_item_len(tbS0_pos_ih)); - - tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret); - - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, - ih_item_len(tbL0_ih), l_n, body, - min_t(int, l_n, tb->zeroes_num)); - - /* - * 0-th item in S0 can be only of DIRECT type - * when l_n != 0 - */ - temp_l = l_n; - - RFALSE(ih_item_len(tbS0_0_ih), - "PAP-12106: item length must be 0"); - RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, - leaf_key(tb->L[0], n + tb->item_pos - ret)), - "PAP-12107: items must be of the same file"); - - if (is_indirect_le_ih(tbL0_ih)) { - int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - temp_l = l_n << shift; - } - /* update key of first item in S0 */ - version = ih_version(tbS0_0_ih); - add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l); - - /* update left delimiting key */ - left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]); - add_le_key_k_offset(version, left_delim_key, temp_l); - - /* - * Calculate new body, position in item and - * insert_size[0] - */ - if (l_n > tb->zeroes_num) { - body_shift_bytes = l_n - tb->zeroes_num; - tb->zeroes_num = 0; - } else - tb->zeroes_num -= l_n; - tb->pos_in_item = 0; - - RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, - leaf_key(tb->L[0], - B_NR_ITEMS(tb->L[0]) - 1)) || - !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) || - !op_is_left_mergeable(left_delim_key, tbS0->b_size), - "PAP-12120: item must be merge-able with left " - "neighboring item"); - } else { - /* only part of the appended item will be in L[0] */ - - /* Calculate position in item for append in S[0] */ - tb->pos_in_item -= tb->lbytes; - - RFALSE(tb->pos_in_item <= 0, - "PAP-12125: no place for paste. pos_in_item=%d", - tb->pos_in_item); - - /* - * Shift lnum[0] - 1 items in whole. 
- * Shift lbytes - 1 byte from item number lnum[0] - */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - } - return body_shift_bytes; -} - - -/* appended item will be in L[0] in whole */ -static void balance_leaf_paste_left_whole(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tb->L[0]); - struct buffer_info bi; - struct item_head *pasted; - int ret; - - /* if we paste into first item of S[0] and it is left mergable */ - if (!tb->item_pos && - op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) { - /* - * then increment pos_in_item by the size of the - * last item in L[0] - */ - pasted = item_head(tb->L[0], n - 1); - if (is_direntry_le_ih(pasted)) - tb->pos_in_item += ih_entry_count(pasted); - else - tb->pos_in_item += ih_item_len(pasted); - } - - /* - * Shift lnum[0] - 1 items in whole. - * Shift lbytes - 1 byte from item number lnum[0] - */ - ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item, - tb->insert_size[0], body, tb->zeroes_num); - - /* if appended item is directory, paste entry */ - pasted = item_head(tb->L[0], n + tb->item_pos - ret); - if (is_direntry_le_ih(pasted)) - leaf_paste_entries(&bi, n + tb->item_pos - ret, - tb->pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0]); - - /* - * if appended item is indirect item, put unformatted node - * into un list - */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - - tb->insert_size[0] = 0; - tb->zeroes_num = 0; -} - -static unsigned int balance_leaf_paste_left(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - /* we must shift the part of the appended item */ - if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) - return balance_leaf_paste_left_shift(tb, ih, body); - else - balance_leaf_paste_left_whole(tb, ih, body); - return 0; -} - -/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ -static unsigned int balance_leaf_left(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, int flag) -{ - if (tb->lnum[0] <= 0) - return 0; - - /* new item or it part falls to L[0], shift it too */ - if (tb->item_pos < tb->lnum[0]) { - BUG_ON(flag != M_INSERT && flag != M_PASTE); - - if (flag == M_INSERT) - return balance_leaf_insert_left(tb, ih, body); - else /* M_PASTE */ - return balance_leaf_paste_left(tb, ih, body); - } else - /* new item doesn't fall into L[0] */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - return 0; -} - - -static void balance_leaf_insert_right(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - struct buffer_info bi; - - /* new item or part of it doesn't fall into R[0] */ - if (n - tb->rnum[0] >= tb->item_pos) { - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - return; - } - - /* new item or its part falls to R[0] */ - - /* part of new item falls into R[0] */ - if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { - loff_t old_key_comp, old_len, r_zeroes_number; - const char *r_body; - int shift; - loff_t offset; - - leaf_shift_right(tb, tb->rnum[0] - 1, -1); - - /* Remember key component and item length */ - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* - * 
Calculate key component and item length to insert - * into R[0] - */ - shift = 0; - if (is_indirect_le_ih(ih)) - shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift); - set_le_ih_k_offset(ih, offset); - put_ih_item_len(ih, tb->rbytes); - - /* Insert part of the item into R[0] */ - buffer_info_init_right(tb, &bi); - if ((old_len - tb->rbytes) > tb->zeroes_num) { - r_zeroes_number = 0; - r_body = body + (old_len - tb->rbytes) - tb->zeroes_num; - } else { - r_body = body; - r_zeroes_number = tb->zeroes_num - - (old_len - tb->rbytes); - tb->zeroes_num -= r_zeroes_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); - - /* Replace right delimiting key by first key in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - - /* - * Calculate key component and item length to - * insert into S[0] - */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, old_len - tb->rbytes); - - tb->insert_size[0] -= tb->rbytes; - - } else { - /* whole new item falls into R[0] */ - - /* Shift rnum[0]-1 items to R[0] */ - leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); - - /* Insert new item into R[0] */ - buffer_info_init_right(tb, &bi); - leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1, - ih, body, tb->zeroes_num); - - if (tb->item_pos - n + tb->rnum[0] - 1 == 0) - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - - tb->zeroes_num = tb->insert_size[0] = 0; - } -} - - -static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct buffer_info bi; - int entry_count; - - RFALSE(tb->zeroes_num, - "PAP-12145: invalid parameter in case of a directory"); - entry_count = ih_entry_count(item_head(tbS0, tb->item_pos)); - - /* new directory entry falls into R[0] */ - if (entry_count - tb->rbytes < tb->pos_in_item) { - int paste_entry_position; - - RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0], - "PAP-12150: no enough of entries to shift to R[0]: " - "rbytes=%d, entry_count=%d", tb->rbytes, entry_count); - - /* - * Shift rnum[0]-1 items in whole. 
- * Shift rbytes-1 directory entries from directory - * item number rnum[0] - */ - leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); - - /* Paste given directory entry to directory item */ - paste_entry_position = tb->pos_in_item - entry_count + - tb->rbytes - 1; - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, 0, paste_entry_position, - tb->insert_size[0], body, tb->zeroes_num); - - /* paste entry */ - leaf_paste_entries(&bi, 0, paste_entry_position, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - - /* change delimiting keys */ - if (paste_entry_position == 0) - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - - tb->insert_size[0] = 0; - tb->pos_in_item++; - } else { - /* new directory entry doesn't fall into R[0] */ - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } -} - -static void balance_leaf_paste_right_shift(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n_shift, n_rem, r_zeroes_number, version; - unsigned long temp_rem; - const char *r_body; - struct buffer_info bi; - - /* we append to directory item */ - if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { - balance_leaf_paste_right_shift_dirent(tb, ih, body); - return; - } - - /* regular object */ - - /* - * Calculate number of bytes which must be shifted - * from appended item - */ - n_shift = tb->rbytes - tb->insert_size[0]; - if (n_shift < 0) - n_shift = 0; - - RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), - "PAP-12155: invalid position to paste. ih_item_len=%d, " - "pos_in_item=%d", tb->pos_in_item, - ih_item_len(item_head(tbS0, tb->item_pos))); - - leaf_shift_right(tb, tb->rnum[0], n_shift); - - /* - * Calculate number of bytes which must remain in body - * after appending to R[0] - */ - n_rem = tb->insert_size[0] - tb->rbytes; - if (n_rem < 0) - n_rem = 0; - - temp_rem = n_rem; - - version = ih_version(item_head(tb->R[0], 0)); - - if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) { - int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - temp_rem = n_rem << shift; - } - - add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem); - add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]), - temp_rem); - - do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); - - /* Append part of body into R[0] */ - buffer_info_init_right(tb, &bi); - if (n_rem > tb->zeroes_num) { - r_zeroes_number = 0; - r_body = body + n_rem - tb->zeroes_num; - } else { - r_body = body; - r_zeroes_number = tb->zeroes_num - n_rem; - tb->zeroes_num -= r_zeroes_number; - } - - leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, - r_body, r_zeroes_number); - - if (is_indirect_le_ih(item_head(tb->R[0], 0))) - set_ih_free_space(item_head(tb->R[0], 0), 0); - - tb->insert_size[0] = n_rem; - if (!n_rem) - tb->pos_in_item++; -} - -static void balance_leaf_paste_right_whole(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - struct item_head *pasted; - struct buffer_info bi; - - buffer_info_init_right(tb, &bi); - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - - /* append item in R[0] */ - if (tb->pos_in_item >= 0) { - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0], - tb->pos_in_item, tb->insert_size[0], body, - tb->zeroes_num); - } - - /* paste new entry, if item is directory 
item */ - pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]); - if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) { - leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0], - tb->pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0]); - - if (!tb->pos_in_item) { - - RFALSE(tb->item_pos - n + tb->rnum[0], - "PAP-12165: directory item must be first " - "item of node when pasting is in 0th position"); - - /* update delimiting keys */ - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - } - } - - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - tb->zeroes_num = tb->insert_size[0] = 0; -} - -static void balance_leaf_paste_right(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - - /* new item doesn't fall into R[0] */ - if (n - tb->rnum[0] > tb->item_pos) { - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - return; - } - - /* pasted item or part of it falls to R[0] */ - - if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1) - /* we must shift the part of the appended item */ - balance_leaf_paste_right_shift(tb, ih, body); - else - /* pasted item in whole falls into R[0] */ - balance_leaf_paste_right_whole(tb, ih, body); -} - -/* shift rnum[0] items from S[0] to the right neighbor R[0] */ -static void balance_leaf_right(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, int flag) -{ - if (tb->rnum[0] <= 0) - return; - - BUG_ON(flag != M_INSERT && flag != M_PASTE); - - if (flag == M_INSERT) - balance_leaf_insert_right(tb, ih, body); - else /* M_PASTE */ - balance_leaf_paste_right(tb, ih, body); -} - -static void balance_leaf_new_nodes_insert(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int i) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - struct buffer_info bi; - int shift; - - /* new item or it part don't falls into S_new[i] */ - if (n - tb->snum[i] >= tb->item_pos) { - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - tb->snum[i], tb->sbytes[i], tb->S_new[i]); - return; - } - - /* new item or it's part falls to first new node S_new[i] */ - - /* part of new item falls into S_new[i] */ - if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { - int old_key_comp, old_len, r_zeroes_number; - const char *r_body; - - /* Move snum[i]-1 items from S[0] to S_new[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, - tb->S_new[i]); - - /* Remember key component and item length */ - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* - * Calculate key component and item length to insert - * into S_new[i] - */ - shift = 0; - if (is_indirect_le_ih(ih)) - shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - ((old_len - tb->sbytes[i]) << shift)); - - put_ih_item_len(ih, tb->sbytes[i]); - - /* Insert part of the item into S_new[i] before 0-th item */ - buffer_info_init_bh(tb, &bi, tb->S_new[i]); - - if ((old_len - tb->sbytes[i]) > tb->zeroes_num) { - r_zeroes_number = 0; - r_body = body + (old_len - tb->sbytes[i]) - - tb->zeroes_num; - } else { - r_body = body; - r_zeroes_number = tb->zeroes_num - (old_len - - tb->sbytes[i]); - tb->zeroes_num -= r_zeroes_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); - - /* - * 
Calculate key component and item length to - * insert into S[i] - */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, old_len - tb->sbytes[i]); - tb->insert_size[0] -= tb->sbytes[i]; - } else { - /* whole new item falls into S_new[i] */ - - /* - * Shift snum[0] - 1 items to S_new[i] - * (sbytes[i] of split item) - */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]); - - /* Insert new item into S_new[i] */ - buffer_info_init_bh(tb, &bi, tb->S_new[i]); - leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1, - ih, body, tb->zeroes_num); - - tb->zeroes_num = tb->insert_size[0] = 0; - } -} - -/* we append to directory item */ -static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int i) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct item_head *aux_ih = item_head(tbS0, tb->item_pos); - int entry_count = ih_entry_count(aux_ih); - struct buffer_info bi; - - if (entry_count - tb->sbytes[i] < tb->pos_in_item && - tb->pos_in_item <= entry_count) { - /* new directory entry falls into S_new[i] */ - - RFALSE(!tb->insert_size[0], - "PAP-12215: insert_size is already 0"); - RFALSE(tb->sbytes[i] - 1 >= entry_count, - "PAP-12220: there are no so much entries (%d), only %d", - tb->sbytes[i] - 1, entry_count); - - /* - * Shift snum[i]-1 items in whole. - * Shift sbytes[i] directory entries - * from directory item number snum[i] - */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], - tb->sbytes[i] - 1, tb->S_new[i]); - - /* - * Paste given directory entry to - * directory item - */ - buffer_info_init_bh(tb, &bi, tb->S_new[i]); - leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count + - tb->sbytes[i] - 1, tb->insert_size[0], - body, tb->zeroes_num); - - /* paste new directory entry */ - leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count + - tb->sbytes[i] - 1, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - - tb->insert_size[0] = 0; - tb->pos_in_item++; - } else { - /* new directory entry doesn't fall into S_new[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], - tb->sbytes[i], tb->S_new[i]); - } - -} - -static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int i) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct item_head *aux_ih = item_head(tbS0, tb->item_pos); - int n_shift, n_rem, r_zeroes_number, shift; - const char *r_body; - struct item_head *tmp; - struct buffer_info bi; - - RFALSE(ih, "PAP-12210: ih must be 0"); - - if (is_direntry_le_ih(aux_ih)) { - balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key, - insert_ptr, i); - return; - } - - /* regular object */ - - - RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) || - tb->insert_size[0] <= 0, - "PAP-12225: item too short or insert_size <= 0"); - - /* - * Calculate number of bytes which must be shifted from appended item - */ - n_shift = tb->sbytes[i] - tb->insert_size[0]; - if (n_shift < 0) - n_shift = 0; - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift, - tb->S_new[i]); - - /* - * Calculate number of bytes which must remain in body after - * append to S_new[i] - */ - n_rem = tb->insert_size[0] - tb->sbytes[i]; - if (n_rem < 0) - n_rem = 0; - - /* Append 
part of body into S_new[0] */ - buffer_info_init_bh(tb, &bi, tb->S_new[i]); - if (n_rem > tb->zeroes_num) { - r_zeroes_number = 0; - r_body = body + n_rem - tb->zeroes_num; - } else { - r_body = body; - r_zeroes_number = tb->zeroes_num - n_rem; - tb->zeroes_num -= r_zeroes_number; - } - - leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, - r_body, r_zeroes_number); - - tmp = item_head(tb->S_new[i], 0); - shift = 0; - if (is_indirect_le_ih(tmp)) { - set_ih_free_space(tmp, 0); - shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; - } - add_le_ih_k_offset(tmp, n_rem << shift); - - tb->insert_size[0] = n_rem; - if (!n_rem) - tb->pos_in_item++; -} - -static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int i) - -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - int leaf_mi; - struct item_head *pasted; - struct buffer_info bi; - -#ifdef CONFIG_REISERFS_CHECK - struct item_head *ih_check = item_head(tbS0, tb->item_pos); - - if (!is_direntry_le_ih(ih_check) && - (tb->pos_in_item != ih_item_len(ih_check) || - tb->insert_size[0] <= 0)) - reiserfs_panic(tb->tb_sb, - "PAP-12235", - "pos_in_item must be equal to ih_item_len"); -#endif - - leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], - tb->sbytes[i], tb->S_new[i]); - - RFALSE(leaf_mi, - "PAP-12240: unexpected value returned by leaf_move_items (%d)", - leaf_mi); - - /* paste into item */ - buffer_info_init_bh(tb, &bi, tb->S_new[i]); - leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i], - tb->pos_in_item, tb->insert_size[0], - body, tb->zeroes_num); - - pasted = item_head(tb->S_new[i], tb->item_pos - n + - tb->snum[i]); - if (is_direntry_le_ih(pasted)) - leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i], - tb->pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0]); - - /* if we paste to indirect item update ih_free_space */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - - tb->zeroes_num = tb->insert_size[0] = 0; - -} -static void balance_leaf_new_nodes_paste(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int i) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int n = B_NR_ITEMS(tbS0); - - /* pasted item doesn't fall into S_new[i] */ - if (n - tb->snum[i] > tb->item_pos) { - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - tb->snum[i], tb->sbytes[i], tb->S_new[i]); - return; - } - - /* pasted item or part if it falls to S_new[i] */ - - if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1) - /* we must shift part of the appended item */ - balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key, - insert_ptr, i); - else - /* item falls wholly into S_new[i] */ - balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key, - insert_ptr, i); -} - -/* Fill new nodes that appear in place of S[0] */ -static void balance_leaf_new_nodes(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, - struct item_head *insert_key, - struct buffer_head **insert_ptr, - int flag) -{ - int i; - for (i = tb->blknum[0] - 2; i >= 0; i--) { - BUG_ON(flag != M_INSERT && flag != M_PASTE); - - RFALSE(!tb->snum[i], - "PAP-12200: snum[%d] == %d. 
Must be > 0", i, - tb->snum[i]); - - /* here we shift from S to S_new nodes */ - - tb->S_new[i] = get_FEB(tb); - - /* initialized block type and tree level */ - set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL); - - if (flag == M_INSERT) - balance_leaf_new_nodes_insert(tb, ih, body, insert_key, - insert_ptr, i); - else /* M_PASTE */ - balance_leaf_new_nodes_paste(tb, ih, body, insert_key, - insert_ptr, i); - - memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE); - insert_ptr[i] = tb->S_new[i]; - - RFALSE(!buffer_journaled(tb->S_new[i]) - || buffer_journal_dirty(tb->S_new[i]) - || buffer_dirty(tb->S_new[i]), - "PAP-12247: S_new[%d] : (%b)", - i, tb->S_new[i]); - } -} - -static void balance_leaf_finish_node_insert(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct buffer_info bi; - buffer_info_init_tbS0(tb, &bi); - leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num); - - /* If we insert the first key change the delimiting key */ - if (tb->item_pos == 0) { - if (tb->CFL[0]) /* can be 0 in reiserfsck */ - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); - - } -} - -static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct item_head *pasted = item_head(tbS0, tb->item_pos); - struct buffer_info bi; - - if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) { - RFALSE(!tb->insert_size[0], - "PAP-12260: insert_size is 0 already"); - - /* prepare space */ - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item, - tb->insert_size[0], body, tb->zeroes_num); - - /* paste entry */ - leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0]); - - if (!tb->item_pos && !tb->pos_in_item) { - RFALSE(!tb->CFL[0] || !tb->L[0], - "PAP-12270: CFL[0]/L[0] must be specified"); - if (tb->CFL[0]) - replace_key(tb, tb->CFL[0], tb->lkey[0], - tbS0, 0); - } - - tb->insert_size[0] = 0; - } -} - -static void balance_leaf_finish_node_paste(struct tree_balance *tb, - struct item_head * const ih, - const char * const body) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - struct buffer_info bi; - struct item_head *pasted = item_head(tbS0, tb->item_pos); - - /* when directory, may be new entry already pasted */ - if (is_direntry_le_ih(pasted)) { - balance_leaf_finish_node_paste_dirent(tb, ih, body); - return; - } - - /* regular object */ - - if (tb->pos_in_item == ih_item_len(pasted)) { - RFALSE(tb->insert_size[0] <= 0, - "PAP-12275: insert size must not be %d", - tb->insert_size[0]); - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, tb->item_pos, - tb->pos_in_item, tb->insert_size[0], body, - tb->zeroes_num); - - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - - tb->insert_size[0] = 0; - } -#ifdef CONFIG_REISERFS_CHECK - else if (tb->insert_size[0]) { - print_cur_tb("12285"); - reiserfs_panic(tb->tb_sb, "PAP-12285", - "insert_size must be 0 (%d)", tb->insert_size[0]); - } -#endif -} - -/* - * if the affected item was not wholly shifted then we - * perform all necessary operations on that part or whole - * of the affected item which remains in S - */ -static void balance_leaf_finish_node(struct tree_balance *tb, - struct item_head * const ih, - const char * const body, int 
flag) -{ - /* if we must insert or append into buffer S[0] */ - if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { - if (flag == M_INSERT) - balance_leaf_finish_node_insert(tb, ih, body); - else /* M_PASTE */ - balance_leaf_finish_node_paste(tb, ih, body); - } -} - -/** - * balance_leaf - reiserfs tree balancing algorithm - * @tb: tree balance state - * @ih: item header of inserted item (little endian) - * @body: body of inserted item or bytes to paste - * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance) - * passed back: - * @insert_key: key to insert new nodes - * @insert_ptr: array of nodes to insert at the next level - * - * In our processing of one level we sometimes determine what must be - * inserted into the next higher level. This insertion consists of a - * key or two keys and their corresponding pointers. - */ -static int balance_leaf(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag, - struct item_head *insert_key, - struct buffer_head **insert_ptr) -{ - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - - PROC_INFO_INC(tb->tb_sb, balance_at[0]); - - /* Make balance in case insert_size[0] < 0 */ - if (tb->insert_size[0] < 0) - return balance_leaf_when_delete(tb, flag); - - tb->item_pos = PATH_LAST_POSITION(tb->tb_path), - tb->pos_in_item = tb->tb_path->pos_in_item, - tb->zeroes_num = 0; - if (flag == M_INSERT && !body) - tb->zeroes_num = ih_item_len(ih); - - /* - * for indirect item pos_in_item is measured in unformatted node - * pointers. Recalculate to bytes - */ - if (flag != M_INSERT - && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) - tb->pos_in_item *= UNFM_P_SIZE; - - body += balance_leaf_left(tb, ih, body, flag); - - /* tb->lnum[0] > 0 */ - /* Calculate new item position */ - tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); - - balance_leaf_right(tb, ih, body, flag); - - /* tb->rnum[0] > 0 */ - RFALSE(tb->blknum[0] > 3, - "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); - RFALSE(tb->blknum[0] < 0, - "PAP-12185: blknum can not be %d. 
It must be >= 0", tb->blknum[0]); - - /* - * if while adding to a node we discover that it is possible to split - * it in two, and merge the left part into the left neighbor and the - * right part into the right neighbor, eliminating the node - */ - if (tb->blknum[0] == 0) { /* node S[0] is empty now */ - - RFALSE(!tb->lnum[0] || !tb->rnum[0], - "PAP-12190: lnum and rnum must not be zero"); - /* - * if insertion was done before 0-th position in R[0], right - * delimiting key of the tb->L[0]'s and left delimiting key are - * not set correctly - */ - if (tb->CFL[0]) { - if (!tb->CFR[0]) - reiserfs_panic(tb->tb_sb, "vs-12195", - "CFR not initialized"); - copy_key(internal_key(tb->CFL[0], tb->lkey[0]), - internal_key(tb->CFR[0], tb->rkey[0])); - do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); - } - - reiserfs_invalidate_buffer(tb, tbS0); - return 0; - } - - balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag); - - balance_leaf_finish_node(tb, ih, body, flag); - -#ifdef CONFIG_REISERFS_CHECK - if (flag == M_PASTE && tb->insert_size[0]) { - print_cur_tb("12290"); - reiserfs_panic(tb->tb_sb, - "PAP-12290", "insert_size is still not 0 (%d)", - tb->insert_size[0]); - } -#endif - - /* Leaf level of the tree is balanced (end of balance_leaf) */ - return 0; -} - -/* Make empty node */ -void make_empty_node(struct buffer_info *bi) -{ - struct block_head *blkh; - - RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); - - blkh = B_BLK_HEAD(bi->bi_bh); - set_blkh_nr_item(blkh, 0); - set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); - - if (bi->bi_parent) - B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ -} - -/* Get first empty buffer */ -struct buffer_head *get_FEB(struct tree_balance *tb) -{ - int i; - struct buffer_info bi; - - for (i = 0; i < MAX_FEB_SIZE; i++) - if (tb->FEB[i] != NULL) - break; - - if (i == MAX_FEB_SIZE) - reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty"); - - buffer_info_init_bh(tb, &bi, tb->FEB[i]); - make_empty_node(&bi); - set_buffer_uptodate(tb->FEB[i]); - tb->used[i] = tb->FEB[i]; - tb->FEB[i] = NULL; - - return tb->used[i]; -} - -/* This is now used because reiserfs_free_block has to be able to schedule. 
*/ -static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) -{ - int i; - - if (buffer_dirty(bh)) - reiserfs_warning(tb->tb_sb, "reiserfs-12320", - "called with dirty buffer"); - for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) - if (!tb->thrown[i]) { - tb->thrown[i] = bh; - get_bh(bh); /* free_thrown puts this */ - return; - } - reiserfs_warning(tb->tb_sb, "reiserfs-12321", - "too many thrown buffers"); -} - -static void free_thrown(struct tree_balance *tb) -{ - int i; - b_blocknr_t blocknr; - for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) { - if (tb->thrown[i]) { - blocknr = tb->thrown[i]->b_blocknr; - if (buffer_dirty(tb->thrown[i])) - reiserfs_warning(tb->tb_sb, "reiserfs-12322", - "called with dirty buffer %d", - blocknr); - brelse(tb->thrown[i]); /* incremented in store_thrown */ - reiserfs_free_block(tb->transaction_handle, NULL, - blocknr, 0); - } - } -} - -void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) -{ - struct block_head *blkh; - blkh = B_BLK_HEAD(bh); - set_blkh_level(blkh, FREE_LEVEL); - set_blkh_nr_item(blkh, 0); - - clear_buffer_dirty(bh); - store_thrown(tb, bh); -} - -/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ -void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, - struct buffer_head *src, int n_src) -{ - - RFALSE(dest == NULL || src == NULL, - "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", - src, dest); - RFALSE(!B_IS_KEYS_LEVEL(dest), - "vs-12310: invalid level (%z) for destination buffer. dest must be leaf", - dest); - RFALSE(n_dest < 0 || n_src < 0, - "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); - RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), - "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", - n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); - - if (B_IS_ITEMS_LEVEL(src)) - /* source buffer contains leaf node */ - memcpy(internal_key(dest, n_dest), item_head(src, n_src), - KEY_SIZE); - else - memcpy(internal_key(dest, n_dest), internal_key(src, n_src), - KEY_SIZE); - - do_balance_mark_internal_dirty(tb, dest, 0); -} - -int get_left_neighbor_position(struct tree_balance *tb, int h) -{ - int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); - - RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL, - "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", - h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h)); - - if (Sh_position == 0) - return B_NR_ITEMS(tb->FL[h]); - else - return Sh_position - 1; -} - -int get_right_neighbor_position(struct tree_balance *tb, int h) -{ - int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); - - RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL, - "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", - h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); - - if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) - return 0; - else - return Sh_position + 1; -} - -#ifdef CONFIG_REISERFS_CHECK - -int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); -static void check_internal_node(struct super_block *s, struct buffer_head *bh, - char *mes) -{ - struct disk_child *dc; - int i; - - RFALSE(!bh, "PAP-12336: bh == 0"); - - if (!bh || !B_IS_IN_TREE(bh)) - return; - - RFALSE(!buffer_dirty(bh) && - !(buffer_journaled(bh) || buffer_journal_dirty(bh)), - "PAP-12337: buffer (%b) must be dirty", bh); - dc = B_N_CHILD(bh, 0); - - for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { - if (!is_reusable(s, dc_block_number(dc), 1)) { - 
print_cur_tb(mes); - reiserfs_panic(s, "PAP-12338", - "invalid child pointer %y in %b", - dc, bh); - } - } -} - -static int locked_or_not_in_tree(struct tree_balance *tb, - struct buffer_head *bh, char *which) -{ - if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || - !B_IS_IN_TREE(bh)) { - reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh); - return 1; - } - return 0; -} - -static int check_before_balancing(struct tree_balance *tb) -{ - int retval = 0; - - if (REISERFS_SB(tb->tb_sb)->cur_tb) { - reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " - "occurred based on cur_tb not being null at " - "this point in code. do_balance cannot properly " - "handle concurrent tree accesses on a same " - "mount point."); - } - - /* - * double check that buffers that we will modify are unlocked. - * (fix_nodes should already have prepped all of these for us). - */ - if (tb->lnum[0]) { - retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); - retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); - retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]"); - check_leaf(tb->L[0]); - } - if (tb->rnum[0]) { - retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]"); - retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]"); - retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]"); - check_leaf(tb->R[0]); - } - retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path), - "S[0]"); - check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); - - return retval; -} - -static void check_after_balance_leaf(struct tree_balance *tb) -{ - if (tb->lnum[0]) { - if (B_FREE_SPACE(tb->L[0]) != - MAX_CHILD_SIZE(tb->L[0]) - - dc_size(B_N_CHILD - (tb->FL[0], get_left_neighbor_position(tb, 0)))) { - print_cur_tb("12221"); - reiserfs_panic(tb->tb_sb, "PAP-12355", - "shift to left was incorrect"); - } - } - if (tb->rnum[0]) { - if (B_FREE_SPACE(tb->R[0]) != - MAX_CHILD_SIZE(tb->R[0]) - - dc_size(B_N_CHILD - (tb->FR[0], get_right_neighbor_position(tb, 0)))) { - print_cur_tb("12222"); - reiserfs_panic(tb->tb_sb, "PAP-12360", - "shift to right was incorrect"); - } - } - if (PATH_H_PBUFFER(tb->tb_path, 1) && - (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != - (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - - dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), - PATH_H_POSITION(tb->tb_path, 1)))))) { - int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); - int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - - dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), - PATH_H_POSITION(tb->tb_path, - 1)))); - print_cur_tb("12223"); - reiserfs_warning(tb->tb_sb, "reiserfs-12363", - "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " - "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", - left, - MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), - PATH_H_PBUFFER(tb->tb_path, 1), - PATH_H_POSITION(tb->tb_path, 1), - dc_size(B_N_CHILD - (PATH_H_PBUFFER(tb->tb_path, 1), - PATH_H_POSITION(tb->tb_path, 1))), - right); - reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect"); - } -} - -static void check_leaf_level(struct tree_balance *tb) -{ - check_leaf(tb->L[0]); - check_leaf(tb->R[0]); - check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); -} - -static void check_internal_levels(struct tree_balance *tb) -{ - int h; - - /* check all internal nodes */ - for (h = 1; tb->insert_size[h]; h++) { - check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h), - "BAD BUFFER ON PATH"); - if (tb->lnum[h]) - check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); - if (tb->rnum[h]) - check_internal_node(tb->tb_sb, tb->R[h], 
"BAD R"); - } - -} - -#endif - -/* - * Now we have all of the buffers that must be used in balancing of - * the tree. We rely on the assumption that schedule() will not occur - * while do_balance works. ( Only interrupt handlers are acceptable.) - * We balance the tree according to the analysis made before this, - * using buffers already obtained. For SMP support it will someday be - * necessary to add ordered locking of tb. - */ - -/* - * Some interesting rules of balancing: - * we delete a maximum of two nodes per level per balancing: we never - * delete R, when we delete two of three nodes L, S, R then we move - * them into R. - * - * we only delete L if we are deleting two nodes, if we delete only - * one node we delete S - * - * if we shift leaves then we shift as much as we can: this is a - * deliberate policy of extremism in node packing which results in - * higher average utilization after repeated random balance operations - * at the cost of more memory copies and more balancing as a result of - * small insertions to full nodes. - * - * if we shift internal nodes we try to evenly balance the node - * utilization, with consequent less balancing at the cost of lower - * utilization. - * - * one could argue that the policy for directories in leaves should be - * that of internal nodes, but we will wait until another day to - * evaluate this.... It would be nice to someday measure and prove - * these assumptions as to what is optimal.... - */ - -static inline void do_balance_starts(struct tree_balance *tb) -{ - /* use print_cur_tb() to see initial state of struct tree_balance */ - - /* store_print_tb (tb); */ - - /* do not delete, just comment it out */ - /* - print_tb(flag, PATH_LAST_POSITION(tb->tb_path), - tb->tb_path->pos_in_item, tb, "check"); - */ - RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); -#ifdef CONFIG_REISERFS_CHECK - REISERFS_SB(tb->tb_sb)->cur_tb = tb; -#endif -} - -static inline void do_balance_completed(struct tree_balance *tb) -{ - -#ifdef CONFIG_REISERFS_CHECK - check_leaf_level(tb); - check_internal_levels(tb); - REISERFS_SB(tb->tb_sb)->cur_tb = NULL; -#endif - - /* - * reiserfs_free_block is no longer schedule safe. So, we need to - * put the buffers we want freed on the thrown list during do_balance, - * and then free them now - */ - - REISERFS_SB(tb->tb_sb)->s_do_balance++; - - /* release all nodes hold to perform the balancing */ - unfix_nodes(tb); - - free_thrown(tb); -} - -/* - * do_balance - balance the tree - * - * @tb: tree_balance structure - * @ih: item header of inserted item - * @body: body of inserted item or bytes to paste - * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste - * - * Cut means delete part of an item (includes removing an entry from a - * directory). - * - * Delete means delete whole item. - * - * Insert means add a new item into the tree. - * - * Paste means to append to the end of an existing file or to - * insert a directory entry. - */ -void do_balance(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag) -{ - int child_pos; /* position of a child node in its parent */ - int h; /* level of the tree being processed */ - - /* - * in our processing of one level we sometimes determine what - * must be inserted into the next higher level. 
This insertion - * consists of a key or two keys and their corresponding - * pointers - */ - struct item_head insert_key[2]; - - /* inserted node-ptrs for the next level */ - struct buffer_head *insert_ptr[2]; - - tb->tb_mode = flag; - tb->need_balance_dirty = 0; - - if (FILESYSTEM_CHANGED_TB(tb)) { - reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has " - "changed"); - } - /* if we have no real work to do */ - if (!tb->insert_size[0]) { - reiserfs_warning(tb->tb_sb, "PAP-12350", - "insert_size == 0, mode == %c", flag); - unfix_nodes(tb); - return; - } - - atomic_inc(&fs_generation(tb->tb_sb)); - do_balance_starts(tb); - - /* - * balance_leaf returns 0 except if combining L R and S into - * one node. see balance_internal() for explanation of this - * line of code. - */ - child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + - balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); - -#ifdef CONFIG_REISERFS_CHECK - check_after_balance_leaf(tb); -#endif - - /* Balance internal level of the tree. */ - for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) - child_pos = balance_internal(tb, h, child_pos, insert_key, - insert_ptr); - - do_balance_completed(tb); -} diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c deleted file mode 100644 index 8eb3ad3e8ae9..000000000000 --- a/fs/reiserfs/file.c +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/time.h> -#include "reiserfs.h" -#include "acl.h" -#include "xattr.h" -#include <linux/uaccess.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/quotaops.h> - -/* - * We pack the tails of files on file close, not at the time they are written. - * This implies an unnecessary copy of the tail and an unnecessary indirect item - * insertion/balancing, for files that are written in one write. - * It avoids unnecessary tail packings (balances) for files that are written in - * multiple writes and are small enough to have tails. - * - * file_release is called by the VFS layer when the file is closed. If - * this is the last open file descriptor, and the file - * small enough to have a tail, and the tail is currently in an - * unformatted node, the tail is converted back into a direct item. - * - * We use reiserfs_truncate_file to pack the tail, since it already has - * all the conditions coded. - */ -static int reiserfs_file_release(struct inode *inode, struct file *filp) -{ - - struct reiserfs_transaction_handle th; - int err; - int jbegin_failure = 0; - - BUG_ON(!S_ISREG(inode->i_mode)); - - if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers, - &REISERFS_I(inode)->tailpack)) - return 0; - - /* fast out for when nothing needs to be done */ - if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || - !tail_has_to_be_packed(inode)) && - REISERFS_I(inode)->i_prealloc_count <= 0) { - mutex_unlock(&REISERFS_I(inode)->tailpack); - return 0; - } - - reiserfs_write_lock(inode->i_sb); - /* - * freeing preallocation only involves relogging blocks that - * are already in the current transaction. preallocation gets - * freed at the end of each transaction, so it is impossible for - * us to log any additional blocks (including quota blocks) - */ - err = journal_begin(&th, inode->i_sb, 1); - if (err) { - /* - * uh oh, we can't allow the inode to go away while there - * is still preallocation blocks pending. 
Try to join the - * aborted transaction - */ - jbegin_failure = err; - err = journal_join_abort(&th, inode->i_sb); - - if (err) { - /* - * hmpf, our choices here aren't good. We can pin - * the inode which will disallow unmount from ever - * happening, we can do nothing, which will corrupt - * random memory on unmount, or we can forcibly - * remove the file from the preallocation list, which - * will leak blocks on disk. Lets pin the inode - * and let the admin know what is going on. - */ - igrab(inode); - reiserfs_warning(inode->i_sb, "clm-9001", - "pinning inode %lu because the " - "preallocation can't be freed", - inode->i_ino); - goto out; - } - } - reiserfs_update_inode_transaction(inode); - -#ifdef REISERFS_PREALLOCATE - reiserfs_discard_prealloc(&th, inode); -#endif - err = journal_end(&th); - - /* copy back the error code from journal_begin */ - if (!err) - err = jbegin_failure; - - if (!err && - (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && - tail_has_to_be_packed(inode)) { - - /* - * if regular file is released by last holder and it has been - * appended (we append by unformatted node only) or its direct - * item(s) had to be converted, then it may have to be - * indirect2direct converted - */ - err = reiserfs_truncate_file(inode, 0); - } -out: - reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&REISERFS_I(inode)->tailpack); - return err; -} - -static int reiserfs_file_open(struct inode *inode, struct file *file) -{ - int err = dquot_file_open(inode, file); - - /* somebody might be tailpacking on final close; wait for it */ - if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { - mutex_lock(&REISERFS_I(inode)->tailpack); - atomic_inc(&REISERFS_I(inode)->openers); - mutex_unlock(&REISERFS_I(inode)->tailpack); - } - return err; -} - -void reiserfs_vfs_truncate_file(struct inode *inode) -{ - mutex_lock(&REISERFS_I(inode)->tailpack); - reiserfs_truncate_file(inode, 1); - mutex_unlock(&REISERFS_I(inode)->tailpack); -} - -/* Sync a reiserfs file. */ - -/* - * FIXME: sync_mapping_buffers() never has anything to sync. Can - * be removed... - */ - -static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = filp->f_mapping->host; - int err; - int barrier_done; - - err = file_write_and_wait_range(filp, start, end); - if (err) - return err; - - inode_lock(inode); - BUG_ON(!S_ISREG(inode->i_mode)); - err = sync_mapping_buffers(inode->i_mapping); - reiserfs_write_lock(inode->i_sb); - barrier_done = reiserfs_commit_for_inode(inode); - reiserfs_write_unlock(inode->i_sb); - if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev); - inode_unlock(inode); - if (barrier_done < 0) - return barrier_done; - return (err < 0) ? 
-EIO : 0; -} - -/* taken fs/buffer.c:__block_commit_write */ -int reiserfs_commit_page(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - unsigned block_start, block_end; - int partial = 0; - unsigned blocksize; - struct buffer_head *bh, *head; - unsigned long i_size_index = inode->i_size >> PAGE_SHIFT; - int new; - int logit = reiserfs_file_data_log(inode); - struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_SIZE / s->s_blocksize; - struct reiserfs_transaction_handle th; - int ret = 0; - - th.t_trans_id = 0; - blocksize = i_blocksize(inode); - - if (logit) { - reiserfs_write_lock(s); - ret = journal_begin(&th, s, bh_per_page + 1); - if (ret) - goto drop_write_lock; - reiserfs_update_inode_transaction(inode); - } - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - new = buffer_new(bh); - clear_buffer_new(bh); - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (!buffer_uptodate(bh)) - partial = 1; - } else { - set_buffer_uptodate(bh); - if (logit) { - reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, bh); - } else if (!buffer_dirty(bh)) { - mark_buffer_dirty(bh); - /* - * do data=ordered on any page past the end - * of file and any buffer marked BH_New. - */ - if (reiserfs_data_ordered(inode->i_sb) && - (new || page->index >= i_size_index)) { - reiserfs_add_ordered_list(inode, bh); - } - } - } - } - if (logit) { - ret = journal_end(&th); -drop_write_lock: - reiserfs_write_unlock(s); - } - /* - * If this is a partial write which happened to make all buffers - * uptodate then we can optimize away a bogus read_folio() for - * the next read(). Here we 'discover' whether the page went - * uptodate as a result of this (potentially partial) write. - */ - if (!partial) - SetPageUptodate(page); - return ret; -} - -const struct file_operations reiserfs_file_operations = { - .unlocked_ioctl = reiserfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = reiserfs_compat_ioctl, -#endif - .mmap = generic_file_mmap, - .open = reiserfs_file_open, - .release = reiserfs_file_release, - .fsync = reiserfs_sync_file, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, - .llseek = generic_file_llseek, -}; - -const struct inode_operations reiserfs_file_inode_operations = { - .setattr = reiserfs_setattr, - .listxattr = reiserfs_listxattr, - .permission = reiserfs_permission, - .get_inode_acl = reiserfs_get_acl, - .set_acl = reiserfs_set_acl, - .fileattr_get = reiserfs_fileattr_get, - .fileattr_set = reiserfs_fileattr_set, -}; - -const struct inode_operations reiserfs_priv_file_inode_operations = { - .setattr = reiserfs_setattr, - .permission = reiserfs_permission, - .fileattr_get = reiserfs_fileattr_get, - .fileattr_set = reiserfs_fileattr_set, -}; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c deleted file mode 100644 index 6c13a8d9a73c..000000000000 --- a/fs/reiserfs/fix_node.c +++ /dev/null @@ -1,2822 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/time.h> -#include <linux/slab.h> -#include <linux/string.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> - -/* - * To make any changes in the tree we find a node that contains item - * to be changed/deleted or position in the node we insert a new item - * to. We call this node S. 
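The comment opening fix_node.c introduces the virtual node: a scratch description of what the node would contain after the pending insert or delete, used only for analysis. A small stand-alone C sketch of that index bookkeeping, with made-up names and sizes (the real structures are struct virtual_node / struct virtual_item):

#include <stdio.h>

enum vmode { V_INSERT, V_DELETE };

/* map an index in the virtual (post-operation) item array back to the
 * index the item has in the source node; mirrors the old_item_num() rule */
static int source_index(int new_num, int affected, enum vmode mode)
{
	if (new_num < affected)
		return new_num;
	if (mode == V_INSERT)
		return new_num - 1;     /* items after the insert shift right */
	return new_num + 1;             /* V_DELETE: items close the gap      */
}

int main(void)
{
	int src_len[] = { 40, 16, 72, 8 };      /* item lengths in the node   */
	int nr_src = 4;
	enum vmode mode = V_INSERT;
	int affected = 2, ins_len = 24;         /* insert a 24-byte item at 2 */

	int nr_virtual = nr_src + (mode == V_INSERT) - (mode == V_DELETE);

	for (int i = 0; i < nr_virtual; i++) {
		int len = (mode == V_INSERT && i == affected)
			  ? ins_len
			  : src_len[source_index(i, affected, mode)];
		printf("virtual item %d: %d bytes\n", i, len);
	}
	return 0;
}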
To do balancing we need to decide what we - * will shift to left/right neighbor, or to a new node, where new item - * will be etc. To make this analysis simpler we build virtual - * node. Virtual node is an array of items, that will replace items of - * node S. (For instance if we are going to delete an item, virtual - * node does not contain it). Virtual node keeps information about - * item sizes and types, mergeability of first and last items, sizes - * of all entries in directory item. We use this array of items when - * calculating what we can shift to neighbors and how many nodes we - * have to have if we do not any shiftings, if we shift to left/right - * neighbor or to both. - */ - -/* - * Takes item number in virtual node, returns number of item - * that it has in source buffer - */ -static inline int old_item_num(int new_num, int affected_item_num, int mode) -{ - if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) - return new_num; - - if (mode == M_INSERT) { - - RFALSE(new_num == 0, - "vs-8005: for INSERT mode and item number of inserted item"); - - return new_num - 1; - } - - RFALSE(mode != M_DELETE, - "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", - mode); - /* delete mode */ - return new_num + 1; -} - -static void create_virtual_node(struct tree_balance *tb, int h) -{ - struct item_head *ih; - struct virtual_node *vn = tb->tb_vn; - int new_num; - struct buffer_head *Sh; /* this comes from tb->S[h] */ - - Sh = PATH_H_PBUFFER(tb->tb_path, h); - - /* size of changed node */ - vn->vn_size = - MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; - - /* for internal nodes array if virtual items is not created */ - if (h) { - vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); - return; - } - - /* number of items in virtual node */ - vn->vn_nr_item = - B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - - ((vn->vn_mode == M_DELETE) ? 1 : 0); - - /* first virtual item */ - vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); - memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); - vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); - - /* first item in the node */ - ih = item_head(Sh, 0); - - /* define the mergeability for 0-th item (if it is not being deleted) */ - if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) - && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) - vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; - - /* - * go through all items that remain in the virtual - * node (except for the new (inserted) one) - */ - for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { - int j; - struct virtual_item *vi = vn->vn_vi + new_num; - int is_affected = - ((new_num != vn->vn_affected_item_num) ? 
0 : 1); - - if (is_affected && vn->vn_mode == M_INSERT) - continue; - - /* get item number in source node */ - j = old_item_num(new_num, vn->vn_affected_item_num, - vn->vn_mode); - - vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; - vi->vi_ih = ih + j; - vi->vi_item = ih_item_body(Sh, ih + j); - vi->vi_uarea = vn->vn_free_ptr; - - /* - * FIXME: there is no check that item operation did not - * consume too much memory - */ - vn->vn_free_ptr += - op_create_vi(vn, vi, is_affected, tb->insert_size[0]); - if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) - reiserfs_panic(tb->tb_sb, "vs-8030", - "virtual node space consumed"); - - if (!is_affected) - /* this is not being changed */ - continue; - - if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { - vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; - /* pointer to data which is going to be pasted */ - vi->vi_new_data = vn->vn_data; - } - } - - /* virtual inserted item is not defined yet */ - if (vn->vn_mode == M_INSERT) { - struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; - - RFALSE(vn->vn_ins_ih == NULL, - "vs-8040: item header of inserted item is not specified"); - vi->vi_item_len = tb->insert_size[0]; - vi->vi_ih = vn->vn_ins_ih; - vi->vi_item = vn->vn_data; - vi->vi_uarea = vn->vn_free_ptr; - - op_create_vi(vn, vi, 0 /*not pasted or cut */ , - tb->insert_size[0]); - } - - /* - * set right merge flag we take right delimiting key and - * check whether it is a mergeable item - */ - if (tb->CFR[0]) { - struct reiserfs_key *key; - - key = internal_key(tb->CFR[0], tb->rkey[0]); - if (op_is_left_mergeable(key, Sh->b_size) - && (vn->vn_mode != M_DELETE - || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) - vn->vn_vi[vn->vn_nr_item - 1].vi_type |= - VI_TYPE_RIGHT_MERGEABLE; - -#ifdef CONFIG_REISERFS_CHECK - if (op_is_left_mergeable(key, Sh->b_size) && - !(vn->vn_mode != M_DELETE - || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { - /* - * we delete last item and it could be merged - * with right neighbor's first item - */ - if (! - (B_NR_ITEMS(Sh) == 1 - && is_direntry_le_ih(item_head(Sh, 0)) - && ih_entry_count(item_head(Sh, 0)) == 1)) { - /* - * node contains more than 1 item, or item - * is not directory item, or this item - * contains more than 1 entry - */ - print_block(Sh, 0, -1, -1); - reiserfs_panic(tb->tb_sb, "vs-8045", - "rdkey %k, affected item==%d " - "(mode==%c) Must be %c", - key, vn->vn_affected_item_num, - vn->vn_mode, M_DELETE); - } - } -#endif - - } -} - -/* - * Using virtual node check, how many items can be - * shifted to left neighbor - */ -static void check_left(struct tree_balance *tb, int h, int cur_free) -{ - int i; - struct virtual_node *vn = tb->tb_vn; - struct virtual_item *vi; - int d_size, ih_size; - - RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); - - /* internal level */ - if (h > 0) { - tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); - return; - } - - /* leaf level */ - - if (!cur_free || !vn->vn_nr_item) { - /* no free space or nothing to move */ - tb->lnum[h] = 0; - tb->lbytes = -1; - return; - } - - RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), - "vs-8055: parent does not exist or invalid"); - - vi = vn->vn_vi; - if ((unsigned int)cur_free >= - (vn->vn_size - - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { - /* all contents of S[0] fits into L[0] */ - - RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, - "vs-8055: invalid mode or balance condition failed"); - - tb->lnum[0] = vn->vn_nr_item; - tb->lbytes = -1; - return; - } - - d_size = 0, ih_size = IH_SIZE; - - /* first item may be merge with last item in left neighbor */ - if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) - d_size = -((int)IH_SIZE), ih_size = 0; - - tb->lnum[0] = 0; - for (i = 0; i < vn->vn_nr_item; - i++, ih_size = IH_SIZE, d_size = 0, vi++) { - d_size += vi->vi_item_len; - if (cur_free >= d_size) { - /* the item can be shifted entirely */ - cur_free -= d_size; - tb->lnum[0]++; - continue; - } - - /* the item cannot be shifted entirely, try to split it */ - /* - * check whether L[0] can hold ih and at least one byte - * of the item body - */ - - /* cannot shift even a part of the current item */ - if (cur_free <= ih_size) { - tb->lbytes = -1; - return; - } - cur_free -= ih_size; - - tb->lbytes = op_check_left(vi, cur_free, 0, 0); - if (tb->lbytes != -1) - /* count partially shifted item */ - tb->lnum[0]++; - - break; - } - - return; -} - -/* - * Using virtual node check, how many items can be - * shifted to right neighbor - */ -static void check_right(struct tree_balance *tb, int h, int cur_free) -{ - int i; - struct virtual_node *vn = tb->tb_vn; - struct virtual_item *vi; - int d_size, ih_size; - - RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); - - /* internal level */ - if (h > 0) { - tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); - return; - } - - /* leaf level */ - - if (!cur_free || !vn->vn_nr_item) { - /* no free space */ - tb->rnum[h] = 0; - tb->rbytes = -1; - return; - } - - RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), - "vs-8075: parent does not exist or invalid"); - - vi = vn->vn_vi + vn->vn_nr_item - 1; - if ((unsigned int)cur_free >= - (vn->vn_size - - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { - /* all contents of S[0] fits into R[0] */ - - RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, - "vs-8080: invalid mode or balance condition failed"); - - tb->rnum[h] = vn->vn_nr_item; - tb->rbytes = -1; - return; - } - - d_size = 0, ih_size = IH_SIZE; - - /* last item may be merge with first item in right neighbor */ - if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) - d_size = -(int)IH_SIZE, ih_size = 0; - - tb->rnum[0] = 0; - for (i = vn->vn_nr_item - 1; i >= 0; - i--, d_size = 0, ih_size = IH_SIZE, vi--) { - d_size += vi->vi_item_len; - if (cur_free >= d_size) { - /* the item can be shifted entirely */ - cur_free -= d_size; - tb->rnum[0]++; - continue; - } - - /* - * check whether R[0] can hold ih and at least one - * byte of the item body - */ - - /* cannot shift even a part of the current item */ - if (cur_free <= ih_size) { - tb->rbytes = -1; - return; - } - - /* - * R[0] can hold the header of the item and at least - * one byte of its body - */ - cur_free -= ih_size; /* cur_free is still > 0 */ - - tb->rbytes = op_check_right(vi, cur_free); - if (tb->rbytes != -1) - /* count partially shifted item */ - tb->rnum[0]++; - - break; - } - - return; -} - -/* - * from - number of items, which are shifted to left neighbor entirely - * to - number of item, which are shifted to right neighbor entirely - * from_bytes - number of bytes of boundary item (or directory entries) - * which are shifted to left neighbor - * to_bytes - number of bytes of boundary item (or directory entries) - * which are shifted to right neighbor - */ -static int get_num_ver(int mode, struct tree_balance *tb, int h, - int from, int from_bytes, - int to, int to_bytes, short *snum012, int flow) -{ - int i; - int units; - struct virtual_node *vn = tb->tb_vn; - int total_node_size, max_node_size, current_item_size; - int needed_nodes; - - /* position of item we start filling node from */ - int start_item; - - /* position of item we finish filling node by */ - int end_item; - - /* - * number of first bytes (entries for directory) of start_item-th item - * we do not include into node that is being filled - */ - int start_bytes; - - /* - * number of last bytes (entries for directory) of end_item-th item - * we do node include into node that is being filled - */ - int end_bytes; - - /* - * these are positions in virtual item of items, that are split - * between S[0] and S1new and S1new and S2new - */ - int split_item_positions[2]; - - split_item_positions[0] = -1; - split_item_positions[1] = -1; - - /* - * We only create additional nodes if we are in insert or paste mode - * or we are in replace mode at the internal level. If h is 0 and - * the mode is M_REPLACE then in fix_nodes we change the mode to - * paste or insert before we get here in the code. - */ - RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), - "vs-8100: insert_size < 0 in overflow"); - - max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); - - /* - * snum012 [0-2] - number of items, that lay - * to S[0], first new node and second new node - */ - snum012[3] = -1; /* s1bytes */ - snum012[4] = -1; /* s2bytes */ - - /* internal level */ - if (h > 0) { - i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); - if (i == max_node_size) - return 1; - return (i / max_node_size + 1); - } - - /* leaf level */ - needed_nodes = 1; - total_node_size = 0; - - /* start from 'from'-th item */ - start_item = from; - /* skip its first 'start_bytes' units */ - start_bytes = ((from_bytes != -1) ? 
from_bytes : 0); - - /* last included item is the 'end_item'-th one */ - end_item = vn->vn_nr_item - to - 1; - /* do not count last 'end_bytes' units of 'end_item'-th item */ - end_bytes = (to_bytes != -1) ? to_bytes : 0; - - /* - * go through all item beginning from the start_item-th item - * and ending by the end_item-th item. Do not count first - * 'start_bytes' units of 'start_item'-th item and last - * 'end_bytes' of 'end_item'-th item - */ - for (i = start_item; i <= end_item; i++) { - struct virtual_item *vi = vn->vn_vi + i; - int skip_from_end = ((i == end_item) ? end_bytes : 0); - - RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); - - /* get size of current item */ - current_item_size = vi->vi_item_len; - - /* - * do not take in calculation head part (from_bytes) - * of from-th item - */ - current_item_size -= - op_part_size(vi, 0 /*from start */ , start_bytes); - - /* do not take in calculation tail part of last item */ - current_item_size -= - op_part_size(vi, 1 /*from end */ , skip_from_end); - - /* if item fits into current node entierly */ - if (total_node_size + current_item_size <= max_node_size) { - snum012[needed_nodes - 1]++; - total_node_size += current_item_size; - start_bytes = 0; - continue; - } - - /* - * virtual item length is longer, than max size of item in - * a node. It is impossible for direct item - */ - if (current_item_size > max_node_size) { - RFALSE(is_direct_le_ih(vi->vi_ih), - "vs-8110: " - "direct item length is %d. It can not be longer than %d", - current_item_size, max_node_size); - /* we will try to split it */ - flow = 1; - } - - /* as we do not split items, take new node and continue */ - if (!flow) { - needed_nodes++; - i--; - total_node_size = 0; - continue; - } - - /* - * calculate number of item units which fit into node being - * filled - */ - { - int free_space; - - free_space = max_node_size - total_node_size - IH_SIZE; - units = - op_check_left(vi, free_space, start_bytes, - skip_from_end); - /* - * nothing fits into current node, take new - * node and continue - */ - if (units == -1) { - needed_nodes++, i--, total_node_size = 0; - continue; - } - } - - /* something fits into the current node */ - start_bytes += units; - snum012[needed_nodes - 1 + 3] = units; - - if (needed_nodes > 2) - reiserfs_warning(tb->tb_sb, "vs-8111", - "split_item_position is out of range"); - snum012[needed_nodes - 1]++; - split_item_positions[needed_nodes - 1] = i; - needed_nodes++; - /* continue from the same item with start_bytes != -1 */ - start_item = i; - i--; - total_node_size = 0; - } - - /* - * sum012[4] (if it is not -1) contains number of units of which - * are to be in S1new, snum012[3] - to be in S0. They are supposed - * to be S1bytes and S2bytes correspondingly, so recalculate - */ - if (snum012[4] > 0) { - int split_item_num; - int bytes_to_r, bytes_to_l; - int bytes_to_S1new; - - split_item_num = split_item_positions[1]; - bytes_to_l = - ((from == split_item_num - && from_bytes != -1) ? from_bytes : 0); - bytes_to_r = - ((end_item == split_item_num - && end_bytes != -1) ? end_bytes : 0); - bytes_to_S1new = - ((split_item_positions[0] == - split_item_positions[1]) ? 
snum012[3] : 0); - - /* s2bytes */ - snum012[4] = - op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - - bytes_to_r - bytes_to_l - bytes_to_S1new; - - if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && - vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) - reiserfs_warning(tb->tb_sb, "vs-8115", - "not directory or indirect item"); - } - - /* now we know S2bytes, calculate S1bytes */ - if (snum012[3] > 0) { - int split_item_num; - int bytes_to_r, bytes_to_l; - int bytes_to_S2new; - - split_item_num = split_item_positions[0]; - bytes_to_l = - ((from == split_item_num - && from_bytes != -1) ? from_bytes : 0); - bytes_to_r = - ((end_item == split_item_num - && end_bytes != -1) ? end_bytes : 0); - bytes_to_S2new = - ((split_item_positions[0] == split_item_positions[1] - && snum012[4] != -1) ? snum012[4] : 0); - - /* s1bytes */ - snum012[3] = - op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - - bytes_to_r - bytes_to_l - bytes_to_S2new; - } - - return needed_nodes; -} - - -/* - * Set parameters for balancing. - * Performs write of results of analysis of balancing into structure tb, - * where it will later be used by the functions that actually do the balancing. - * Parameters: - * tb tree_balance structure; - * h current level of the node; - * lnum number of items from S[h] that must be shifted to L[h]; - * rnum number of items from S[h] that must be shifted to R[h]; - * blk_num number of blocks that S[h] will be splitted into; - * s012 number of items that fall into splitted nodes. - * lbytes number of bytes which flow to the left neighbor from the - * item that is not shifted entirely - * rbytes number of bytes which flow to the right neighbor from the - * item that is not shifted entirely - * s1bytes number of bytes which flow to the first new node when - * S[0] splits (this number is contained in s012 array) - */ - -static void set_parameters(struct tree_balance *tb, int h, int lnum, - int rnum, int blk_num, short *s012, int lb, int rb) -{ - - tb->lnum[h] = lnum; - tb->rnum[h] = rnum; - tb->blknum[h] = blk_num; - - /* only for leaf level */ - if (h == 0) { - if (s012 != NULL) { - tb->s0num = *s012++; - tb->snum[0] = *s012++; - tb->snum[1] = *s012++; - tb->sbytes[0] = *s012++; - tb->sbytes[1] = *s012; - } - tb->lbytes = lb; - tb->rbytes = rb; - } - PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); - PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); - - PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); - PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); -} - -/* - * check if node disappears if we shift tb->lnum[0] items to left - * neighbor and tb->rnum[0] to the right one. - */ -static int is_leaf_removable(struct tree_balance *tb) -{ - struct virtual_node *vn = tb->tb_vn; - int to_left, to_right; - int size; - int remain_items; - - /* - * number of items that will be shifted to left (right) neighbor - * entirely - */ - to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); - to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); - remain_items = vn->vn_nr_item; - - /* how many items remain in S[0] after shiftings to neighbors */ - remain_items -= (to_left + to_right); - - /* all content of node can be shifted to neighbors */ - if (remain_items < 1) { - set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, - NULL, -1, -1); - return 1; - } - - /* S[0] is not removable */ - if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) - return 0; - - /* check whether we can divide 1 remaining item between neighbors */ - - /* get size of remaining item (in item units) */ - size = op_unit_num(&vn->vn_vi[to_left]); - - if (tb->lbytes + tb->rbytes >= size) { - set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, - tb->lbytes, -1); - return 1; - } - - return 0; -} - -/* check whether L, S, R can be joined in one node */ -static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) -{ - struct virtual_node *vn = tb->tb_vn; - int ih_size; - struct buffer_head *S0; - - S0 = PATH_H_PBUFFER(tb->tb_path, 0); - - ih_size = 0; - if (vn->vn_nr_item) { - if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) - ih_size += IH_SIZE; - - if (vn->vn_vi[vn->vn_nr_item - 1]. - vi_type & VI_TYPE_RIGHT_MERGEABLE) - ih_size += IH_SIZE; - } else { - /* there was only one item and it will be deleted */ - struct item_head *ih; - - RFALSE(B_NR_ITEMS(S0) != 1, - "vs-8125: item number must be 1: it is %d", - B_NR_ITEMS(S0)); - - ih = item_head(S0, 0); - if (tb->CFR[0] - && !comp_short_le_keys(&ih->ih_key, - internal_key(tb->CFR[0], - tb->rkey[0]))) - /* - * Directory must be in correct state here: that is - * somewhere at the left side should exist first - * directory item. But the item being deleted can - * not be that first one because its right neighbor - * is item of the same directory. (But first item - * always gets deleted in last turn). 
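is_leaf_removable(), just above, decides whether the leaf vanishes after shifting: whole items go left and right, and at most one remaining item may be split by bytes between the two neighbors. A stand-alone sketch of that decision with illustrative numbers (not the reiserfs types):

#include <stdio.h>

/* lnum/rnum: items counted toward the left/right neighbor; lbytes/rbytes:
 * -1 means the boundary item moves whole, otherwise only that many units
 * of it move; middle_units: size of the one item that may remain */
static int leaf_removable(int nr_item, int lnum, int lbytes,
			  int rnum, int rbytes, int middle_units)
{
	int to_left  = lnum - (lbytes != -1);   /* whole items going left  */
	int to_right = rnum - (rbytes != -1);   /* whole items going right */
	int remain   = nr_item - to_left - to_right;

	if (remain < 1)
		return 1;                       /* everything moves whole  */
	if (remain > 1 || lbytes == -1 || rbytes == -1)
		return 0;                       /* cannot split the rest   */
	/* one item remains: split it if the two partial moves cover it   */
	return lbytes + rbytes >= middle_units;
}

int main(void)
{
	printf("%d\n", leaf_removable(5, 3, -1, 2, -1, 0)); /* 1: all whole */
	printf("%d\n", leaf_removable(5, 3, 4, 3, 3, 9));   /* 0: 4+3 < 9   */
	printf("%d\n", leaf_removable(5, 3, 4, 3, 6, 9));   /* 1: 4+6 >= 9  */
	return 0;
}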
So, neighbors - * of deleted item can be merged, so we can save - * ih_size - */ - if (is_direntry_le_ih(ih)) { - ih_size = IH_SIZE; - - /* - * we might check that left neighbor exists - * and is of the same directory - */ - RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, - "vs-8130: first directory item can not be removed until directory is not empty"); - } - - } - - if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { - set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); - PROC_INFO_INC(tb->tb_sb, leaves_removable); - return 1; - } - return 0; - -} - -/* when we do not split item, lnum and rnum are numbers of entire items */ -#define SET_PAR_SHIFT_LEFT \ -if (h)\ -{\ - int to_l;\ - \ - to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ - (MAX_NR_KEY(Sh) + 1 - lpar);\ - \ - set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ -}\ -else \ -{\ - if (lset==LEFT_SHIFT_FLOW)\ - set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ - tb->lbytes, -1);\ - else\ - set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ - -1, -1);\ -} - -#define SET_PAR_SHIFT_RIGHT \ -if (h)\ -{\ - int to_r;\ - \ - to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ - \ - set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ -}\ -else \ -{\ - if (rset==RIGHT_SHIFT_FLOW)\ - set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ - -1, tb->rbytes);\ - else\ - set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ - -1, -1);\ -} - -static void free_buffers_in_tb(struct tree_balance *tb) -{ - int i; - - pathrelse(tb->tb_path); - - for (i = 0; i < MAX_HEIGHT; i++) { - brelse(tb->L[i]); - brelse(tb->R[i]); - brelse(tb->FL[i]); - brelse(tb->FR[i]); - brelse(tb->CFL[i]); - brelse(tb->CFR[i]); - - tb->L[i] = NULL; - tb->R[i] = NULL; - tb->FL[i] = NULL; - tb->FR[i] = NULL; - tb->CFL[i] = NULL; - tb->CFR[i] = NULL; - } -} - -/* - * Get new buffers for storing new nodes that are created while balancing. - * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; - * NO_DISK_SPACE - no disk space. - */ -/* The function is NOT SCHEDULE-SAFE! */ -static int get_empty_nodes(struct tree_balance *tb, int h) -{ - struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); - b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; - int counter, number_of_freeblk; - int amount_needed; /* number of needed empty blocks */ - int retval = CARRY_ON; - struct super_block *sb = tb->tb_sb; - - /* - * number_of_freeblk is the number of empty blocks which have been - * acquired for use by the balancing algorithm minus the number of - * empty blocks used in the previous levels of the analysis, - * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule - * occurs after empty blocks are acquired, and the balancing analysis - * is then restarted, amount_needed is the number needed by this - * level (h) of the balancing analysis. - * - * Note that for systems with many processes writing, it would be - * more layout optimal to calculate the total number needed by all - * levels and then to run reiserfs_new_blocks to get all of them at - * once. - */ - - /* - * Initiate number_of_freeblk to the amount acquired prior to the - * restart of the analysis or 0 if not restarted, then subtract the - * amount needed by all of the levels of the tree below h. 
- */ - /* blknum includes S[h], so we subtract 1 in this calculation */ - for (counter = 0, number_of_freeblk = tb->cur_blknum; - counter < h; counter++) - number_of_freeblk -= - (tb->blknum[counter]) ? (tb->blknum[counter] - - 1) : 0; - - /* Allocate missing empty blocks. */ - /* if Sh == 0 then we are getting a new root */ - amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; - /* - * Amount_needed = the amount that we need more than the - * amount that we have. - */ - if (amount_needed > number_of_freeblk) - amount_needed -= number_of_freeblk; - else /* If we have enough already then there is nothing to do. */ - return CARRY_ON; - - /* - * No need to check quota - is not allocated for blocks used - * for formatted nodes - */ - if (reiserfs_new_form_blocknrs(tb, blocknrs, - amount_needed) == NO_DISK_SPACE) - return NO_DISK_SPACE; - - /* for each blocknumber we just got, get a buffer and stick it on FEB */ - for (blocknr = blocknrs, counter = 0; - counter < amount_needed; blocknr++, counter++) { - - RFALSE(!*blocknr, - "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); - - new_bh = sb_getblk(sb, *blocknr); - RFALSE(buffer_dirty(new_bh) || - buffer_journaled(new_bh) || - buffer_journal_dirty(new_bh), - "PAP-8140: journaled or dirty buffer %b for the new block", - new_bh); - - /* Put empty buffers into the array. */ - RFALSE(tb->FEB[tb->cur_blknum], - "PAP-8141: busy slot for new buffer"); - - set_buffer_journal_new(new_bh); - tb->FEB[tb->cur_blknum++] = new_bh; - } - - if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) - retval = REPEAT_SEARCH; - - return retval; -} - -/* - * Get free space of the left neighbor, which is stored in the parent - * node of the left neighbor. - */ -static int get_lfree(struct tree_balance *tb, int h) -{ - struct buffer_head *l, *f; - int order; - - if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || - (l = tb->FL[h]) == NULL) - return 0; - - if (f == l) - order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; - else { - order = B_NR_ITEMS(l); - f = l; - } - - return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); -} - -/* - * Get free space of the right neighbor, - * which is stored in the parent node of the right neighbor. - */ -static int get_rfree(struct tree_balance *tb, int h) -{ - struct buffer_head *r, *f; - int order; - - if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || - (r = tb->FR[h]) == NULL) - return 0; - - if (f == r) - order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; - else { - order = 0; - f = r; - } - - return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); - -} - -/* Check whether left neighbor is in memory. */ -static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) -{ - struct buffer_head *father, *left; - struct super_block *sb = tb->tb_sb; - b_blocknr_t left_neighbor_blocknr; - int left_neighbor_position; - - /* Father of the left neighbor does not exist. */ - if (!tb->FL[h]) - return 0; - - /* Calculate father of the node to be balanced. */ - father = PATH_H_PBUFFER(tb->tb_path, h + 1); - - RFALSE(!father || - !B_IS_IN_TREE(father) || - !B_IS_IN_TREE(tb->FL[h]) || - !buffer_uptodate(father) || - !buffer_uptodate(tb->FL[h]), - "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", - father, tb->FL[h]); - - /* - * Get position of the pointer to the left neighbor - * into the left father. - */ - left_neighbor_position = (father == tb->FL[h]) ? - tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); - /* Get left neighbor block number. 
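get_empty_nodes(), above, only allocates the blocks that earlier levels of the analysis have not already earmarked. A stand-alone sketch of that accounting with illustrative values (the helper name is made up):

#include <stdio.h>

static int blocks_still_needed(const int *blknum, int h, int cur_blknum,
			       int have_s_h)
{
	int freeblk = cur_blknum;

	/* blocks already earmarked by the analysis of the levels below h;
	 * blknum[] counts S[level] itself, hence the "- 1" */
	for (int level = 0; level < h; level++)
		freeblk -= blknum[level] ? blknum[level] - 1 : 0;

	/* a missing S[h] means we are growing a new root: one block */
	int needed = have_s_h ? blknum[h] - 1 : 1;

	return needed > freeblk ? needed - freeblk : 0;
}

int main(void)
{
	/* leaf splits into 3 nodes, level 1 into 2; nothing acquired yet,
	 * then 2 blocks already held when level 1 is analysed */
	int blknum[] = { 3, 2 };

	printf("level 0 needs %d more\n", blocks_still_needed(blknum, 0, 0, 1));
	printf("level 1 needs %d more\n", blocks_still_needed(blknum, 1, 2, 1));
	return 0;
}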
*/ - left_neighbor_blocknr = - B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); - /* Look for the left neighbor in the cache. */ - if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { - - RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), - "vs-8170: left neighbor (%b %z) is not in the tree", - left, left); - put_bh(left); - return 1; - } - - return 0; -} - -#define LEFT_PARENTS 'l' -#define RIGHT_PARENTS 'r' - -static void decrement_key(struct cpu_key *key) -{ - /* call item specific function for this key */ - item_ops[cpu_key_k_type(key)]->decrement_key(key); -} - -/* - * Calculate far left/right parent of the left/right neighbor of the - * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor - * of the parent F[h]. - * Calculate left/right common parent of the current node and L[h]/R[h]. - * Calculate left/right delimiting key position. - * Returns: PATH_INCORRECT - path in the tree is not correct - * SCHEDULE_OCCURRED - schedule occurred while the function worked - * CARRY_ON - schedule didn't occur while the function - * worked - */ -static int get_far_parent(struct tree_balance *tb, - int h, - struct buffer_head **pfather, - struct buffer_head **pcom_father, char c_lr_par) -{ - struct buffer_head *parent; - INITIALIZE_PATH(s_path_to_neighbor_father); - struct treepath *path = tb->tb_path; - struct cpu_key s_lr_father_key; - int counter, - position = INT_MAX, - first_last_position = 0, - path_offset = PATH_H_PATH_OFFSET(path, h); - - /* - * Starting from F[h] go upwards in the tree, and look for the common - * ancestor of F[h], and its neighbor l/r, that should be obtained. - */ - - counter = path_offset; - - RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, - "PAP-8180: invalid path length"); - - for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { - /* - * Check whether parent of the current buffer in the path - * is really parent in the tree. - */ - if (!B_IS_IN_TREE - (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) - return REPEAT_SEARCH; - - /* Check whether position in the parent is correct. */ - if ((position = - PATH_OFFSET_POSITION(path, - counter - 1)) > - B_NR_ITEMS(parent)) - return REPEAT_SEARCH; - - /* - * Check whether parent at the path really points - * to the child. - */ - if (B_N_CHILD_NUM(parent, position) != - PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) - return REPEAT_SEARCH; - - /* - * Return delimiting key if position in the parent is not - * equal to first/last one. - */ - if (c_lr_par == RIGHT_PARENTS) - first_last_position = B_NR_ITEMS(parent); - if (position != first_last_position) { - *pcom_father = parent; - get_bh(*pcom_father); - /*(*pcom_father = parent)->b_count++; */ - break; - } - } - - /* if we are in the root of the tree, then there is no common father */ - if (counter == FIRST_PATH_ELEMENT_OFFSET) { - /* - * Check whether first buffer in the path is the - * root of the tree. - */ - if (PATH_OFFSET_PBUFFER - (tb->tb_path, - FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK(tb->tb_sb)) { - *pfather = *pcom_father = NULL; - return CARRY_ON; - } - return REPEAT_SEARCH; - } - - RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, - "PAP-8185: (%b %z) level too small", - *pcom_father, *pcom_father); - - /* Check whether the common parent is locked. 
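get_far_parent(), above, climbs the path until it finds an ancestor in which the path does not follow the first (left case) or last (right case) child pointer; the neighbor's subtree hangs off that ancestor. A stand-alone sketch of the walk over a simplified path representation (made-up types):

#include <stdio.h>

struct path_elem {
	int position;   /* which child pointer the path follows */
	int nr_items;   /* number of keys in that ancestor node */
};

/* index of the common ancestor, or -1 if the path hugs the tree edge all
 * the way to the root (no such neighbor exists) */
static int far_parent(const struct path_elem *path, int depth, int want_right)
{
	for (int i = depth - 1; i >= 0; i--) {
		int edge = want_right ? path[i].nr_items : 0;

		if (path[i].position != edge)
			return i;
	}
	return -1;
}

int main(void)
{
	/* root -> internal -> leaf parent; below the root the path always
	 * takes child 0, so the left neighbor branches off at the root */
	struct path_elem path[] = { {2, 5}, {0, 7}, {0, 3} };

	printf("left common ancestor: level %d\n", far_parent(path, 3, 0));
	printf("right common ancestor: level %d\n", far_parent(path, 3, 1));
	return 0;
}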
*/ - - if (buffer_locked(*pcom_father)) { - - /* Release the write lock while the buffer is busy */ - int depth = reiserfs_write_unlock_nested(tb->tb_sb); - __wait_on_buffer(*pcom_father); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (FILESYSTEM_CHANGED_TB(tb)) { - brelse(*pcom_father); - return REPEAT_SEARCH; - } - } - - /* - * So, we got common parent of the current node and its - * left/right neighbor. Now we are getting the parent of the - * left/right neighbor. - */ - - /* Form key to get parent of the left/right neighbor. */ - le_key2cpu_key(&s_lr_father_key, - internal_key(*pcom_father, - (c_lr_par == - LEFT_PARENTS) ? (tb->lkey[h - 1] = - position - - 1) : (tb->rkey[h - - 1] = - position))); - - if (c_lr_par == LEFT_PARENTS) - decrement_key(&s_lr_father_key); - - if (search_by_key - (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, - h + 1) == IO_ERROR) - /* path is released */ - return IO_ERROR; - - if (FILESYSTEM_CHANGED_TB(tb)) { - pathrelse(&s_path_to_neighbor_father); - brelse(*pcom_father); - return REPEAT_SEARCH; - } - - *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); - - RFALSE(B_LEVEL(*pfather) != h + 1, - "PAP-8190: (%b %z) level too small", *pfather, *pfather); - RFALSE(s_path_to_neighbor_father.path_length < - FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); - - s_path_to_neighbor_father.path_length--; - pathrelse(&s_path_to_neighbor_father); - return CARRY_ON; -} - -/* - * Get parents of neighbors of node in the path(S[path_offset]) and - * common parents of S[path_offset] and L[path_offset]/R[path_offset]: - * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], - * CFR[path_offset]. - * Calculate numbers of left and right delimiting keys position: - * lkey[path_offset], rkey[path_offset]. - * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked - * CARRY_ON - schedule didn't occur while the function worked - */ -static int get_parents(struct tree_balance *tb, int h) -{ - struct treepath *path = tb->tb_path; - int position, - ret, - path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); - struct buffer_head *curf, *curcf; - - /* Current node is the root of the tree or will be root of the tree */ - if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { - /* - * The root can not have parents. - * Release nodes which previously were obtained as - * parents of the current node neighbors. - */ - brelse(tb->FL[h]); - brelse(tb->CFL[h]); - brelse(tb->FR[h]); - brelse(tb->CFR[h]); - tb->FL[h] = NULL; - tb->CFL[h] = NULL; - tb->FR[h] = NULL; - tb->CFR[h] = NULL; - return CARRY_ON; - } - - /* Get parent FL[path_offset] of L[path_offset]. */ - position = PATH_OFFSET_POSITION(path, path_offset - 1); - if (position) { - /* Current node is not the first child of its parent. */ - curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); - curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); - get_bh(curf); - get_bh(curf); - tb->lkey[h] = position - 1; - } else { - /* - * Calculate current parent of L[path_offset], which is the - * left neighbor of the current node. Calculate current - * common parent of L[path_offset] and the current node. - * Note that CFL[path_offset] not equal FL[path_offset] and - * CFL[path_offset] not equal F[path_offset]. - * Calculate lkey[path_offset]. - */ - if ((ret = get_far_parent(tb, h + 1, &curf, - &curcf, - LEFT_PARENTS)) != CARRY_ON) - return ret; - } - - brelse(tb->FL[h]); - tb->FL[h] = curf; /* New initialization of FL[h]. 
*/ - brelse(tb->CFL[h]); - tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ - - RFALSE((curf && !B_IS_IN_TREE(curf)) || - (curcf && !B_IS_IN_TREE(curcf)), - "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); - - /* Get parent FR[h] of R[h]. */ - - /* Current node is the last child of F[h]. FR[h] != F[h]. */ - if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { - /* - * Calculate current parent of R[h], which is the right - * neighbor of F[h]. Calculate current common parent of - * R[h] and current node. Note that CFR[h] not equal - * FR[path_offset] and CFR[h] not equal F[h]. - */ - if ((ret = - get_far_parent(tb, h + 1, &curf, &curcf, - RIGHT_PARENTS)) != CARRY_ON) - return ret; - } else { - /* Current node is not the last child of its parent F[h]. */ - curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); - curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); - get_bh(curf); - get_bh(curf); - tb->rkey[h] = position; - } - - brelse(tb->FR[h]); - /* New initialization of FR[path_offset]. */ - tb->FR[h] = curf; - - brelse(tb->CFR[h]); - /* New initialization of CFR[path_offset]. */ - tb->CFR[h] = curcf; - - RFALSE((curf && !B_IS_IN_TREE(curf)) || - (curcf && !B_IS_IN_TREE(curcf)), - "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); - - return CARRY_ON; -} - -/* - * it is possible to remove node as result of shiftings to - * neighbors even when we insert or paste item. - */ -static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, - struct tree_balance *tb, int h) -{ - struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); - int levbytes = tb->insert_size[h]; - struct item_head *ih; - struct reiserfs_key *r_key = NULL; - - ih = item_head(Sh, 0); - if (tb->CFR[h]) - r_key = internal_key(tb->CFR[h], tb->rkey[h]); - - if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes - /* shifting may merge items which might save space */ - - - ((!h - && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) - - - ((!h && r_key - && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) - + ((h) ? KEY_SIZE : 0)) { - /* node can not be removed */ - if (sfree >= levbytes) { - /* new item fits into node S[h] without any shifting */ - if (!h) - tb->s0num = - B_NR_ITEMS(Sh) + - ((mode == M_INSERT) ? 1 : 0); - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - } - PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); - return !NO_BALANCING_NEEDED; -} - -/* - * Check whether current node S[h] is balanced when increasing its size by - * Inserting or Pasting. - * Calculate parameters for balancing for current level h. - * Parameters: - * tb tree_balance structure; - * h current level of the node; - * inum item number in S[h]; - * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; - * 0 - balancing for higher levels needed; - * -1 - no balancing for higher levels needed; - * -2 - no disk space. - */ -/* ip means Inserting or Pasting */ -static int ip_check_balance(struct tree_balance *tb, int h) -{ - struct virtual_node *vn = tb->tb_vn; - /* - * Number of bytes that must be inserted into (value is negative - * if bytes are deleted) buffer which contains node being balanced. - * The mnemonic is that the attempted change in node space used - * level is levbytes bytes. 
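can_node_be_removed(), above, lets the analysis stop early when the three nodes together could not absorb the whole of S[h] anyway and the new data still fits into S[h] by itself. A stand-alone sketch of that space test for the leaf case (illustrative numbers; the merge-savings parameters stand in for the IH_SIZE terms):

#include <stdio.h>

static int no_balancing_needed(int lfree, int sfree, int rfree,
			       int node_size, int levbytes,
			       int left_merge_saves, int right_merge_saves)
{
	/* bytes the three nodes would need to hold everything after the
	 * operation; merging a boundary item saves one item head per side */
	int needed = node_size + levbytes - left_merge_saves - right_merge_saves;

	if (lfree + sfree + rfree < needed && sfree >= levbytes)
		return 1;   /* S cannot be emptied, but the insert fits  */
	return 0;           /* keep analysing (shift, split or remove)   */
}

int main(void)
{
	/* roughly 4k leaf, inserting 120 bytes, no boundary merges possible */
	printf("%d\n", no_balancing_needed(60, 900, 40, 4048, 120, 0, 0));
	return 0;
}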
- */ - int levbytes; - int ret; - - int lfree, sfree, rfree /* free space in L, S and R */ ; - - /* - * nver is short for number of vertixes, and lnver is the number if - * we shift to the left, rnver is the number if we shift to the - * right, and lrnver is the number if we shift in both directions. - * The goal is to minimize first the number of vertixes, and second, - * the number of vertixes whose contents are changed by shifting, - * and third the number of uncached vertixes whose contents are - * changed by shifting and must be read from disk. - */ - int nver, lnver, rnver, lrnver; - - /* - * used at leaf level only, S0 = S[0] is the node being balanced, - * sInum [ I = 0,1,2 ] is the number of items that will - * remain in node SI after balancing. S1 and S2 are new - * nodes that might be created. - */ - - /* - * we perform 8 calls to get_num_ver(). For each call we - * calculate five parameters. where 4th parameter is s1bytes - * and 5th - s2bytes - * - * s0num, s1num, s2num for 8 cases - * 0,1 - do not shift and do not shift but bottle - * 2 - shift only whole item to left - * 3 - shift to left and bottle as much as possible - * 4,5 - shift to right (whole items and as much as possible - * 6,7 - shift to both directions (whole items and as much as possible) - */ - short snum012[40] = { 0, }; - - /* Sh is the node whose balance is currently being checked */ - struct buffer_head *Sh; - - Sh = PATH_H_PBUFFER(tb->tb_path, h); - levbytes = tb->insert_size[h]; - - /* Calculate balance parameters for creating new root. */ - if (!Sh) { - if (!h) - reiserfs_panic(tb->tb_sb, "vs-8210", - "S[0] can not be 0"); - switch (ret = get_empty_nodes(tb, h)) { - /* no balancing for higher levels needed */ - case CARRY_ON: - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - - case NO_DISK_SPACE: - case REPEAT_SEARCH: - return ret; - default: - reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " - "return value of get_empty_nodes"); - } - } - - /* get parents of S[h] neighbors. */ - ret = get_parents(tb, h); - if (ret != CARRY_ON) - return ret; - - sfree = B_FREE_SPACE(Sh); - - /* get free space of neighbors */ - rfree = get_rfree(tb, h); - lfree = get_lfree(tb, h); - - /* and new item fits into node S[h] without any shifting */ - if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == - NO_BALANCING_NEEDED) - return NO_BALANCING_NEEDED; - - create_virtual_node(tb, h); - - /* - * determine maximal number of items we can shift to the left - * neighbor (in tb structure) and the maximal number of bytes - * that can flow to the left neighbor from the left most liquid - * item that cannot be shifted from S[0] entirely (returned value) - */ - check_left(tb, h, lfree); - - /* - * determine maximal number of items we can shift to the right - * neighbor (in tb structure) and the maximal number of bytes - * that can flow to the right neighbor from the right most liquid - * item that cannot be shifted from S[0] entirely (returned value) - */ - check_right(tb, h, rfree); - - /* - * all contents of internal node S[h] can be moved into its - * neighbors, S[h] will be removed after balancing - */ - if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { - int to_r; - - /* - * Since we are working on internal nodes, and our internal - * nodes have fixed size entries, then we can balance by the - * number of items rather than the space they consume. 
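When an internal S[h] can be dissolved into its neighbors, the to_r expression above sends just enough entries right that L[h] and R[h] end up nearly even. A stand-alone sketch of the same arithmetic (max_keys plays the role of MAX_NR_KEY(Sh)):

#include <stdio.h>

static int items_to_the_right(int max_keys, int lnum, int rnum, int nr_item)
{
	/* lnum/rnum say how many entries still fit into L/R, so the nodes
	 * currently hold (max_keys + 1 - lnum) and (max_keys + 1 - rnum) */
	int in_left  = max_keys + 1 - lnum;
	int in_right = max_keys + 1 - rnum;
	int total    = in_left + in_right + nr_item + 1;  /* S[h]'s entries */

	return total / 2 - in_right;   /* bring R up to half of the total  */
}

int main(void)
{
	/* neighbors hold 42 entries each; S[h] contributes 11 (nr_item + 1) */
	int to_r = items_to_the_right(84, 43, 43, 10);

	printf("shift %d right, %d left\n", to_r, 10 + 1 - to_r);
	return 0;
}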
In this - * routine we set the left node equal to the right node, - * allowing a difference of less than or equal to 1 child - * pointer. - */ - to_r = - ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + - vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - - tb->rnum[h]); - set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, - -1, -1); - return CARRY_ON; - } - - /* - * this checks balance condition, that any two neighboring nodes - * can not fit in one node - */ - RFALSE(h && - (tb->lnum[h] >= vn->vn_nr_item + 1 || - tb->rnum[h] >= vn->vn_nr_item + 1), - "vs-8220: tree is not balanced on internal level"); - RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || - (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), - "vs-8225: tree is not balanced on leaf level"); - - /* - * all contents of S[0] can be moved into its neighbors - * S[0] will be removed after balancing. - */ - if (!h && is_leaf_removable(tb)) - return CARRY_ON; - - /* - * why do we perform this check here rather than earlier?? - * Answer: we can win 1 node in some cases above. Moreover we - * checked it above, when we checked, that S[0] is not removable - * in principle - */ - - /* new item fits into node S[h] without any shifting */ - if (sfree >= levbytes) { - if (!h) - tb->s0num = vn->vn_nr_item; - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - - { - int lpar, rpar, nset, lset, rset, lrset; - /* regular overflowing of the node */ - - /* - * get_num_ver works in 2 modes (FLOW & NO_FLOW) - * lpar, rpar - number of items we can shift to left/right - * neighbor (including splitting item) - * nset, lset, rset, lrset - shows, whether flowing items - * give better packing - */ -#define FLOW 1 -#define NO_FLOW 0 /* do not any splitting */ - - /* we choose one of the following */ -#define NOTHING_SHIFT_NO_FLOW 0 -#define NOTHING_SHIFT_FLOW 5 -#define LEFT_SHIFT_NO_FLOW 10 -#define LEFT_SHIFT_FLOW 15 -#define RIGHT_SHIFT_NO_FLOW 20 -#define RIGHT_SHIFT_FLOW 25 -#define LR_SHIFT_NO_FLOW 30 -#define LR_SHIFT_FLOW 35 - - lpar = tb->lnum[h]; - rpar = tb->rnum[h]; - - /* - * calculate number of blocks S[h] must be split into when - * nothing is shifted to the neighbors, as well as number of - * items in each part of the split node (s012 numbers), - * and number of bytes (s1bytes) of the shared drop which - * flow to S1 if any - */ - nset = NOTHING_SHIFT_NO_FLOW; - nver = get_num_ver(vn->vn_mode, tb, h, - 0, -1, h ? vn->vn_nr_item : 0, -1, - snum012, NO_FLOW); - - if (!h) { - int nver1; - - /* - * note, that in this case we try to bottle - * between S[0] and S1 (S1 - the first new node) - */ - nver1 = get_num_ver(vn->vn_mode, tb, h, - 0, -1, 0, -1, - snum012 + NOTHING_SHIFT_FLOW, FLOW); - if (nver > nver1) - nset = NOTHING_SHIFT_FLOW, nver = nver1; - } - - /* - * calculate number of blocks S[h] must be split into when - * l_shift_num first items and l_shift_bytes of the right - * most liquid item to be shifted are shifted to the left - * neighbor, as well as number of items in each part of the - * splitted node (s012 numbers), and number of bytes - * (s1bytes) of the shared drop which flow to S1 if any - */ - lset = LEFT_SHIFT_NO_FLOW; - lnver = get_num_ver(vn->vn_mode, tb, h, - lpar - ((h || tb->lbytes == -1) ? 0 : 1), - -1, h ? vn->vn_nr_item : 0, -1, - snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); - if (!h) { - int lnver1; - - lnver1 = get_num_ver(vn->vn_mode, tb, h, - lpar - - ((tb->lbytes != -1) ? 
1 : 0), - tb->lbytes, 0, -1, - snum012 + LEFT_SHIFT_FLOW, FLOW); - if (lnver > lnver1) - lset = LEFT_SHIFT_FLOW, lnver = lnver1; - } - - /* - * calculate number of blocks S[h] must be split into when - * r_shift_num first items and r_shift_bytes of the left most - * liquid item to be shifted are shifted to the right neighbor, - * as well as number of items in each part of the splitted - * node (s012 numbers), and number of bytes (s1bytes) of the - * shared drop which flow to S1 if any - */ - rset = RIGHT_SHIFT_NO_FLOW; - rnver = get_num_ver(vn->vn_mode, tb, h, - 0, -1, - h ? (vn->vn_nr_item - rpar) : (rpar - - ((tb-> - rbytes != - -1) ? 1 : - 0)), -1, - snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); - if (!h) { - int rnver1; - - rnver1 = get_num_ver(vn->vn_mode, tb, h, - 0, -1, - (rpar - - ((tb->rbytes != -1) ? 1 : 0)), - tb->rbytes, - snum012 + RIGHT_SHIFT_FLOW, FLOW); - - if (rnver > rnver1) - rset = RIGHT_SHIFT_FLOW, rnver = rnver1; - } - - /* - * calculate number of blocks S[h] must be split into when - * items are shifted in both directions, as well as number - * of items in each part of the splitted node (s012 numbers), - * and number of bytes (s1bytes) of the shared drop which - * flow to S1 if any - */ - lrset = LR_SHIFT_NO_FLOW; - lrnver = get_num_ver(vn->vn_mode, tb, h, - lpar - ((h || tb->lbytes == -1) ? 0 : 1), - -1, - h ? (vn->vn_nr_item - rpar) : (rpar - - ((tb-> - rbytes != - -1) ? 1 : - 0)), -1, - snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); - if (!h) { - int lrnver1; - - lrnver1 = get_num_ver(vn->vn_mode, tb, h, - lpar - - ((tb->lbytes != -1) ? 1 : 0), - tb->lbytes, - (rpar - - ((tb->rbytes != -1) ? 1 : 0)), - tb->rbytes, - snum012 + LR_SHIFT_FLOW, FLOW); - if (lrnver > lrnver1) - lrset = LR_SHIFT_FLOW, lrnver = lrnver1; - } - - /* - * Our general shifting strategy is: - * 1) to minimized number of new nodes; - * 2) to minimized number of neighbors involved in shifting; - * 3) to minimized number of disk reads; - */ - - /* we can win TWO or ONE nodes by shifting in both directions */ - if (lrnver < lnver && lrnver < rnver) { - RFALSE(h && - (tb->lnum[h] != 1 || - tb->rnum[h] != 1 || - lrnver != 1 || rnver != 2 || lnver != 2 - || h != 1), "vs-8230: bad h"); - if (lrset == LR_SHIFT_FLOW) - set_parameters(tb, h, tb->lnum[h], tb->rnum[h], - lrnver, snum012 + lrset, - tb->lbytes, tb->rbytes); - else - set_parameters(tb, h, - tb->lnum[h] - - ((tb->lbytes == -1) ? 0 : 1), - tb->rnum[h] - - ((tb->rbytes == -1) ? 
0 : 1), - lrnver, snum012 + lrset, -1, -1); - - return CARRY_ON; - } - - /* - * if shifting doesn't lead to better packing - * then don't shift - */ - if (nver == lrnver) { - set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, - -1); - return CARRY_ON; - } - - /* - * now we know that for better packing shifting in only one - * direction either to the left or to the right is required - */ - - /* - * if shifting to the left is better than - * shifting to the right - */ - if (lnver < rnver) { - SET_PAR_SHIFT_LEFT; - return CARRY_ON; - } - - /* - * if shifting to the right is better than - * shifting to the left - */ - if (lnver > rnver) { - SET_PAR_SHIFT_RIGHT; - return CARRY_ON; - } - - /* - * now shifting in either direction gives the same number - * of nodes and we can make use of the cached neighbors - */ - if (is_left_neighbor_in_cache(tb, h)) { - SET_PAR_SHIFT_LEFT; - return CARRY_ON; - } - - /* - * shift to the right independently on whether the - * right neighbor in cache or not - */ - SET_PAR_SHIFT_RIGHT; - return CARRY_ON; - } -} - -/* - * Check whether current node S[h] is balanced when Decreasing its size by - * Deleting or Cutting for INTERNAL node of S+tree. - * Calculate parameters for balancing for current level h. - * Parameters: - * tb tree_balance structure; - * h current level of the node; - * inum item number in S[h]; - * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; - * 0 - balancing for higher levels needed; - * -1 - no balancing for higher levels needed; - * -2 - no disk space. - * - * Note: Items of internal nodes have fixed size, so the balance condition for - * the internal part of S+tree is as for the B-trees. - */ -static int dc_check_balance_internal(struct tree_balance *tb, int h) -{ - struct virtual_node *vn = tb->tb_vn; - - /* - * Sh is the node whose balance is currently being checked, - * and Fh is its father. - */ - struct buffer_head *Sh, *Fh; - int ret; - int lfree, rfree /* free space in L and R */ ; - - Sh = PATH_H_PBUFFER(tb->tb_path, h); - Fh = PATH_H_PPARENT(tb->tb_path, h); - - /* - * using tb->insert_size[h], which is negative in this case, - * create_virtual_node calculates: - * new_nr_item = number of items node would have if operation is - * performed without balancing (new_nr_item); - */ - create_virtual_node(tb, h); - - if (!Fh) { /* S[h] is the root. */ - /* no balancing for higher levels needed */ - if (vn->vn_nr_item > 0) { - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - /* - * new_nr_item == 0. - * Current root will be deleted resulting in - * decrementing the tree height. - */ - set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - if ((ret = get_parents(tb, h)) != CARRY_ON) - return ret; - - /* get free space of neighbors */ - rfree = get_rfree(tb, h); - lfree = get_lfree(tb, h); - - /* determine maximal number of items we can fit into neighbors */ - check_left(tb, h, lfree); - check_right(tb, h, rfree); - - /* - * Balance condition for the internal node is valid. - * In this case we balance only if it leads to better packing. - */ - if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { - /* - * Here we join S[h] with one of its neighbors, - * which is impossible with greater values of new_nr_item. - */ - if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { - /* All contents of S[h] can be moved to L[h]. */ - if (tb->lnum[h] >= vn->vn_nr_item + 1) { - int n; - int order_L; - - order_L = - ((n = - PATH_H_B_ITEM_ORDER(tb->tb_path, - h)) == - 0) ? 
B_NR_ITEMS(tb->FL[h]) : n - 1; - n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / - (DC_SIZE + KEY_SIZE); - set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, - -1); - return CARRY_ON; - } - - /* All contents of S[h] can be moved to R[h]. */ - if (tb->rnum[h] >= vn->vn_nr_item + 1) { - int n; - int order_R; - - order_R = - ((n = - PATH_H_B_ITEM_ORDER(tb->tb_path, - h)) == - B_NR_ITEMS(Fh)) ? 0 : n + 1; - n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / - (DC_SIZE + KEY_SIZE); - set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, - -1); - return CARRY_ON; - } - } - - /* - * All contents of S[h] can be moved to the neighbors - * (L[h] & R[h]). - */ - if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { - int to_r; - - to_r = - ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); - set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, - 0, NULL, -1, -1); - return CARRY_ON; - } - - /* Balancing does not lead to better packing. */ - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - - /* - * Current node contain insufficient number of items. - * Balancing is required. - */ - /* Check whether we can merge S[h] with left neighbor. */ - if (tb->lnum[h] >= vn->vn_nr_item + 1) - if (is_left_neighbor_in_cache(tb, h) - || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { - int n; - int order_L; - - order_L = - ((n = - PATH_H_B_ITEM_ORDER(tb->tb_path, - h)) == - 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; - n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + - KEY_SIZE); - set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* Check whether we can merge S[h] with right neighbor. */ - if (tb->rnum[h] >= vn->vn_nr_item + 1) { - int n; - int order_R; - - order_R = - ((n = - PATH_H_B_ITEM_ORDER(tb->tb_path, - h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1); - n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + - KEY_SIZE); - set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ - if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { - int to_r; - - to_r = - ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + - vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - - tb->rnum[h]); - set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, - -1, -1); - return CARRY_ON; - } - - /* For internal nodes try to borrow item from a neighbor */ - RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); - - /* Borrow one or two items from caching neighbor */ - if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { - int from_l; - - from_l = - (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + - 1) / 2 - (vn->vn_nr_item + 1); - set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); - return CARRY_ON; - } - - set_parameters(tb, h, 0, - -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + - 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); - return CARRY_ON; -} - -/* - * Check whether current node S[h] is balanced when Decreasing its size by - * Deleting or Truncating for LEAF node of S+tree. - * Calculate parameters for balancing for current level h. - * Parameters: - * tb tree_balance structure; - * h current level of the node; - * inum item number in S[h]; - * mode i - insert, p - paste; - * Returns: 1 - schedule occurred; - * 0 - balancing for higher levels needed; - * -1 - no balancing for higher levels needed; - * -2 - no disk space. 
- */ -static int dc_check_balance_leaf(struct tree_balance *tb, int h) -{ - struct virtual_node *vn = tb->tb_vn; - - /* - * Number of bytes that must be deleted from - * (value is negative if bytes are deleted) buffer which - * contains node being balanced. The mnemonic is that the - * attempted change in node space used level is levbytes bytes. - */ - int levbytes; - - /* the maximal item size */ - int maxsize, ret; - - /* - * S0 is the node whose balance is currently being checked, - * and F0 is its father. - */ - struct buffer_head *S0, *F0; - int lfree, rfree /* free space in L and R */ ; - - S0 = PATH_H_PBUFFER(tb->tb_path, 0); - F0 = PATH_H_PPARENT(tb->tb_path, 0); - - levbytes = tb->insert_size[h]; - - maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ - - if (!F0) { /* S[0] is the root now. */ - - RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), - "vs-8240: attempt to create empty buffer tree"); - - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - - if ((ret = get_parents(tb, h)) != CARRY_ON) - return ret; - - /* get free space of neighbors */ - rfree = get_rfree(tb, h); - lfree = get_lfree(tb, h); - - create_virtual_node(tb, h); - - /* if 3 leaves can be merge to one, set parameters and return */ - if (are_leaves_removable(tb, lfree, rfree)) - return CARRY_ON; - - /* - * determine maximal number of items we can shift to the left/right - * neighbor and the maximal number of bytes that can flow to the - * left/right neighbor from the left/right most liquid item that - * cannot be shifted from S[0] entirely - */ - check_left(tb, h, lfree); - check_right(tb, h, rfree); - - /* check whether we can merge S with left neighbor. */ - if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) - if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ - !tb->FR[h]) { - - RFALSE(!tb->FL[h], - "vs-8245: dc_check_balance_leaf: FL[h] must exist"); - - /* set parameter to merge S[0] with its left neighbor */ - set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* check whether we can merge S[0] with right neighbor. */ - if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { - set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* - * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). - * Set parameters and return - */ - if (is_leaf_removable(tb)) - return CARRY_ON; - - /* Balancing is not required. */ - tb->s0num = vn->vn_nr_item; - set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; -} - -/* - * Check whether current node S[h] is balanced when Decreasing its size by - * Deleting or Cutting. - * Calculate parameters for balancing for current level h. - * Parameters: - * tb tree_balance structure; - * h current level of the node; - * inum item number in S[h]; - * mode d - delete, c - cut. - * Returns: 1 - schedule occurred; - * 0 - balancing for higher levels needed; - * -1 - no balancing for higher levels needed; - * -2 - no disk space. - */ -static int dc_check_balance(struct tree_balance *tb, int h) -{ - RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), - "vs-8250: S is not initialized"); - - if (h) - return dc_check_balance_internal(tb, h); - else - return dc_check_balance_leaf(tb, h); -} - -/* - * Check whether current node S[h] is balanced. - * Calculate parameters for balancing for current level h. 
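dc_check_balance(), above, together with check_balance() that follows, is a two-step dispatch: the sign of insert_size picks the growing or shrinking analysis, and for shrinking the level picks the internal or leaf variant. A trivial stand-alone sketch of that dispatch (string labels only, not the kernel calls):

#include <stdio.h>

enum level_kind { LEAF = 0, INTERNAL };

static const char *pick_analysis(int insert_size, enum level_kind kind)
{
	if (insert_size > 0)
		return "ip_check_balance";              /* insert / paste */
	return kind == INTERNAL ? "dc_check_balance_internal"
				: "dc_check_balance_leaf"; /* delete / cut */
}

int main(void)
{
	printf("%s\n", pick_analysis(+48, LEAF));     /* growing a leaf     */
	printf("%s\n", pick_analysis(-16, INTERNAL)); /* shrinking level 1  */
	return 0;
}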
- * Parameters: - * - * tb tree_balance structure: - * - * tb is a large structure that must be read about in the header - * file at the same time as this procedure if the reader is - * to successfully understand this procedure - * - * h current level of the node; - * inum item number in S[h]; - * mode i - insert, p - paste, d - delete, c - cut. - * Returns: 1 - schedule occurred; - * 0 - balancing for higher levels needed; - * -1 - no balancing for higher levels needed; - * -2 - no disk space. - */ -static int check_balance(int mode, - struct tree_balance *tb, - int h, - int inum, - int pos_in_item, - struct item_head *ins_ih, const void *data) -{ - struct virtual_node *vn; - - vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); - vn->vn_free_ptr = (char *)(tb->tb_vn + 1); - vn->vn_mode = mode; - vn->vn_affected_item_num = inum; - vn->vn_pos_in_item = pos_in_item; - vn->vn_ins_ih = ins_ih; - vn->vn_data = data; - - RFALSE(mode == M_INSERT && !vn->vn_ins_ih, - "vs-8255: ins_ih can not be 0 in insert mode"); - - /* Calculate balance parameters when size of node is increasing. */ - if (tb->insert_size[h] > 0) - return ip_check_balance(tb, h); - - /* Calculate balance parameters when size of node is decreasing. */ - return dc_check_balance(tb, h); -} - -/* Check whether parent at the path is the really parent of the current node.*/ -static int get_direct_parent(struct tree_balance *tb, int h) -{ - struct buffer_head *bh; - struct treepath *path = tb->tb_path; - int position, - path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); - - /* We are in the root or in the new root. */ - if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { - - RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, - "PAP-8260: invalid offset in the path"); - - if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { - /* Root is not changed. */ - PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL; - PATH_OFFSET_POSITION(path, path_offset - 1) = 0; - return CARRY_ON; - } - /* Root is changed and we must recalculate the path. */ - return REPEAT_SEARCH; - } - - /* Parent in the path is not in the tree. */ - if (!B_IS_IN_TREE - (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) - return REPEAT_SEARCH; - - if ((position = - PATH_OFFSET_POSITION(path, - path_offset - 1)) > B_NR_ITEMS(bh)) - return REPEAT_SEARCH; - - /* Parent in the path is not parent of the current node in the tree. */ - if (B_N_CHILD_NUM(bh, position) != - PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) - return REPEAT_SEARCH; - - if (buffer_locked(bh)) { - int depth = reiserfs_write_unlock_nested(tb->tb_sb); - __wait_on_buffer(bh); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (FILESYSTEM_CHANGED_TB(tb)) - return REPEAT_SEARCH; - } - - /* - * Parent in the path is unlocked and really parent - * of the current node. - */ - return CARRY_ON; -} - -/* - * Using lnum[h] and rnum[h] we should determine what neighbors - * of S[h] we - * need in order to balance S[h], and get them if necessary. - * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; - */ -static int get_neighbors(struct tree_balance *tb, int h) -{ - int child_position, - path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1); - unsigned long son_number; - struct super_block *sb = tb->tb_sb; - struct buffer_head *bh; - int depth; - - PROC_INFO_INC(sb, get_neighbors[h]); - - if (tb->lnum[h]) { - /* We need left neighbor to balance S[h]. 
*/ - PROC_INFO_INC(sb, need_l_neighbor[h]); - bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); - - RFALSE(bh == tb->FL[h] && - !PATH_OFFSET_POSITION(tb->tb_path, path_offset), - "PAP-8270: invalid position in the parent"); - - child_position = - (bh == - tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> - FL[h]); - son_number = B_N_CHILD_NUM(tb->FL[h], child_position); - depth = reiserfs_write_unlock_nested(tb->tb_sb); - bh = sb_bread(sb, son_number); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (!bh) - return IO_ERROR; - if (FILESYSTEM_CHANGED_TB(tb)) { - brelse(bh); - PROC_INFO_INC(sb, get_neighbors_restart[h]); - return REPEAT_SEARCH; - } - - RFALSE(!B_IS_IN_TREE(tb->FL[h]) || - child_position > B_NR_ITEMS(tb->FL[h]) || - B_N_CHILD_NUM(tb->FL[h], child_position) != - bh->b_blocknr, "PAP-8275: invalid parent"); - RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); - RFALSE(!h && - B_FREE_SPACE(bh) != - MAX_CHILD_SIZE(bh) - - dc_size(B_N_CHILD(tb->FL[0], child_position)), - "PAP-8290: invalid child size of left neighbor"); - - brelse(tb->L[h]); - tb->L[h] = bh; - } - - /* We need right neighbor to balance S[path_offset]. */ - if (tb->rnum[h]) { - PROC_INFO_INC(sb, need_r_neighbor[h]); - bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); - - RFALSE(bh == tb->FR[h] && - PATH_OFFSET_POSITION(tb->tb_path, - path_offset) >= - B_NR_ITEMS(bh), - "PAP-8295: invalid position in the parent"); - - child_position = - (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; - son_number = B_N_CHILD_NUM(tb->FR[h], child_position); - depth = reiserfs_write_unlock_nested(tb->tb_sb); - bh = sb_bread(sb, son_number); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (!bh) - return IO_ERROR; - if (FILESYSTEM_CHANGED_TB(tb)) { - brelse(bh); - PROC_INFO_INC(sb, get_neighbors_restart[h]); - return REPEAT_SEARCH; - } - brelse(tb->R[h]); - tb->R[h] = bh; - - RFALSE(!h - && B_FREE_SPACE(bh) != - MAX_CHILD_SIZE(bh) - - dc_size(B_N_CHILD(tb->FR[0], child_position)), - "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", - B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), - dc_size(B_N_CHILD(tb->FR[0], child_position))); - - } - return CARRY_ON; -} - -static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) -{ - int max_num_of_items; - int max_num_of_entries; - unsigned long blocksize = sb->s_blocksize; - -#define MIN_NAME_LEN 1 - - max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); - max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / - (DEH_SIZE + MIN_NAME_LEN); - - return sizeof(struct virtual_node) + - max(max_num_of_items * sizeof(struct virtual_item), - sizeof(struct virtual_item) + - struct_size_t(struct direntry_uarea, entry_sizes, - max_num_of_entries)); -} - -/* - * maybe we should fail balancing we are going to perform when kmalloc - * fails several times. 
But now it will loop until kmalloc gets - * required memory - */ -static int get_mem_for_virtual_node(struct tree_balance *tb) -{ - int check_fs = 0; - int size; - char *buf; - - size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); - - /* we have to allocate more memory for virtual node */ - if (size > tb->vn_buf_size) { - if (tb->vn_buf) { - /* free memory allocated before */ - kfree(tb->vn_buf); - /* this is not needed if kfree is atomic */ - check_fs = 1; - } - - /* virtual node requires now more memory */ - tb->vn_buf_size = size; - - /* get memory for virtual item */ - buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); - if (!buf) { - /* - * getting memory with GFP_KERNEL priority may involve - * balancing now (due to indirect_to_direct conversion - * on dcache shrinking). So, release path and collected - * resources here - */ - free_buffers_in_tb(tb); - buf = kmalloc(size, GFP_NOFS); - if (!buf) { - tb->vn_buf_size = 0; - } - tb->vn_buf = buf; - schedule(); - return REPEAT_SEARCH; - } - - tb->vn_buf = buf; - } - - if (check_fs && FILESYSTEM_CHANGED_TB(tb)) - return REPEAT_SEARCH; - - return CARRY_ON; -} - -#ifdef CONFIG_REISERFS_CHECK -static void tb_buffer_sanity_check(struct super_block *sb, - struct buffer_head *bh, - const char *descr, int level) -{ - if (bh) { - if (atomic_read(&(bh->b_count)) <= 0) - - reiserfs_panic(sb, "jmacd-1", "negative or zero " - "reference counter for buffer %s[%d] " - "(%b)", descr, level, bh); - - if (!buffer_uptodate(bh)) - reiserfs_panic(sb, "jmacd-2", "buffer is not up " - "to date %s[%d] (%b)", - descr, level, bh); - - if (!B_IS_IN_TREE(bh)) - reiserfs_panic(sb, "jmacd-3", "buffer is not " - "in tree %s[%d] (%b)", - descr, level, bh); - - if (bh->b_bdev != sb->s_bdev) - reiserfs_panic(sb, "jmacd-4", "buffer has wrong " - "device %s[%d] (%b)", - descr, level, bh); - - if (bh->b_size != sb->s_blocksize) - reiserfs_panic(sb, "jmacd-5", "buffer has wrong " - "blocksize %s[%d] (%b)", - descr, level, bh); - - if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) - reiserfs_panic(sb, "jmacd-6", "buffer block " - "number too high %s[%d] (%b)", - descr, level, bh); - } -} -#else -static void tb_buffer_sanity_check(struct super_block *sb, - struct buffer_head *bh, - const char *descr, int level) -{; -} -#endif - -static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) -{ - return reiserfs_prepare_for_journal(s, bh, 0); -} - -static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) -{ - struct buffer_head *locked; -#ifdef CONFIG_REISERFS_CHECK - int repeat_counter = 0; -#endif - int i; - - do { - - locked = NULL; - - for (i = tb->tb_path->path_length; - !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { - if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { - /* - * if I understand correctly, we can only - * be sure the last buffer in the path is - * in the tree --clm - */ -#ifdef CONFIG_REISERFS_CHECK - if (PATH_PLAST_BUFFER(tb->tb_path) == - PATH_OFFSET_PBUFFER(tb->tb_path, i)) - tb_buffer_sanity_check(tb->tb_sb, - PATH_OFFSET_PBUFFER - (tb->tb_path, - i), "S", - tb->tb_path-> - path_length - i); -#endif - if (!clear_all_dirty_bits(tb->tb_sb, - PATH_OFFSET_PBUFFER - (tb->tb_path, - i))) { - locked = - PATH_OFFSET_PBUFFER(tb->tb_path, - i); - } - } - } - - for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; - i++) { - - if (tb->lnum[i]) { - - if (tb->L[i]) { - tb_buffer_sanity_check(tb->tb_sb, - tb->L[i], - "L", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->L[i])) - locked = tb->L[i]; - } - - if (!locked && tb->FL[i]) 
{ - tb_buffer_sanity_check(tb->tb_sb, - tb->FL[i], - "FL", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->FL[i])) - locked = tb->FL[i]; - } - - if (!locked && tb->CFL[i]) { - tb_buffer_sanity_check(tb->tb_sb, - tb->CFL[i], - "CFL", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->CFL[i])) - locked = tb->CFL[i]; - } - - } - - if (!locked && (tb->rnum[i])) { - - if (tb->R[i]) { - tb_buffer_sanity_check(tb->tb_sb, - tb->R[i], - "R", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->R[i])) - locked = tb->R[i]; - } - - if (!locked && tb->FR[i]) { - tb_buffer_sanity_check(tb->tb_sb, - tb->FR[i], - "FR", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->FR[i])) - locked = tb->FR[i]; - } - - if (!locked && tb->CFR[i]) { - tb_buffer_sanity_check(tb->tb_sb, - tb->CFR[i], - "CFR", i); - if (!clear_all_dirty_bits - (tb->tb_sb, tb->CFR[i])) - locked = tb->CFR[i]; - } - } - } - - /* - * as far as I can tell, this is not required. The FEB list - * seems to be full of newly allocated nodes, which will - * never be locked, dirty, or anything else. - * To be safe, I'm putting in the checks and waits in. - * For the moment, they are needed to keep the code in - * journal.c from complaining about the buffer. - * That code is inside CONFIG_REISERFS_CHECK as well. --clm - */ - for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { - if (tb->FEB[i]) { - if (!clear_all_dirty_bits - (tb->tb_sb, tb->FEB[i])) - locked = tb->FEB[i]; - } - } - - if (locked) { - int depth; -#ifdef CONFIG_REISERFS_CHECK - repeat_counter++; - if ((repeat_counter % 10000) == 0) { - reiserfs_warning(tb->tb_sb, "reiserfs-8200", - "too many iterations waiting " - "for buffer to unlock " - "(%b)", locked); - - /* Don't loop forever. Try to recover from possible error. */ - - return (FILESYSTEM_CHANGED_TB(tb)) ? - REPEAT_SEARCH : CARRY_ON; - } -#endif - depth = reiserfs_write_unlock_nested(tb->tb_sb); - __wait_on_buffer(locked); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (FILESYSTEM_CHANGED_TB(tb)) - return REPEAT_SEARCH; - } - - } while (locked); - - return CARRY_ON; -} - -/* - * Prepare for balancing, that is - * get all necessary parents, and neighbors; - * analyze what and where should be moved; - * get sufficient number of new nodes; - * Balancing will start only after all resources will be collected at a time. - * - * When ported to SMP kernels, only at the last moment after all needed nodes - * are collected in cache, will the resources be locked using the usual - * textbook ordered lock acquisition algorithms. Note that ensuring that - * this code neither write locks what it does not need to write lock nor locks - * out of order will be a pain in the butt that could have been avoided. - * Grumble grumble. -Hans - * - * fix is meant in the sense of render unchanging - * - * Latency might be improved by first gathering a list of what buffers - * are needed and then getting as many of them in parallel as possible? 
-Hans - * - * Parameters: - * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) - * tb tree_balance structure; - * inum item number in S[h]; - * pos_in_item - comment this if you can - * ins_ih item head of item being inserted - * data inserted item or data to be pasted - * Returns: 1 - schedule occurred while the function worked; - * 0 - schedule didn't occur while the function worked; - * -1 - if no_disk_space - */ - -int fix_nodes(int op_mode, struct tree_balance *tb, - struct item_head *ins_ih, const void *data) -{ - int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); - int pos_in_item; - - /* - * we set wait_tb_buffers_run when we have to restore any dirty - * bits cleared during wait_tb_buffers_run - */ - int wait_tb_buffers_run = 0; - struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - - ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; - - pos_in_item = tb->tb_path->pos_in_item; - - tb->fs_gen = get_generation(tb->tb_sb); - - /* - * we prepare and log the super here so it will already be in the - * transaction when do_balance needs to change it. - * This way do_balance won't have to schedule when trying to prepare - * the super for logging - */ - reiserfs_prepare_for_journal(tb->tb_sb, - SB_BUFFER_WITH_SB(tb->tb_sb), 1); - journal_mark_dirty(tb->transaction_handle, - SB_BUFFER_WITH_SB(tb->tb_sb)); - if (FILESYSTEM_CHANGED_TB(tb)) - return REPEAT_SEARCH; - - /* if it possible in indirect_to_direct conversion */ - if (buffer_locked(tbS0)) { - int depth = reiserfs_write_unlock_nested(tb->tb_sb); - __wait_on_buffer(tbS0); - reiserfs_write_lock_nested(tb->tb_sb, depth); - if (FILESYSTEM_CHANGED_TB(tb)) - return REPEAT_SEARCH; - } -#ifdef CONFIG_REISERFS_CHECK - if (REISERFS_SB(tb->tb_sb)->cur_tb) { - print_cur_tb("fix_nodes"); - reiserfs_panic(tb->tb_sb, "PAP-8305", - "there is pending do_balance"); - } - - if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) - reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " - "not uptodate at the beginning of fix_nodes " - "or not in tree (mode %c)", - tbS0, tbS0, op_mode); - - /* Check parameters. */ - switch (op_mode) { - case M_INSERT: - if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) - reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " - "item number %d (in S0 - %d) in case " - "of insert", item_num, - B_NR_ITEMS(tbS0)); - break; - case M_PASTE: - case M_DELETE: - case M_CUT: - if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { - print_block(tbS0, 0, -1, -1); - reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " - "item number(%d); mode = %c " - "insert_size = %d", - item_num, op_mode, - tb->insert_size[0]); - } - break; - default: - reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " - "of operation"); - } -#endif - - if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) - /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ - return REPEAT_SEARCH; - - /* Starting from the leaf level; for all levels h of the tree. */ - for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { - ret = get_direct_parent(tb, h); - if (ret != CARRY_ON) - goto repeat; - - ret = check_balance(op_mode, tb, h, item_num, - pos_in_item, ins_ih, data); - if (ret != CARRY_ON) { - if (ret == NO_BALANCING_NEEDED) { - /* No balancing for higher levels needed. 
*/ - ret = get_neighbors(tb, h); - if (ret != CARRY_ON) - goto repeat; - if (h != MAX_HEIGHT - 1) - tb->insert_size[h + 1] = 0; - /* - * ok, analysis and resource gathering - * are complete - */ - break; - } - goto repeat; - } - - ret = get_neighbors(tb, h); - if (ret != CARRY_ON) - goto repeat; - - /* - * No disk space, or schedule occurred and analysis may be - * invalid and needs to be redone. - */ - ret = get_empty_nodes(tb, h); - if (ret != CARRY_ON) - goto repeat; - - /* - * We have a positive insert size but no nodes exist on this - * level, this means that we are creating a new root. - */ - if (!PATH_H_PBUFFER(tb->tb_path, h)) { - - RFALSE(tb->blknum[h] != 1, - "PAP-8350: creating new empty root"); - - if (h < MAX_HEIGHT - 1) - tb->insert_size[h + 1] = 0; - } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { - /* - * The tree needs to be grown, so this node S[h] - * which is the root node is split into two nodes, - * and a new node (S[h+1]) will be created to - * become the root node. - */ - if (tb->blknum[h] > 1) { - - RFALSE(h == MAX_HEIGHT - 1, - "PAP-8355: attempt to create too high of a tree"); - - tb->insert_size[h + 1] = - (DC_SIZE + - KEY_SIZE) * (tb->blknum[h] - 1) + - DC_SIZE; - } else if (h < MAX_HEIGHT - 1) - tb->insert_size[h + 1] = 0; - } else - tb->insert_size[h + 1] = - (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); - } - - ret = wait_tb_buffers_until_unlocked(tb); - if (ret == CARRY_ON) { - if (FILESYSTEM_CHANGED_TB(tb)) { - wait_tb_buffers_run = 1; - ret = REPEAT_SEARCH; - goto repeat; - } else { - return CARRY_ON; - } - } else { - wait_tb_buffers_run = 1; - goto repeat; - } - -repeat: - /* - * fix_nodes was unable to perform its calculation due to - * filesystem got changed under us, lack of free disk space or i/o - * failure. If the first is the case - the search will be - * repeated. For now - free all resources acquired so far except - * for the new allocated nodes - */ - { - int i; - - /* Release path buffers. */ - if (wait_tb_buffers_run) { - pathrelse_and_restore(tb->tb_sb, tb->tb_path); - } else { - pathrelse(tb->tb_path); - } - /* brelse all resources collected for balancing */ - for (i = 0; i < MAX_HEIGHT; i++) { - if (wait_tb_buffers_run) { - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb->L[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb->R[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb->FL[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb->FR[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb-> - CFL[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, - tb-> - CFR[i]); - } - - brelse(tb->L[i]); - brelse(tb->R[i]); - brelse(tb->FL[i]); - brelse(tb->FR[i]); - brelse(tb->CFL[i]); - brelse(tb->CFR[i]); - - tb->L[i] = NULL; - tb->R[i] = NULL; - tb->FL[i] = NULL; - tb->FR[i] = NULL; - tb->CFL[i] = NULL; - tb->CFR[i] = NULL; - } - - if (wait_tb_buffers_run) { - for (i = 0; i < MAX_FEB_SIZE; i++) { - if (tb->FEB[i]) - reiserfs_restore_prepared_buffer - (tb->tb_sb, tb->FEB[i]); - } - } - return ret; - } - -} - -void unfix_nodes(struct tree_balance *tb) -{ - int i; - - /* Release path buffers. 
*/ - pathrelse_and_restore(tb->tb_sb, tb->tb_path); - - /* brelse all resources collected for balancing */ - for (i = 0; i < MAX_HEIGHT; i++) { - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); - reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); - - brelse(tb->L[i]); - brelse(tb->R[i]); - brelse(tb->FL[i]); - brelse(tb->FR[i]); - brelse(tb->CFL[i]); - brelse(tb->CFR[i]); - } - - /* deal with list of allocated (used and unused) nodes */ - for (i = 0; i < MAX_FEB_SIZE; i++) { - if (tb->FEB[i]) { - b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; - /* - * de-allocated block which was not used by - * balancing and bforget about buffer for it - */ - brelse(tb->FEB[i]); - reiserfs_free_block(tb->transaction_handle, NULL, - blocknr, 0); - } - if (tb->used[i]) { - /* release used as new nodes including a new root */ - brelse(tb->used[i]); - } - } - - kfree(tb->vn_buf); - -} diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c deleted file mode 100644 index 7a26c4fe6c46..000000000000 --- a/fs/reiserfs/hashes.c +++ /dev/null @@ -1,177 +0,0 @@ - -/* - * Keyed 32-bit hash function using TEA in a Davis-Meyer function - * H0 = Key - * Hi = E Mi(Hi-1) + Hi-1 - * - * (see Applied Cryptography, 2nd edition, p448). - * - * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998 - * - * Jeremy has agreed to the contents of reiserfs/README. -Hans - * Yura's function is added (04/07/2000) - */ - -#include <linux/kernel.h> -#include "reiserfs.h" -#include <asm/types.h> - -#define DELTA 0x9E3779B9 -#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ -#define PARTROUNDS 6 /* 6 gets complete mixing */ - -/* a, b, c, d - data; h0, h1 - accumulated hash */ -#define TEACORE(rounds) \ - do { \ - u32 sum = 0; \ - int n = rounds; \ - u32 b0, b1; \ - \ - b0 = h0; \ - b1 = h1; \ - \ - do \ - { \ - sum += DELTA; \ - b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ - b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ - } while(--n); \ - \ - h0 += b0; \ - h1 += b1; \ - } while(0) - -u32 keyed_hash(const signed char *msg, int len) -{ - u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; - - u32 h0 = k[0], h1 = k[1]; - u32 a, b, c, d; - u32 pad; - int i; - - /* assert(len >= 0 && len < 256); */ - - pad = (u32) len | ((u32) len << 8); - pad |= pad << 16; - - while (len >= 16) { - a = (u32) msg[0] | - (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; - b = (u32) msg[4] | - (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; - c = (u32) msg[8] | - (u32) msg[9] << 8 | - (u32) msg[10] << 16 | (u32) msg[11] << 24; - d = (u32) msg[12] | - (u32) msg[13] << 8 | - (u32) msg[14] << 16 | (u32) msg[15] << 24; - - TEACORE(PARTROUNDS); - - len -= 16; - msg += 16; - } - - if (len >= 12) { - a = (u32) msg[0] | - (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; - b = (u32) msg[4] | - (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; - c = (u32) msg[8] | - (u32) msg[9] << 8 | - (u32) msg[10] << 16 | (u32) msg[11] << 24; - - d = pad; - for (i = 12; i < len; i++) { - d <<= 8; - d |= msg[i]; - } - } else if (len >= 8) { - a = (u32) msg[0] | - (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; - b = (u32) msg[4] | - (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; - - c = d = pad; - for (i = 8; i < len; i++) { - c <<= 8; - c |= 
msg[i]; - } - } else if (len >= 4) { - a = (u32) msg[0] | - (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; - - b = c = d = pad; - for (i = 4; i < len; i++) { - b <<= 8; - b |= msg[i]; - } - } else { - a = b = c = d = pad; - for (i = 0; i < len; i++) { - a <<= 8; - a |= msg[i]; - } - } - - TEACORE(FULLROUNDS); - -/* return 0;*/ - return h0 ^ h1; -} - -/* - * What follows in this file is copyright 2000 by Hans Reiser, and the - * licensing of what follows is governed by reiserfs/README - */ -u32 yura_hash(const signed char *msg, int len) -{ - int j, pow; - u32 a, c; - int i; - - for (pow = 1, i = 1; i < len; i++) - pow = pow * 10; - - if (len == 1) - a = msg[0] - 48; - else - a = (msg[0] - 48) * pow; - - for (i = 1; i < len; i++) { - c = msg[i] - 48; - for (pow = 1, j = i; j < len - 1; j++) - pow = pow * 10; - a = a + c * pow; - } - - for (; i < 40; i++) { - c = '0' - 48; - for (pow = 1, j = i; j < len - 1; j++) - pow = pow * 10; - a = a + c * pow; - } - - for (; i < 256; i++) { - c = i; - for (pow = 1, j = i; j < len - 1; j++) - pow = pow * 10; - a = a + c * pow; - } - - a = a << 7; - return a; -} - -u32 r5_hash(const signed char *msg, int len) -{ - u32 a = 0; - while (*msg) { - a += *msg << 4; - a += *msg >> 4; - a *= 11; - msg++; - } - return a; -} diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c deleted file mode 100644 index 5db6f45b3fed..000000000000 --- a/fs/reiserfs/ibalance.c +++ /dev/null @@ -1,1161 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/uaccess.h> -#include <linux/string.h> -#include <linux/time.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> - -/* this is one and only function that is used outside (do_balance.c) */ -int balance_internal(struct tree_balance *, - int, int, struct item_head *, struct buffer_head **); - -/* - * modes of internal_shift_left, internal_shift_right and - * internal_insert_childs - */ -#define INTERNAL_SHIFT_FROM_S_TO_L 0 -#define INTERNAL_SHIFT_FROM_R_TO_S 1 -#define INTERNAL_SHIFT_FROM_L_TO_S 2 -#define INTERNAL_SHIFT_FROM_S_TO_R 3 -#define INTERNAL_INSERT_TO_S 4 -#define INTERNAL_INSERT_TO_L 5 -#define INTERNAL_INSERT_TO_R 6 - -static void internal_define_dest_src_infos(int shift_mode, - struct tree_balance *tb, - int h, - struct buffer_info *dest_bi, - struct buffer_info *src_bi, - int *d_key, struct buffer_head **cf) -{ - memset(dest_bi, 0, sizeof(struct buffer_info)); - memset(src_bi, 0, sizeof(struct buffer_info)); - /* define dest, src, dest parent, dest position */ - switch (shift_mode) { - - /* used in internal_shift_left */ - case INTERNAL_SHIFT_FROM_S_TO_L: - src_bi->tb = tb; - src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); - src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[h]; - dest_bi->bi_parent = tb->FL[h]; - dest_bi->bi_position = get_left_neighbor_position(tb, h); - *d_key = tb->lkey[h]; - *cf = tb->CFL[h]; - break; - case INTERNAL_SHIFT_FROM_L_TO_S: - src_bi->tb = tb; - src_bi->bi_bh = tb->L[h]; - src_bi->bi_parent = tb->FL[h]; - src_bi->bi_position = get_left_neighbor_position(tb, h); - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - /* dest position is analog of dest->b_item_order */ - dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - *d_key = tb->lkey[h]; - *cf = tb->CFL[h]; - break; - - /* used in internal_shift_left */ - case 
INTERNAL_SHIFT_FROM_R_TO_S: - src_bi->tb = tb; - src_bi->bi_bh = tb->R[h]; - src_bi->bi_parent = tb->FR[h]; - src_bi->bi_position = get_right_neighbor_position(tb, h); - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - *d_key = tb->rkey[h]; - *cf = tb->CFR[h]; - break; - - case INTERNAL_SHIFT_FROM_S_TO_R: - src_bi->tb = tb; - src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); - src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[h]; - dest_bi->bi_parent = tb->FR[h]; - dest_bi->bi_position = get_right_neighbor_position(tb, h); - *d_key = tb->rkey[h]; - *cf = tb->CFR[h]; - break; - - case INTERNAL_INSERT_TO_L: - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[h]; - dest_bi->bi_parent = tb->FL[h]; - dest_bi->bi_position = get_left_neighbor_position(tb, h); - break; - - case INTERNAL_INSERT_TO_S: - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - break; - - case INTERNAL_INSERT_TO_R: - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[h]; - dest_bi->bi_parent = tb->FR[h]; - dest_bi->bi_position = get_right_neighbor_position(tb, h); - break; - - default: - reiserfs_panic(tb->tb_sb, "ibalance-1", - "shift type is unknown (%d)", - shift_mode); - } -} - -/* - * Insert count node pointers into buffer cur before position to + 1. - * Insert count items into buffer cur before position to. - * Items and node pointers are specified by inserted and bh respectively. - */ -static void internal_insert_childs(struct buffer_info *cur_bi, - int to, int count, - struct item_head *inserted, - struct buffer_head **bh) -{ - struct buffer_head *cur = cur_bi->bi_bh; - struct block_head *blkh; - int nr; - struct reiserfs_key *ih; - struct disk_child new_dc[2]; - struct disk_child *dc; - int i; - - if (count <= 0) - return; - - blkh = B_BLK_HEAD(cur); - nr = blkh_nr_item(blkh); - - RFALSE(count > 2, "too many children (%d) are to be inserted", count); - RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), - "no enough free space (%d), needed %d bytes", - B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); - - /* prepare space for count disk_child */ - dc = B_N_CHILD(cur, to + 1); - - memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); - - /* copy to_be_insert disk children */ - for (i = 0; i < count; i++) { - put_dc_size(&new_dc[i], - MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); - put_dc_block_number(&new_dc[i], bh[i]->b_blocknr); - } - memcpy(dc, new_dc, DC_SIZE * count); - - /* prepare space for count items */ - ih = internal_key(cur, ((to == -1) ? 
0 : to)); - - memmove(ih + count, ih, - (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); - - /* copy item headers (keys) */ - memcpy(ih, inserted, KEY_SIZE); - if (count > 1) - memcpy(ih + 1, inserted + 1, KEY_SIZE); - - /* sizes, item number */ - set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); - set_blkh_free_space(blkh, - blkh_free_space(blkh) - count * (DC_SIZE + - KEY_SIZE)); - - do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(cur); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - - if (cur_bi->bi_parent) { - struct disk_child *t_dc = - B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); - put_dc_size(t_dc, - dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); - do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, - 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(cur_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - } - -} - -/* - * Delete del_num items and node pointers from buffer cur starting from - * the first_i'th item and first_p'th pointers respectively. - */ -static void internal_delete_pointers_items(struct buffer_info *cur_bi, - int first_p, - int first_i, int del_num) -{ - struct buffer_head *cur = cur_bi->bi_bh; - int nr; - struct block_head *blkh; - struct reiserfs_key *key; - struct disk_child *dc; - - RFALSE(cur == NULL, "buffer is 0"); - RFALSE(del_num < 0, - "negative number of items (%d) can not be deleted", del_num); - RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 - || first_i < 0, - "first pointer order (%d) < 0 or " - "no so many pointers (%d), only (%d) or " - "first key order %d < 0", first_p, first_p + del_num, - B_NR_ITEMS(cur) + 1, first_i); - if (del_num == 0) - return; - - blkh = B_BLK_HEAD(cur); - nr = blkh_nr_item(blkh); - - if (first_p == 0 && del_num == nr + 1) { - RFALSE(first_i != 0, - "1st deleted key must have order 0, not %d", first_i); - make_empty_node(cur_bi); - return; - } - - RFALSE(first_i + del_num > B_NR_ITEMS(cur), - "first_i = %d del_num = %d " - "no so many keys (%d) in the node (%b)(%z)", - first_i, del_num, first_i + del_num, cur, cur); - - /* deleting */ - dc = B_N_CHILD(cur, first_p); - - memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); - key = internal_key(cur, first_i); - memmove(key, key + del_num, - (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - - del_num) * DC_SIZE); - - /* sizes, item number */ - set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); - set_blkh_free_space(blkh, - blkh_free_space(blkh) + - (del_num * (KEY_SIZE + DC_SIZE))); - - do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); - /*&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(cur); - /*&&&&&&&&&&&&&&&&&&&&&&& */ - - if (cur_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); - put_dc_size(t_dc, - dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); - - do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, - 0); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(cur_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - } -} - -/* delete n node pointers and items starting from given position */ -static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) -{ - int i_from; - - i_from = (from == 0) ? 
from : from - 1; - - /* - * delete n pointers starting from `from' position in CUR; - * delete n keys starting from 'i_from' position in CUR; - */ - internal_delete_pointers_items(cur_bi, from, i_from, n); -} - -/* - * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer - * dest - * last_first == FIRST_TO_LAST means that we copy first items - * from src to tail of dest - * last_first == LAST_TO_FIRST means that we copy last items - * from src to head of dest - */ -static void internal_copy_pointers_items(struct buffer_info *dest_bi, - struct buffer_head *src, - int last_first, int cpy_num) -{ - /* - * ATTENTION! Number of node pointers in DEST is equal to number - * of items in DEST as delimiting key have already inserted to - * buffer dest. - */ - struct buffer_head *dest = dest_bi->bi_bh; - int nr_dest, nr_src; - int dest_order, src_order; - struct block_head *blkh; - struct reiserfs_key *key; - struct disk_child *dc; - - nr_src = B_NR_ITEMS(src); - - RFALSE(dest == NULL || src == NULL, - "src (%p) or dest (%p) buffer is 0", src, dest); - RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, - "invalid last_first parameter (%d)", last_first); - RFALSE(nr_src < cpy_num - 1, - "no so many items (%d) in src (%d)", cpy_num, nr_src); - RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); - RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), - "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", - cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); - - if (cpy_num == 0) - return; - - /* coping */ - blkh = B_BLK_HEAD(dest); - nr_dest = blkh_nr_item(blkh); - - /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ - /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */ - (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = - nr_src - cpy_num + 1) : (dest_order = - nr_dest, - src_order = - 0); - - /* prepare space for cpy_num pointers */ - dc = B_N_CHILD(dest, dest_order); - - memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); - - /* insert pointers */ - memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); - - /* prepare space for cpy_num - 1 item headers */ - key = internal_key(dest, dest_order); - memmove(key + cpy_num - 1, key, - KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + - cpy_num)); - - /* insert headers */ - memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1)); - - /* sizes, item number */ - set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); - set_blkh_free_space(blkh, - blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + - DC_SIZE * cpy_num)); - - do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(dest); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); - put_dc_size(t_dc, - dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + - DC_SIZE * cpy_num)); - - do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, - 0); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(dest_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - } - -} - -/* - * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to - * buffer dest. - * Delete cpy_num - del_par items and node pointers from buffer src. - * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. - * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. 
- */ -static void internal_move_pointers_items(struct buffer_info *dest_bi, - struct buffer_info *src_bi, - int last_first, int cpy_num, - int del_par) -{ - int first_pointer; - int first_item; - - internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, - cpy_num); - - if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ - first_pointer = 0; - first_item = 0; - /* - * delete cpy_num - del_par pointers and keys starting for - * pointers with first_pointer, for key - with first_item - */ - internal_delete_pointers_items(src_bi, first_pointer, - first_item, cpy_num - del_par); - } else { /* shift_right occurs */ - int i, j; - - i = (cpy_num - del_par == - (j = - B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + - del_par; - - internal_delete_pointers_items(src_bi, - j + 1 - cpy_num + del_par, i, - cpy_num - del_par); - } -} - -/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ -static void internal_insert_key(struct buffer_info *dest_bi, - /* insert key before key with n_dest number */ - int dest_position_before, - struct buffer_head *src, int src_position) -{ - struct buffer_head *dest = dest_bi->bi_bh; - int nr; - struct block_head *blkh; - struct reiserfs_key *key; - - RFALSE(dest == NULL || src == NULL, - "source(%p) or dest(%p) buffer is 0", src, dest); - RFALSE(dest_position_before < 0 || src_position < 0, - "source(%d) or dest(%d) key number less than 0", - src_position, dest_position_before); - RFALSE(dest_position_before > B_NR_ITEMS(dest) || - src_position >= B_NR_ITEMS(src), - "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", - dest_position_before, B_NR_ITEMS(dest), - src_position, B_NR_ITEMS(src)); - RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, - "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); - - blkh = B_BLK_HEAD(dest); - nr = blkh_nr_item(blkh); - - /* prepare space for inserting key */ - key = internal_key(dest, dest_position_before); - memmove(key + 1, key, - (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); - - /* insert key */ - memcpy(key, internal_key(src, src_position), KEY_SIZE); - - /* Change dirt, free space, item number fields. */ - - set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); - set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); - - do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); - put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); - - do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, - 0); - } -} - -/* - * Insert d_key'th (delimiting) key from buffer cfl to tail of dest. - * Copy pointer_amount node pointers and pointer_amount - 1 items from - * buffer src to buffer dest. - * Replace d_key'th key in buffer cfl. - * Delete pointer_amount items and node pointers from buffer src. 
- */ -/* this can be invoked both to shift from S to L and from R to S */ -static void internal_shift_left( - /* - * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S - */ - int mode, - struct tree_balance *tb, - int h, int pointer_amount) -{ - struct buffer_info dest_bi, src_bi; - struct buffer_head *cf; - int d_key_position; - - internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, - &d_key_position, &cf); - - /*printk("pointer_amount = %d\n",pointer_amount); */ - - if (pointer_amount) { - /* - * insert delimiting key from common father of dest and - * src to node dest into position B_NR_ITEM(dest) - */ - internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, - d_key_position); - - if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { - if (src_bi.bi_position /*src->b_item_order */ == 0) - replace_key(tb, cf, d_key_position, - src_bi. - bi_parent /*src->b_parent */ , 0); - } else - replace_key(tb, cf, d_key_position, src_bi.bi_bh, - pointer_amount - 1); - } - /* last parameter is del_parameter */ - internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, - pointer_amount, 0); - -} - -/* - * Insert delimiting key to L[h]. - * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. - * Delete n - 1 items and node pointers from buffer S[h]. - */ -/* it always shifts from S[h] to L[h] */ -static void internal_shift1_left(struct tree_balance *tb, - int h, int pointer_amount) -{ - struct buffer_info dest_bi, src_bi; - struct buffer_head *cf; - int d_key_position; - - internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, - &dest_bi, &src_bi, &d_key_position, &cf); - - /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ - if (pointer_amount > 0) - internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, - d_key_position); - - /* last parameter is del_parameter */ - internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, - pointer_amount, 1); -} - -/* - * Insert d_key'th (delimiting) key from buffer cfr to head of dest. - * Copy n node pointers and n - 1 items from buffer src to buffer dest. - * Replace d_key'th key in buffer cfr. - * Delete n items and node pointers from buffer src. - */ -static void internal_shift_right( - /* - * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S - */ - int mode, - struct tree_balance *tb, - int h, int pointer_amount) -{ - struct buffer_info dest_bi, src_bi; - struct buffer_head *cf; - int d_key_position; - int nr; - - internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, - &d_key_position, &cf); - - nr = B_NR_ITEMS(src_bi.bi_bh); - - if (pointer_amount > 0) { - /* - * insert delimiting key from common father of dest - * and src to dest node into position 0 - */ - internal_insert_key(&dest_bi, 0, cf, d_key_position); - if (nr == pointer_amount - 1) { - RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || - dest_bi.bi_bh != tb->R[h], - "src (%p) must be == tb->S[h](%p) when it disappears", - src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); - /* when S[h] disappers replace left delemiting key as well */ - if (tb->CFL[h]) - replace_key(tb, cf, d_key_position, tb->CFL[h], - tb->lkey[h]); - } else - replace_key(tb, cf, d_key_position, src_bi.bi_bh, - nr - pointer_amount); - } - - /* last parameter is del_parameter */ - internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, - pointer_amount, 0); -} - -/* - * Insert delimiting key to R[h]. - * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. - * Delete n - 1 items and node pointers from buffer S[h]. 
- */ -/* it always shift from S[h] to R[h] */ -static void internal_shift1_right(struct tree_balance *tb, - int h, int pointer_amount) -{ - struct buffer_info dest_bi, src_bi; - struct buffer_head *cf; - int d_key_position; - - internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, - &dest_bi, &src_bi, &d_key_position, &cf); - - /* insert rkey from CFR[h] to right neighbor R[h] */ - if (pointer_amount > 0) - internal_insert_key(&dest_bi, 0, cf, d_key_position); - - /* last parameter is del_parameter */ - internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, - pointer_amount, 1); -} - -/* - * Delete insert_num node pointers together with their left items - * and balance current node. - */ -static void balance_internal_when_delete(struct tree_balance *tb, - int h, int child_pos) -{ - int insert_num; - int n; - struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); - struct buffer_info bi; - - insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); - - /* delete child-node-pointer(s) together with their left item(s) */ - bi.tb = tb; - bi.bi_bh = tbSh; - bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); - bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - - internal_delete_childs(&bi, child_pos, -insert_num); - - RFALSE(tb->blknum[h] > 1, - "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); - - n = B_NR_ITEMS(tbSh); - - if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { - if (tb->blknum[h] == 0) { - /* node S[h] (root of the tree) is empty now */ - struct buffer_head *new_root; - - RFALSE(n - || B_FREE_SPACE(tbSh) != - MAX_CHILD_SIZE(tbSh) - DC_SIZE, - "buffer must have only 0 keys (%d)", n); - RFALSE(bi.bi_parent, "root has parent (%p)", - bi.bi_parent); - - /* choose a new root */ - if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) - new_root = tb->R[h - 1]; - else - new_root = tb->L[h - 1]; - /* - * switch super block's tree root block - * number to the new value */ - PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); - /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */ - PUT_SB_TREE_HEIGHT(tb->tb_sb, - SB_TREE_HEIGHT(tb->tb_sb) - 1); - - do_balance_mark_sb_dirty(tb, - REISERFS_SB(tb->tb_sb)->s_sbh, - 1); - /*&&&&&&&&&&&&&&&&&&&&&& */ - /* use check_internal if new root is an internal node */ - if (h > 1) - check_internal(new_root); - /*&&&&&&&&&&&&&&&&&&&&&& */ - - /* do what is needed for buffer thrown from tree */ - reiserfs_invalidate_buffer(tb, tbSh); - return; - } - return; - } - - /* join S[h] with L[h] */ - if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { - - RFALSE(tb->rnum[h] != 0, - "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", - h, tb->rnum[h]); - - internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); - reiserfs_invalidate_buffer(tb, tbSh); - - return; - } - - /* join S[h] with R[h] */ - if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { - RFALSE(tb->lnum[h] != 0, - "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", - h, tb->lnum[h]); - - internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); - - reiserfs_invalidate_buffer(tb, tbSh); - return; - } - - /* borrow from left neighbor L[h] */ - if (tb->lnum[h] < 0) { - RFALSE(tb->rnum[h] != 0, - "wrong tb->rnum[%d]==%d when borrow from L[h]", h, - tb->rnum[h]); - internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, - -tb->lnum[h]); - return; - } - - /* borrow from right neighbor R[h] */ - if (tb->rnum[h] < 0) { - RFALSE(tb->lnum[h] != 0, - "invalid tb->lnum[%d]==%d when borrow from R[h]", - h, tb->lnum[h]); - 
internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ - return; - } - - /* split S[h] into two parts and put them into neighbors */ - if (tb->lnum[h] > 0) { - RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, - "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", - h, tb->lnum[h], h, tb->rnum[h], n); - - internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ - internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, - tb->rnum[h]); - - reiserfs_invalidate_buffer(tb, tbSh); - - return; - } - reiserfs_panic(tb->tb_sb, "ibalance-2", - "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", - h, tb->lnum[h], h, tb->rnum[h]); -} - -/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ -static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) -{ - RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, - "L[h](%p) and CFL[h](%p) must exist in replace_lkey", - tb->L[h], tb->CFL[h]); - - if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) - return; - - memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); - - do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); -} - -/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ -static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) -{ - RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, - "R[h](%p) and CFR[h](%p) must exist in replace_rkey", - tb->R[h], tb->CFR[h]); - RFALSE(B_NR_ITEMS(tb->R[h]) == 0, - "R[h] can not be empty if it exists (item number=%d)", - B_NR_ITEMS(tb->R[h])); - - memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); - - do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); -} - - -/* - * if inserting/pasting { - * child_pos is the position of the node-pointer in S[h] that - * pointed to S[h-1] before balancing of the h-1 level; - * this means that new pointers and items must be inserted AFTER - * child_pos - * } else { - * it is the position of the leftmost pointer that must be deleted - * (together with its corresponding key to the left of the pointer) - * as a result of the previous level's balancing. - * } - */ - -int balance_internal(struct tree_balance *tb, - int h, /* level of the tree */ - int child_pos, - /* key for insertion on higher level */ - struct item_head *insert_key, - /* node for insertion on higher level */ - struct buffer_head **insert_ptr) -{ - struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); - struct buffer_info bi; - - /* - * we return this: it is 0 if there is no S[h], - * else it is tb->S[h]->b_item_order - */ - int order; - int insert_num, n, k; - struct buffer_head *S_new; - struct item_head new_insert_key; - struct buffer_head *new_insert_ptr = NULL; - struct item_head *new_insert_key_addr = insert_key; - - RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); - - PROC_INFO_INC(tb->tb_sb, balance_at[h]); - - order = - (tbSh) ? PATH_H_POSITION(tb->tb_path, - h + 1) /*tb->S[h]->b_item_order */ : 0; - - /* - * Using insert_size[h] calculate the number insert_num of items - * that must be inserted to or deleted from S[h]. 
- */ - insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); - - /* Check whether insert_num is proper * */ - RFALSE(insert_num < -2 || insert_num > 2, - "incorrect number of items inserted to the internal node (%d)", - insert_num); - RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), - "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", - insert_num, h); - - /* Make balance in case insert_num < 0 */ - if (insert_num < 0) { - balance_internal_when_delete(tb, h, child_pos); - return order; - } - - k = 0; - if (tb->lnum[h] > 0) { - /* - * shift lnum[h] items from S[h] to the left neighbor L[h]. - * check how many of new items fall into L[h] or CFL[h] after - * shifting - */ - n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ - if (tb->lnum[h] <= child_pos) { - /* new items don't fall into L[h] or CFL[h] */ - internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, - tb->lnum[h]); - child_pos -= tb->lnum[h]; - } else if (tb->lnum[h] > child_pos + insert_num) { - /* all new items fall into L[h] */ - internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, - tb->lnum[h] - insert_num); - /* insert insert_num keys and node-pointers into L[h] */ - bi.tb = tb; - bi.bi_bh = tb->L[h]; - bi.bi_parent = tb->FL[h]; - bi.bi_position = get_left_neighbor_position(tb, h); - internal_insert_childs(&bi, - /*tb->L[h], tb->S[h-1]->b_next */ - n + child_pos + 1, - insert_num, insert_key, - insert_ptr); - - insert_num = 0; - } else { - struct disk_child *dc; - - /* - * some items fall into L[h] or CFL[h], - * but some don't fall - */ - internal_shift1_left(tb, h, child_pos + 1); - /* calculate number of new items that fall into L[h] */ - k = tb->lnum[h] - child_pos - 1; - bi.tb = tb; - bi.bi_bh = tb->L[h]; - bi.bi_parent = tb->FL[h]; - bi.bi_position = get_left_neighbor_position(tb, h); - internal_insert_childs(&bi, - /*tb->L[h], tb->S[h-1]->b_next, */ - n + child_pos + 1, k, - insert_key, insert_ptr); - - replace_lkey(tb, h, insert_key + k); - - /* - * replace the first node-ptr in S[h] by - * node-ptr to insert_ptr[k] - */ - dc = B_N_CHILD(tbSh, 0); - put_dc_size(dc, - MAX_CHILD_SIZE(insert_ptr[k]) - - B_FREE_SPACE(insert_ptr[k])); - put_dc_block_number(dc, insert_ptr[k]->b_blocknr); - - do_balance_mark_internal_dirty(tb, tbSh, 0); - - k++; - insert_key += k; - insert_ptr += k; - insert_num -= k; - child_pos = 0; - } - } - /* tb->lnum[h] > 0 */ - if (tb->rnum[h] > 0) { - /*shift rnum[h] items from S[h] to the right neighbor R[h] */ - /* - * check how many of new items fall into R or CFR - * after shifting - */ - n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ - if (n - tb->rnum[h] >= child_pos) - /* new items fall into S[h] */ - internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, - tb->rnum[h]); - else if (n + insert_num - tb->rnum[h] < child_pos) { - /* all new items fall into R[h] */ - internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, - tb->rnum[h] - insert_num); - - /* insert insert_num keys and node-pointers into R[h] */ - bi.tb = tb; - bi.bi_bh = tb->R[h]; - bi.bi_parent = tb->FR[h]; - bi.bi_position = get_right_neighbor_position(tb, h); - internal_insert_childs(&bi, - /*tb->R[h],tb->S[h-1]->b_next */ - child_pos - n - insert_num + - tb->rnum[h] - 1, - insert_num, insert_key, - insert_ptr); - insert_num = 0; - } else { - struct disk_child *dc; - - /* one of the items falls into CFR[h] */ - internal_shift1_right(tb, h, n - child_pos + 1); - /* calculate number of new items that fall into R[h] */ - k = tb->rnum[h] - n + 
child_pos - 1; - bi.tb = tb; - bi.bi_bh = tb->R[h]; - bi.bi_parent = tb->FR[h]; - bi.bi_position = get_right_neighbor_position(tb, h); - internal_insert_childs(&bi, - /*tb->R[h], tb->R[h]->b_child, */ - 0, k, insert_key + 1, - insert_ptr + 1); - - replace_rkey(tb, h, insert_key + insert_num - k - 1); - - /* - * replace the first node-ptr in R[h] by - * node-ptr insert_ptr[insert_num-k-1] - */ - dc = B_N_CHILD(tb->R[h], 0); - put_dc_size(dc, - MAX_CHILD_SIZE(insert_ptr - [insert_num - k - 1]) - - B_FREE_SPACE(insert_ptr - [insert_num - k - 1])); - put_dc_block_number(dc, - insert_ptr[insert_num - k - - 1]->b_blocknr); - - do_balance_mark_internal_dirty(tb, tb->R[h], 0); - - insert_num -= (k + 1); - } - } - - /** Fill new node that appears instead of S[h] **/ - RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); - RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); - - if (!tb->blknum[h]) { /* node S[h] is empty now */ - RFALSE(!tbSh, "S[h] is equal NULL"); - - /* do what is needed for buffer thrown from tree */ - reiserfs_invalidate_buffer(tb, tbSh); - return order; - } - - if (!tbSh) { - /* create new root */ - struct disk_child *dc; - struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); - struct block_head *blkh; - - if (tb->blknum[h] != 1) - reiserfs_panic(NULL, "ibalance-3", "One new node " - "required for creating the new root"); - /* S[h] = empty buffer from the list FEB. */ - tbSh = get_FEB(tb); - blkh = B_BLK_HEAD(tbSh); - set_blkh_level(blkh, h + 1); - - /* Put the unique node-pointer to S[h] that points to S[h-1]. */ - - dc = B_N_CHILD(tbSh, 0); - put_dc_block_number(dc, tbSh_1->b_blocknr); - put_dc_size(dc, - (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); - - tb->insert_size[h] -= DC_SIZE; - set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); - - do_balance_mark_internal_dirty(tb, tbSh, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - check_internal(tbSh); - /*&&&&&&&&&&&&&&&&&&&&&&&& */ - - /* put new root into path structure */ - PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = - tbSh; - - /* Change root in structure super block. 
*/ - PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); - PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); - do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - } - - if (tb->blknum[h] == 2) { - int snum; - struct buffer_info dest_bi, src_bi; - - /* S_new = free buffer from list FEB */ - S_new = get_FEB(tb); - - set_blkh_level(B_BLK_HEAD(S_new), h + 1); - - dest_bi.tb = tb; - dest_bi.bi_bh = S_new; - dest_bi.bi_parent = NULL; - dest_bi.bi_position = 0; - src_bi.tb = tb; - src_bi.bi_bh = tbSh; - src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); - src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - - n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ - snum = (insert_num + n + 1) / 2; - if (n - snum >= child_pos) { - /* new items don't fall into S_new */ - /* store the delimiting key for the next level */ - /* new_insert_key = (n - snum)'th key in S[h] */ - memcpy(&new_insert_key, internal_key(tbSh, n - snum), - KEY_SIZE); - /* last parameter is del_par */ - internal_move_pointers_items(&dest_bi, &src_bi, - LAST_TO_FIRST, snum, 0); - } else if (n + insert_num - snum < child_pos) { - /* all new items fall into S_new */ - /* store the delimiting key for the next level */ - /* - * new_insert_key = (n + insert_item - snum)'th - * key in S[h] - */ - memcpy(&new_insert_key, - internal_key(tbSh, n + insert_num - snum), - KEY_SIZE); - /* last parameter is del_par */ - internal_move_pointers_items(&dest_bi, &src_bi, - LAST_TO_FIRST, - snum - insert_num, 0); - - /* - * insert insert_num keys and node-pointers - * into S_new - */ - internal_insert_childs(&dest_bi, - /*S_new,tb->S[h-1]->b_next, */ - child_pos - n - insert_num + - snum - 1, - insert_num, insert_key, - insert_ptr); - - insert_num = 0; - } else { - struct disk_child *dc; - - /* some items fall into S_new, but some don't fall */ - /* last parameter is del_par */ - internal_move_pointers_items(&dest_bi, &src_bi, - LAST_TO_FIRST, - n - child_pos + 1, 1); - /* calculate number of new items that fall into S_new */ - k = snum - n + child_pos - 1; - - internal_insert_childs(&dest_bi, /*S_new, */ 0, k, - insert_key + 1, insert_ptr + 1); - - /* new_insert_key = insert_key[insert_num - k - 1] */ - memcpy(&new_insert_key, insert_key + insert_num - k - 1, - KEY_SIZE); - /* - * replace first node-ptr in S_new by node-ptr - * to insert_ptr[insert_num-k-1] - */ - - dc = B_N_CHILD(S_new, 0); - put_dc_size(dc, - (MAX_CHILD_SIZE - (insert_ptr[insert_num - k - 1]) - - B_FREE_SPACE(insert_ptr - [insert_num - k - 1]))); - put_dc_block_number(dc, - insert_ptr[insert_num - k - - 1]->b_blocknr); - - do_balance_mark_internal_dirty(tb, S_new, 0); - - insert_num -= (k + 1); - } - /* new_insert_ptr = node_pointer to S_new */ - new_insert_ptr = S_new; - - RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) - || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", - S_new); - - /* S_new is released in unfix_nodes */ - } - - n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ - - if (0 <= child_pos && child_pos <= n && insert_num > 0) { - bi.tb = tb; - bi.bi_bh = tbSh; - bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); - bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); - internal_insert_childs(&bi, /*tbSh, */ - /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ - child_pos, insert_num, insert_key, - insert_ptr); - } - - insert_ptr[0] = new_insert_ptr; - if (new_insert_ptr) - memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); - - return order; -} diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c deleted file mode 100644 index d39ee5f6c075..000000000000 --- a/fs/reiserfs/inode.c +++ /dev/null @@ -1,3416 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include "reiserfs.h" -#include "acl.h" -#include "xattr.h" -#include <linux/exportfs.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/unaligned.h> -#include <linux/buffer_head.h> -#include <linux/mpage.h> -#include <linux/writeback.h> -#include <linux/quotaops.h> -#include <linux/swap.h> -#include <linux/uio.h> -#include <linux/bio.h> - -int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to); - -void reiserfs_evict_inode(struct inode *inode) -{ - /* - * We need blocks for transaction + (user+group) quota - * update (possibly delete) - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 2 + - 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); - struct reiserfs_transaction_handle th; - int err; - - if (!inode->i_nlink && !is_bad_inode(inode)) - dquot_initialize(inode); - - truncate_inode_pages_final(&inode->i_data); - if (inode->i_nlink) - goto no_delete; - - /* - * The = 0 happens when we abort creating a new inode - * for some reason like lack of space.. - * also handles bad_inode case - */ - if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { - - reiserfs_delete_xattrs(inode); - - reiserfs_write_lock(inode->i_sb); - - if (journal_begin(&th, inode->i_sb, jbegin_count)) - goto out; - reiserfs_update_inode_transaction(inode); - - reiserfs_discard_prealloc(&th, inode); - - err = reiserfs_delete_object(&th, inode); - - /* - * Do quota update inside a transaction for journaled quotas. 
- * We must do that after delete_object so that quota updates - * go into the same transaction as stat data deletion - */ - if (!err) { - int depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_free_inode(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); - } - - if (journal_end(&th)) - goto out; - - /* - * check return value from reiserfs_delete_object after - * ending the transaction - */ - if (err) - goto out; - - /* - * all items of file are deleted, so we can remove - * "save" link - * we can't do anything about an error here - */ - remove_save_link(inode, 0 /* not truncate */); -out: - reiserfs_write_unlock(inode->i_sb); - } else { - /* no object items are in the tree */ - ; - } - - /* note this must go after the journal_end to prevent deadlock */ - clear_inode(inode); - - dquot_drop(inode); - inode->i_blocks = 0; - return; - -no_delete: - clear_inode(inode); - dquot_drop(inode); -} - -static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, - __u32 objectid, loff_t offset, int type, int length) -{ - key->version = version; - - key->on_disk_key.k_dir_id = dirid; - key->on_disk_key.k_objectid = objectid; - set_cpu_key_k_offset(key, offset); - set_cpu_key_k_type(key, type); - key->key_length = length; -} - -/* - * take base of inode_key (it comes from inode always) (dirid, objectid) - * and version from an inode, set offset and type of key - */ -void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, - int type, int length) -{ - _make_cpu_key(key, get_inode_item_key_version(inode), - le32_to_cpu(INODE_PKEY(inode)->k_dir_id), - le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, - length); -} - -/* when key is 0, do not set version and short key */ -inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, - int version, - loff_t offset, int type, int length, - int entry_count /*or ih_free_space */ ) -{ - if (key) { - ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); - ih->ih_key.k_objectid = - cpu_to_le32(key->on_disk_key.k_objectid); - } - put_ih_version(ih, version); - set_le_ih_k_offset(ih, offset); - set_le_ih_k_type(ih, type); - put_ih_item_len(ih, length); - /* set_ih_free_space (ih, 0); */ - /* - * for directory items it is entry count, for directs and stat - * datas - 0xffff, for indirects - 0 - */ - put_ih_entry_count(ih, entry_count); -} - -/* - * FIXME: we might cache recently accessed indirect item - * Ugh. Not too eager for that.... - * I cut the code until such time as I see a convincing argument (benchmark). - * I don't want a bloated inode struct..., and I don't like code complexity.... - */ - -/* - * cutting the code is fine, since it really isn't in use yet and is easy - * to add back in. But, Vladimir has a really good idea here. Think - * about what happens for reading a file. For each page, - * The VFS layer calls reiserfs_read_folio, who searches the tree to find - * an indirect item. This indirect item has X number of pointers, where - * X is a big number if we've done the block allocation right. But, - * we only use one or two of these pointers during each call to read_folio, - * needlessly researching again later on. - * - * The size of the cache could be dynamic based on the size of the file. - * - * I'd also like to see us cache the location the stat data item, since - * we are needlessly researching for that frequently. 
- * - * --chris - */ - -/* - * If this page has a file tail in it, and - * it was read in by get_block_create_0, the page data is valid, - * but tail is still sitting in a direct item, and we can't write to - * it. So, look through this page, and check all the mapped buffers - * to make sure they have valid block numbers. Any that don't need - * to be unmapped, so that __block_write_begin will correctly call - * reiserfs_get_block to convert the tail into an unformatted node - */ -static inline void fix_tail_page_for_writing(struct page *page) -{ - struct buffer_head *head, *next, *bh; - - if (page && page_has_buffers(page)) { - head = page_buffers(page); - bh = head; - do { - next = bh->b_this_page; - if (buffer_mapped(bh) && bh->b_blocknr == 0) { - reiserfs_unmap_buffer(bh); - } - bh = next; - } while (bh != head); - } -} - -/* - * reiserfs_get_block does not need to allocate a block only if it has been - * done already or non-hole position has been found in the indirect item - */ -static inline int allocation_needed(int retval, b_blocknr_t allocated, - struct item_head *ih, - __le32 * item, int pos_in_item) -{ - if (allocated) - return 0; - if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && - get_block_num(item, pos_in_item)) - return 0; - return 1; -} - -static inline int indirect_item_found(int retval, struct item_head *ih) -{ - return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); -} - -static inline void set_block_dev_mapped(struct buffer_head *bh, - b_blocknr_t block, struct inode *inode) -{ - map_bh(bh, inode->i_sb, block); -} - -/* - * files which were created in the earlier version can not be longer, - * than 2 gb - */ -static int file_capable(struct inode *inode, sector_t block) -{ - /* it is new file. */ - if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || - /* old file, but 'block' is inside of 2gb */ - block < (1 << (31 - inode->i_sb->s_blocksize_bits))) - return 1; - - return 0; -} - -static int restart_transaction(struct reiserfs_transaction_handle *th, - struct inode *inode, struct treepath *path) -{ - struct super_block *s = th->t_super; - int err; - - BUG_ON(!th->t_trans_id); - BUG_ON(!th->t_refcount); - - pathrelse(path); - - /* we cannot restart while nested */ - if (th->t_refcount > 1) { - return 0; - } - reiserfs_update_sd(th, inode); - err = journal_end(th); - if (!err) { - err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); - if (!err) - reiserfs_update_inode_transaction(inode); - } - return err; -} - -/* - * it is called by get_block when create == 0. Returns block number - * for 'block'-th logical block of file. When it hits direct item it - * returns 0 (being called from bmap) or read direct item into piece - * of page (bh_result) - * Please improve the english/clarity in the comment above, as it is - * hard to understand. 
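
To make the offset arithmetic used by the lookup below easier to follow, here is a minimal, self-contained sketch (plain userspace C; the blocksize value is illustrative, standing in for inode->i_sb->s_blocksize) of how a logical block index becomes a reiserfs key offset, which is exactly the expression handed to make_cpu_key() in _get_block_create_0() and reiserfs_get_block():

#include <stdio.h>

/*
 * reiserfs keys address file bodies by byte offset, counted from 1,
 * so the first byte of logical block 'block' sits at key offset
 * block * blocksize + 1.
 */
static long long key_offset_for_block(long long block, unsigned int blocksize)
{
	return block * (long long)blocksize + 1;
}

int main(void)
{
	unsigned int blocksize = 4096;	/* illustrative block size */
	long long block;

	for (block = 0; block < 3; block++)
		printf("block %lld -> key offset %lld\n",
		       block, key_offset_for_block(block, blocksize));

	/*
	 * The result of the real lookup is reported through bh_result:
	 * a nonzero b_blocknr for an unformatted node, and a buffer mapped
	 * to block 0 when the data lives in a packed tail (direct item).
	 */
	return 0;
}
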
- */ -static int _get_block_create_0(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int args) -{ - INITIALIZE_PATH(path); - struct cpu_key key; - struct buffer_head *bh; - struct item_head *ih, tmp_ih; - b_blocknr_t blocknr; - char *p; - int chars; - int ret; - int result; - int done = 0; - unsigned long offset; - - /* prepare the key to look for the 'block'-th block of file */ - make_cpu_key(&key, inode, - (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, - 3); - - result = search_for_position_by_key(inode->i_sb, &key, &path); - if (result != POSITION_FOUND) { - pathrelse(&path); - if (result == IO_ERROR) - return -EIO; - /* - * We do not return -ENOENT if there is a hole but page is - * uptodate, because it means that there is some MMAPED data - * associated with it that is yet to be written to disk. - */ - if ((args & GET_BLOCK_NO_HOLE) - && !PageUptodate(bh_result->b_page)) { - return -ENOENT; - } - return 0; - } - - bh = get_last_bh(&path); - ih = tp_item_head(&path); - if (is_indirect_le_ih(ih)) { - __le32 *ind_item = (__le32 *) ih_item_body(bh, ih); - - /* - * FIXME: here we could cache indirect item or part of it in - * the inode to avoid search_by_key in case of subsequent - * access to file - */ - blocknr = get_block_num(ind_item, path.pos_in_item); - ret = 0; - if (blocknr) { - map_bh(bh_result, inode->i_sb, blocknr); - if (path.pos_in_item == - ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { - set_buffer_boundary(bh_result); - } - } else - /* - * We do not return -ENOENT if there is a hole but - * page is uptodate, because it means that there is - * some MMAPED data associated with it that is - * yet to be written to disk. - */ - if ((args & GET_BLOCK_NO_HOLE) - && !PageUptodate(bh_result->b_page)) { - ret = -ENOENT; - } - - pathrelse(&path); - return ret; - } - /* requested data are in direct item(s) */ - if (!(args & GET_BLOCK_READ_DIRECT)) { - /* - * we are called by bmap. FIXME: we can not map block of file - * when it is stored in direct item(s) - */ - pathrelse(&path); - return -ENOENT; - } - - /* - * if we've got a direct item, and the buffer or page was uptodate, - * we don't want to pull data off disk again. skip to the - * end, where we map the buffer and return - */ - if (buffer_uptodate(bh_result)) { - goto finished; - } else - /* - * grab_tail_page can trigger calls to reiserfs_get_block on - * up to date pages without any buffers. If the page is up - * to date, we don't want read old data off disk. Set the up - * to date bit on the buffer instead and jump to the end - */ - if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { - set_buffer_uptodate(bh_result); - goto finished; - } - /* read file tail into part of page */ - offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1); - copy_item_head(&tmp_ih, ih); - - /* - * we only want to kmap if we are reading the tail into the page. - * this is not the common case, so we don't kmap until we are - * sure we need to. But, this means the item might move if - * kmap schedules - */ - p = (char *)kmap(bh_result->b_page); - p += offset; - memset(p, 0, inode->i_sb->s_blocksize); - do { - if (!is_direct_le_ih(ih)) { - BUG(); - } - /* - * make sure we don't read more bytes than actually exist in - * the file. 
This can happen in odd cases where i_size isn't - * correct, and when direct item padding results in a few - * extra bytes at the end of the direct item - */ - if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) - break; - if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { - chars = - inode->i_size - (le_ih_k_offset(ih) - 1) - - path.pos_in_item; - done = 1; - } else { - chars = ih_item_len(ih) - path.pos_in_item; - } - memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars); - - if (done) - break; - - p += chars; - - /* - * we done, if read direct item is not the last item of - * node FIXME: we could try to check right delimiting key - * to see whether direct item continues in the right - * neighbor or rely on i_size - */ - if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) - break; - - /* update key to look for the next piece */ - set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); - result = search_for_position_by_key(inode->i_sb, &key, &path); - if (result != POSITION_FOUND) - /* i/o error most likely */ - break; - bh = get_last_bh(&path); - ih = tp_item_head(&path); - } while (1); - - flush_dcache_page(bh_result->b_page); - kunmap(bh_result->b_page); - -finished: - pathrelse(&path); - - if (result == IO_ERROR) - return -EIO; - - /* - * this buffer has valid data, but isn't valid for io. mapping it to - * block #0 tells the rest of reiserfs it just has a tail in it - */ - map_bh(bh_result, inode->i_sb, 0); - set_buffer_uptodate(bh_result); - return 0; -} - -/* - * this is called to create file map. So, _get_block_create_0 will not - * read direct item - */ -static int reiserfs_bmap(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create) -{ - if (!file_capable(inode, block)) - return -EFBIG; - - reiserfs_write_lock(inode->i_sb); - /* do not read the direct item */ - _get_block_create_0(inode, block, bh_result, 0); - reiserfs_write_unlock(inode->i_sb); - return 0; -} - -/* - * special version of get_block that is only used by grab_tail_page right - * now. It is sent to __block_write_begin, and when you try to get a - * block past the end of the file (or a block from a hole) it returns - * -ENOENT instead of a valid buffer. __block_write_begin expects to - * be able to do i/o on the buffers returned, unless an error value - * is also returned. - * - * So, this allows __block_write_begin to be used for reading a single block - * in a page. Where it does not produce a valid page for holes, or past the - * end of the file. This turns out to be exactly what we need for reading - * tails for conversion. - * - * The point of the wrapper is forcing a certain value for create, even - * though the VFS layer is calling this function with create==1. If you - * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, - * don't use this function. -*/ -static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, - struct buffer_head *bh_result, - int create) -{ - return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); -} - -/* - * This is special helper for reiserfs_get_block in case we are executing - * direct_IO request. 
- */ -static int reiserfs_get_blocks_direct_io(struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - int ret; - - bh_result->b_page = NULL; - - /* - * We set the b_size before reiserfs_get_block call since it is - * referenced in convert_tail_for_hole() that may be called from - * reiserfs_get_block() - */ - bh_result->b_size = i_blocksize(inode); - - ret = reiserfs_get_block(inode, iblock, bh_result, - create | GET_BLOCK_NO_DANGLE); - if (ret) - goto out; - - /* don't allow direct io onto tail pages */ - if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* - * make sure future calls to the direct io funcs for this - * offset in the file fail by unmapping the buffer - */ - clear_buffer_mapped(bh_result); - ret = -EINVAL; - } - - /* - * Possible unpacked tail. Flush the data before pages have - * disappeared - */ - if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { - int err; - - reiserfs_write_lock(inode->i_sb); - - err = reiserfs_commit_for_inode(inode); - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - - reiserfs_write_unlock(inode->i_sb); - - if (err < 0) - ret = err; - } -out: - return ret; -} - -/* - * helper function for when reiserfs_get_block is called for a hole - * but the file tail is still in a direct item - * bh_result is the buffer head for the hole - * tail_offset is the offset of the start of the tail in the file - * - * This calls prepare_write, which will start a new transaction - * you should not be in a transaction, or have any paths held when you - * call this. - */ -static int convert_tail_for_hole(struct inode *inode, - struct buffer_head *bh_result, - loff_t tail_offset) -{ - unsigned long index; - unsigned long tail_end; - unsigned long tail_start; - struct page *tail_page; - struct page *hole_page = bh_result->b_page; - int retval = 0; - - if ((tail_offset & (bh_result->b_size - 1)) != 1) - return -EIO; - - /* always try to read until the end of the block */ - tail_start = tail_offset & (PAGE_SIZE - 1); - tail_end = (tail_start | (bh_result->b_size - 1)) + 1; - - index = tail_offset >> PAGE_SHIFT; - /* - * hole_page can be zero in case of direct_io, we are sure - * that we cannot get here if we write with O_DIRECT into tail page - */ - if (!hole_page || index != hole_page->index) { - tail_page = grab_cache_page(inode->i_mapping, index); - retval = -ENOMEM; - if (!tail_page) { - goto out; - } - } else { - tail_page = hole_page; - } - - /* - * we don't have to make sure the conversion did not happen while - * we were locking the page because anyone that could convert - * must first take i_mutex. - * - * We must fix the tail page for writing because it might have buffers - * that are mapped, but have a block number of 0. This indicates tail - * data that has been read directly into the page, and - * __block_write_begin won't trigger a get_block in this case. 
- */ - fix_tail_page_for_writing(tail_page); - retval = __reiserfs_write_begin(tail_page, tail_start, - tail_end - tail_start); - if (retval) - goto unlock; - - /* tail conversion might change the data in the page */ - flush_dcache_page(tail_page); - - retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); - -unlock: - if (tail_page != hole_page) { - unlock_page(tail_page); - put_page(tail_page); - } -out: - return retval; -} - -static inline int _allocate_block(struct reiserfs_transaction_handle *th, - sector_t block, - struct inode *inode, - b_blocknr_t * allocated_block_nr, - struct treepath *path, int flags) -{ - BUG_ON(!th->t_trans_id); - -#ifdef REISERFS_PREALLOCATE - if (!(flags & GET_BLOCK_NO_IMUX)) { - return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, - path, block); - } -#endif - return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, - block); -} - -int reiserfs_get_block(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create) -{ - int repeat, retval = 0; - /* b_blocknr_t is (unsigned) 32 bit int*/ - b_blocknr_t allocated_block_nr = 0; - INITIALIZE_PATH(path); - int pos_in_item; - struct cpu_key key; - struct buffer_head *bh, *unbh = NULL; - struct item_head *ih, tmp_ih; - __le32 *item; - int done; - int fs_gen; - struct reiserfs_transaction_handle *th = NULL; - /* - * space reserved in transaction batch: - * . 3 balancings in direct->indirect conversion - * . 1 block involved into reiserfs_update_sd() - * XXX in practically impossible worst case direct2indirect() - * can incur (much) more than 3 balancings. - * quota update for user, group - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + 1 + - 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); - int version; - int dangle = 1; - loff_t new_offset = - (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - - reiserfs_write_lock(inode->i_sb); - version = get_inode_item_key_version(inode); - - if (!file_capable(inode, block)) { - reiserfs_write_unlock(inode->i_sb); - return -EFBIG; - } - - /* - * if !create, we aren't changing the FS, so we don't need to - * log anything, so we don't need to start a transaction - */ - if (!(create & GET_BLOCK_CREATE)) { - int ret; - /* find number of block-th logical block of the file */ - ret = _get_block_create_0(inode, block, bh_result, - create | GET_BLOCK_READ_DIRECT); - reiserfs_write_unlock(inode->i_sb); - return ret; - } - - /* - * if we're already in a transaction, make sure to close - * any new transactions we start in this func - */ - if ((create & GET_BLOCK_NO_DANGLE) || - reiserfs_transaction_running(inode->i_sb)) - dangle = 0; - - /* - * If file is of such a size, that it might have a tail and - * tails are enabled we should mark it as possibly needing - * tail packing on close - */ - if ((have_large_tails(inode->i_sb) - && inode->i_size < i_block_size(inode) * 4) - || (have_small_tails(inode->i_sb) - && inode->i_size < i_block_size(inode))) - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; - - /* set the key of the first byte in the 'block'-th block of file */ - make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); - if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { -start_trans: - th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); - if (!th) { - retval = -ENOMEM; - goto failure; - } - reiserfs_update_inode_transaction(inode); - } -research: - - retval = search_for_position_by_key(inode->i_sb, &key, &path); - if (retval == IO_ERROR) { - retval = -EIO; - goto 
failure; - } - - bh = get_last_bh(&path); - ih = tp_item_head(&path); - item = tp_item_body(&path); - pos_in_item = path.pos_in_item; - - fs_gen = get_generation(inode->i_sb); - copy_item_head(&tmp_ih, ih); - - if (allocation_needed - (retval, allocated_block_nr, ih, item, pos_in_item)) { - /* we have to allocate block for the unformatted node */ - if (!th) { - pathrelse(&path); - goto start_trans; - } - - repeat = - _allocate_block(th, block, inode, &allocated_block_nr, - &path, create); - - /* - * restart the transaction to give the journal a chance to free - * some blocks. releases the path, so we have to go back to - * research if we succeed on the second try - */ - if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { - SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; - retval = restart_transaction(th, inode, &path); - if (retval) - goto failure; - repeat = - _allocate_block(th, block, inode, - &allocated_block_nr, NULL, create); - - if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { - goto research; - } - if (repeat == QUOTA_EXCEEDED) - retval = -EDQUOT; - else - retval = -ENOSPC; - goto failure; - } - - if (fs_changed(fs_gen, inode->i_sb) - && item_moved(&tmp_ih, &path)) { - goto research; - } - } - - if (indirect_item_found(retval, ih)) { - b_blocknr_t unfm_ptr; - /* - * 'block'-th block is in the file already (there is - * corresponding cell in some indirect item). But it may be - * zero unformatted node pointer (hole) - */ - unfm_ptr = get_block_num(item, pos_in_item); - if (unfm_ptr == 0) { - /* use allocated block to plug the hole */ - reiserfs_prepare_for_journal(inode->i_sb, bh, 1); - if (fs_changed(fs_gen, inode->i_sb) - && item_moved(&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, - bh); - goto research; - } - set_buffer_new(bh_result); - if (buffer_dirty(bh_result) - && reiserfs_data_ordered(inode->i_sb)) - reiserfs_add_ordered_list(inode, bh_result); - put_block_num(item, pos_in_item, allocated_block_nr); - unfm_ptr = allocated_block_nr; - journal_mark_dirty(th, bh); - reiserfs_update_sd(th, inode); - } - set_block_dev_mapped(bh_result, unfm_ptr, inode); - pathrelse(&path); - retval = 0; - if (!dangle && th) - retval = reiserfs_end_persistent_transaction(th); - - reiserfs_write_unlock(inode->i_sb); - - /* - * the item was found, so new blocks were not added to the file - * there is no need to make sure the inode is updated with this - * transaction - */ - return retval; - } - - if (!th) { - pathrelse(&path); - goto start_trans; - } - - /* - * desired position is not found or is in the direct item. We have - * to append file with holes up to 'block'-th block converting - * direct items to indirect one if necessary - */ - done = 0; - do { - if (is_statdata_le_ih(ih)) { - __le32 unp = 0; - struct cpu_key tmp_key; - - /* indirect item has to be inserted */ - make_le_item_head(&tmp_ih, &key, version, 1, - TYPE_INDIRECT, UNFM_P_SIZE, - 0 /* free_space */ ); - - /* - * we are going to add 'block'-th block to the file. 
- * Use allocated block for that - */ - if (cpu_key_k_offset(&key) == 1) { - unp = cpu_to_le32(allocated_block_nr); - set_block_dev_mapped(bh_result, - allocated_block_nr, inode); - set_buffer_new(bh_result); - done = 1; - } - tmp_key = key; /* ;) */ - set_cpu_key_k_offset(&tmp_key, 1); - PATH_LAST_POSITION(&path)++; - - retval = - reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, - inode, (char *)&unp); - if (retval) { - reiserfs_free_block(th, inode, - allocated_block_nr, 1); - /* - * retval == -ENOSPC, -EDQUOT or -EIO - * or -EEXIST - */ - goto failure; - } - } else if (is_direct_le_ih(ih)) { - /* direct item has to be converted */ - loff_t tail_offset; - - tail_offset = - ((le_ih_k_offset(ih) - - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; - - /* - * direct item we just found fits into block we have - * to map. Convert it into unformatted node: use - * bh_result for the conversion - */ - if (tail_offset == cpu_key_k_offset(&key)) { - set_block_dev_mapped(bh_result, - allocated_block_nr, inode); - unbh = bh_result; - done = 1; - } else { - /* - * we have to pad file tail stored in direct - * item(s) up to block size and convert it - * to unformatted node. FIXME: this should - * also get into page cache - */ - - pathrelse(&path); - /* - * ugly, but we can only end the transaction if - * we aren't nested - */ - BUG_ON(!th->t_refcount); - if (th->t_refcount == 1) { - retval = - reiserfs_end_persistent_transaction - (th); - th = NULL; - if (retval) - goto failure; - } - - retval = - convert_tail_for_hole(inode, bh_result, - tail_offset); - if (retval) { - if (retval != -ENOSPC) - reiserfs_error(inode->i_sb, - "clm-6004", - "convert tail failed " - "inode %lu, error %d", - inode->i_ino, - retval); - if (allocated_block_nr) { - /* - * the bitmap, the super, - * and the stat data == 3 - */ - if (!th) - th = reiserfs_persistent_transaction(inode->i_sb, 3); - if (th) - reiserfs_free_block(th, - inode, - allocated_block_nr, - 1); - } - goto failure; - } - goto research; - } - retval = - direct2indirect(th, inode, &path, unbh, - tail_offset); - if (retval) { - reiserfs_unmap_buffer(unbh); - reiserfs_free_block(th, inode, - allocated_block_nr, 1); - goto failure; - } - /* - * it is important the set_buffer_uptodate is done - * after the direct2indirect. The buffer might - * contain valid data newer than the data on disk - * (read by read_folio, changed, and then sent here by - * writepage). direct2indirect needs to know if unbh - * was already up to date, so it can decide if the - * data in unbh needs to be replaced with data from - * the disk - */ - set_buffer_uptodate(unbh); - - /* - * unbh->b_page == NULL in case of DIRECT_IO request, - * this means buffer will disappear shortly, so it - * should not be added to - */ - if (unbh->b_page) { - /* - * we've converted the tail, so we must - * flush unbh before the transaction commits - */ - reiserfs_add_tail_list(inode, unbh); - - /* - * mark it dirty now to prevent commit_write - * from adding this buffer to the inode's - * dirty buffer list - */ - /* - * AKPM: changed __mark_buffer_dirty to - * mark_buffer_dirty(). 
It's still atomic, - * but it sets the page dirty too, which makes - * it eligible for writeback at any time by the - * VM (which was also the case with - * __mark_buffer_dirty()) - */ - mark_buffer_dirty(unbh); - } - } else { - /* - * append indirect item with holes if needed, when - * appending pointer to 'block'-th block use block, - * which is already allocated - */ - struct cpu_key tmp_key; - /* - * We use this in case we need to allocate - * only one block which is a fastpath - */ - unp_t unf_single = 0; - unp_t *un; - __u64 max_to_insert = - MAX_ITEM_LEN(inode->i_sb->s_blocksize) / - UNFM_P_SIZE; - __u64 blocks_needed; - - RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, - "vs-804: invalid position for append"); - /* - * indirect item has to be appended, - * set up key of that position - * (key type is unimportant) - */ - make_cpu_key(&tmp_key, inode, - le_key_k_offset(version, - &ih->ih_key) + - op_bytes_number(ih, - inode->i_sb->s_blocksize), - TYPE_INDIRECT, 3); - - RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), - "green-805: invalid offset"); - blocks_needed = - 1 + - ((cpu_key_k_offset(&key) - - cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> - s_blocksize_bits); - - if (blocks_needed == 1) { - un = &unf_single; - } else { - un = kcalloc(min(blocks_needed, max_to_insert), - UNFM_P_SIZE, GFP_NOFS); - if (!un) { - un = &unf_single; - blocks_needed = 1; - max_to_insert = 0; - } - } - if (blocks_needed <= max_to_insert) { - /* - * we are going to add target block to - * the file. Use allocated block for that - */ - un[blocks_needed - 1] = - cpu_to_le32(allocated_block_nr); - set_block_dev_mapped(bh_result, - allocated_block_nr, inode); - set_buffer_new(bh_result); - done = 1; - } else { - /* paste hole to the indirect item */ - /* - * If kcalloc failed, max_to_insert becomes - * zero and it means we only have space for - * one block - */ - blocks_needed = - max_to_insert ? max_to_insert : 1; - } - retval = - reiserfs_paste_into_item(th, &path, &tmp_key, inode, - (char *)un, - UNFM_P_SIZE * - blocks_needed); - - if (blocks_needed != 1) - kfree(un); - - if (retval) { - reiserfs_free_block(th, inode, - allocated_block_nr, 1); - goto failure; - } - if (!done) { - /* - * We need to mark new file size in case - * this function will be interrupted/aborted - * later on. And we may do this only for - * holes. - */ - inode->i_size += - inode->i_sb->s_blocksize * blocks_needed; - } - } - - if (done == 1) - break; - - /* - * this loop could log more blocks than we had originally - * asked for. So, we have to allow the transaction to end - * if it is too big or too full. Update the inode so things - * are consistent if we crash before the function returns - * release the path so that anybody waiting on the path before - * ending their transaction will be able to continue. - */ - if (journal_transaction_should_end(th, th->t_blocks_allocated)) { - retval = restart_transaction(th, inode, &path); - if (retval) - goto failure; - } - /* - * inserting indirect pointers for a hole can take a - * long time. reschedule if needed and also release the write - * lock for others. 
- */ - reiserfs_cond_resched(inode->i_sb); - - retval = search_for_position_by_key(inode->i_sb, &key, &path); - if (retval == IO_ERROR) { - retval = -EIO; - goto failure; - } - if (retval == POSITION_FOUND) { - reiserfs_warning(inode->i_sb, "vs-825", - "%K should not be found", &key); - retval = -EEXIST; - if (allocated_block_nr) - reiserfs_free_block(th, inode, - allocated_block_nr, 1); - pathrelse(&path); - goto failure; - } - bh = get_last_bh(&path); - ih = tp_item_head(&path); - item = tp_item_body(&path); - pos_in_item = path.pos_in_item; - } while (1); - - retval = 0; - -failure: - if (th && (!dangle || (retval && !th->t_trans_id))) { - int err; - if (th->t_trans_id) - reiserfs_update_sd(th, inode); - err = reiserfs_end_persistent_transaction(th); - if (err) - retval = err; - } - - reiserfs_write_unlock(inode->i_sb); - reiserfs_check_path(&path); - return retval; -} - -static void reiserfs_readahead(struct readahead_control *rac) -{ - mpage_readahead(rac, reiserfs_get_block); -} - -/* - * Compute real number of used bytes by file - * Following three functions can go away when we'll have enough space in - * stat item - */ -static int real_space_diff(struct inode *inode, int sd_size) -{ - int bytes; - loff_t blocksize = inode->i_sb->s_blocksize; - - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) - return sd_size; - - /* - * End of file is also in full block with indirect reference, so round - * up to the next block. - * - * there is just no way to know if the tail is actually packed - * on the file, so we have to assume it isn't. When we pack the - * tail, we add 4 bytes to pretend there really is an unformatted - * node pointer - */ - bytes = - ((inode->i_size + - (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + - sd_size; - return bytes; -} - -static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, - int sd_size) -{ - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { - return inode->i_size + - (loff_t) (real_space_diff(inode, sd_size)); - } - return ((loff_t) real_space_diff(inode, sd_size)) + - (((loff_t) blocks) << 9); -} - -/* Compute number of blocks used by file in ReiserFS counting */ -static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) -{ - loff_t bytes = inode_get_bytes(inode); - loff_t real_space = real_space_diff(inode, sd_size); - - /* keeps fsck and non-quota versions of reiserfs happy */ - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { - bytes += (loff_t) 511; - } - - /* - * files from before the quota patch might i_blocks such that - * bytes < real_space. Deal with that here to prevent it from - * going negative. - */ - if (bytes < real_space) - return 0; - return (bytes - real_space) >> 9; -} - -/* - * BAD: new directories have stat data of new type and all other items - * of old type. 
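
Returning briefly to the space-accounting helpers a little earlier (real_space_diff, to_real_used_space, to_fake_used_blocks): a small worked example of the same arithmetic, assuming a 4096-byte block, the 4-byte unformatted-node pointer the comment mentions, and an illustrative stand-in value for the stat data size:

#include <stdio.h>

/* mirrors real_space_diff() for a regular file: one 4-byte pointer per
 * block the file spans (size rounded up to whole blocks), plus the
 * stat data item itself */
static long long overhead_bytes(long long i_size, unsigned int blocksize,
				int sd_size)
{
	long long blocks = (i_size + blocksize - 1) / blocksize;

	return blocks * 4 + sd_size;	/* 4 bytes per unformatted pointer */
}

int main(void)
{
	long long i_size = 10000;	/* example file size in bytes */
	unsigned int blocksize = 4096;
	int sd_size = 44;		/* illustrative stand-in for SD_V2_SIZE */
	long long bytes = 12288;	/* what inode_get_bytes() might report */
	long long ovh = overhead_bytes(i_size, blocksize, sd_size);
	/* like to_fake_used_blocks(): clamp at 0 so old pre-quota inodes
	 * whose byte count is below the overhead never go negative */
	long long reported = bytes < ovh ? 0 : (bytes - ovh) >> 9;

	printf("overhead %lld bytes, reported 512-byte blocks %lld\n",
	       ovh, reported);
	return 0;
}
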
Version stored in the inode says about body items, so - * in update_stat_data we can not rely on inode, but have to check - * item version directly - */ - -/* called by read_locked_inode */ -static void init_inode(struct inode *inode, struct treepath *path) -{ - struct buffer_head *bh; - struct item_head *ih; - __u32 rdev; - - bh = PATH_PLAST_BUFFER(path); - ih = tp_item_head(path); - - copy_key(INODE_PKEY(inode), &ih->ih_key); - - INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); - REISERFS_I(inode)->i_flags = 0; - REISERFS_I(inode)->i_prealloc_block = 0; - REISERFS_I(inode)->i_prealloc_count = 0; - REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_jl = NULL; - reiserfs_init_xattr_rwsem(inode); - - if (stat_data_v1(ih)) { - struct stat_data_v1 *sd = - (struct stat_data_v1 *)ih_item_body(bh, ih); - unsigned long blocks; - - set_inode_item_key_version(inode, KEY_FORMAT_3_5); - set_inode_sd_version(inode, STAT_DATA_V1); - inode->i_mode = sd_v1_mode(sd); - set_nlink(inode, sd_v1_nlink(sd)); - i_uid_write(inode, sd_v1_uid(sd)); - i_gid_write(inode, sd_v1_gid(sd)); - inode->i_size = sd_v1_size(sd); - inode_set_atime(inode, sd_v1_atime(sd), 0); - inode_set_mtime(inode, sd_v1_mtime(sd), 0); - inode_set_ctime(inode, sd_v1_ctime(sd), 0); - - inode->i_blocks = sd_v1_blocks(sd); - inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); - blocks = (inode->i_size + 511) >> 9; - blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); - - /* - * there was a bug in <=3.5.23 when i_blocks could take - * negative values. Starting from 3.5.17 this value could - * even be stored in stat data. For such files we set - * i_blocks based on file size. Just 2 notes: this can be - * wrong for sparse files. On-disk value will be only - * updated if file's inode will ever change - */ - if (inode->i_blocks > blocks) { - inode->i_blocks = blocks; - } - - rdev = sd_v1_rdev(sd); - REISERFS_I(inode)->i_first_direct_byte = - sd_v1_first_direct_byte(sd); - - /* - * an early bug in the quota code can give us an odd - * number for the block count. This is incorrect, fix it here. - */ - if (inode->i_blocks & 1) { - inode->i_blocks++; - } - inode_set_bytes(inode, - to_real_used_space(inode, inode->i_blocks, - SD_V1_SIZE)); - /* - * nopack is initially zero for v1 objects. 
For v2 objects, - * nopack is initialised from sd_attrs - */ - REISERFS_I(inode)->i_flags &= ~i_nopack_mask; - } else { - /* - * new stat data found, but object may have old items - * (directories and symlinks) - */ - struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih); - - inode->i_mode = sd_v2_mode(sd); - set_nlink(inode, sd_v2_nlink(sd)); - i_uid_write(inode, sd_v2_uid(sd)); - inode->i_size = sd_v2_size(sd); - i_gid_write(inode, sd_v2_gid(sd)); - inode_set_mtime(inode, sd_v2_mtime(sd), 0); - inode_set_atime(inode, sd_v2_atime(sd), 0); - inode_set_ctime(inode, sd_v2_ctime(sd), 0); - inode->i_blocks = sd_v2_blocks(sd); - rdev = sd_v2_rdev(sd); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - inode->i_generation = - le32_to_cpu(INODE_PKEY(inode)->k_dir_id); - else - inode->i_generation = sd_v2_generation(sd); - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - set_inode_item_key_version(inode, KEY_FORMAT_3_5); - else - set_inode_item_key_version(inode, KEY_FORMAT_3_6); - REISERFS_I(inode)->i_first_direct_byte = 0; - set_inode_sd_version(inode, STAT_DATA_V2); - inode_set_bytes(inode, - to_real_used_space(inode, inode->i_blocks, - SD_V2_SIZE)); - /* - * read persistent inode attributes from sd and initialise - * generic inode flags from them - */ - REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); - sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); - } - - pathrelse(path); - if (S_ISREG(inode->i_mode)) { - inode->i_op = &reiserfs_file_inode_operations; - inode->i_fop = &reiserfs_file_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &reiserfs_dir_inode_operations; - inode->i_fop = &reiserfs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &reiserfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - } else { - inode->i_blocks = 0; - inode->i_op = &reiserfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); - } -} - -/* update new stat data with inode fields */ -static void inode2sd(void *sd, struct inode *inode, loff_t size) -{ - struct stat_data *sd_v2 = (struct stat_data *)sd; - - set_sd_v2_mode(sd_v2, inode->i_mode); - set_sd_v2_nlink(sd_v2, inode->i_nlink); - set_sd_v2_uid(sd_v2, i_uid_read(inode)); - set_sd_v2_size(sd_v2, size); - set_sd_v2_gid(sd_v2, i_gid_read(inode)); - set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode)); - set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode)); - set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode)); - set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); - else - set_sd_v2_generation(sd_v2, inode->i_generation); - set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs); -} - -/* used to copy inode's fields to old stat data */ -static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) -{ - struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; - - set_sd_v1_mode(sd_v1, inode->i_mode); - set_sd_v1_uid(sd_v1, i_uid_read(inode)); - set_sd_v1_gid(sd_v1, i_gid_read(inode)); - set_sd_v1_nlink(sd_v1, inode->i_nlink); - set_sd_v1_size(sd_v1, size); - set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode)); - set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode)); - set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode)); - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - set_sd_v1_rdev(sd_v1, 
new_encode_dev(inode->i_rdev)); - else - set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); - - /* Sigh. i_first_direct_byte is back */ - set_sd_v1_first_direct_byte(sd_v1, - REISERFS_I(inode)->i_first_direct_byte); -} - -/* - * NOTE, you must prepare the buffer head before sending it here, - * and then log it after the call - */ -static void update_stat_data(struct treepath *path, struct inode *inode, - loff_t size) -{ - struct buffer_head *bh; - struct item_head *ih; - - bh = PATH_PLAST_BUFFER(path); - ih = tp_item_head(path); - - if (!is_statdata_le_ih(ih)) - reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", - INODE_PKEY(inode), ih); - - /* path points to old stat data */ - if (stat_data_v1(ih)) { - inode2sd_v1(ih_item_body(bh, ih), inode, size); - } else { - inode2sd(ih_item_body(bh, ih), inode, size); - } - - return; -} - -void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, - struct inode *inode, loff_t size) -{ - struct cpu_key key; - INITIALIZE_PATH(path); - struct buffer_head *bh; - int fs_gen; - struct item_head *ih, tmp_ih; - int retval; - - BUG_ON(!th->t_trans_id); - - /* key type is unimportant */ - make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); - - for (;;) { - int pos; - /* look for the object's stat data */ - retval = search_item(inode->i_sb, &key, &path); - if (retval == IO_ERROR) { - reiserfs_error(inode->i_sb, "vs-13050", - "i/o failure occurred trying to " - "update %K stat data", &key); - return; - } - if (retval == ITEM_NOT_FOUND) { - pos = PATH_LAST_POSITION(&path); - pathrelse(&path); - if (inode->i_nlink == 0) { - /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ - return; - } - reiserfs_warning(inode->i_sb, "vs-13060", - "stat data of object %k (nlink == %d) " - "not found (pos %d)", - INODE_PKEY(inode), inode->i_nlink, - pos); - reiserfs_check_path(&path); - return; - } - - /* - * sigh, prepare_for_journal might schedule. When it - * schedules the FS might change. We have to detect that, - * and loop back to the search if the stat data item has moved - */ - bh = get_last_bh(&path); - ih = tp_item_head(&path); - copy_item_head(&tmp_ih, ih); - fs_gen = get_generation(inode->i_sb); - reiserfs_prepare_for_journal(inode->i_sb, bh, 1); - - /* Stat_data item has been moved after scheduling. */ - if (fs_changed(fs_gen, inode->i_sb) - && item_moved(&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, bh); - continue; - } - break; - } - update_stat_data(&path, inode, size); - journal_mark_dirty(th, bh); - pathrelse(&path); - return; -} - -/* - * reiserfs_read_locked_inode is called to read the inode off disk, and it - * does a make_bad_inode when things go wrong. But, we need to make sure - * and clear the key in the private portion of the inode, otherwise a - * corresponding iput might try to delete whatever object the inode last - * represented. 
- */ -static void reiserfs_make_bad_inode(struct inode *inode) -{ - memset(INODE_PKEY(inode), 0, KEY_SIZE); - make_bad_inode(inode); -} - -/* - * initially this function was derived from minix or ext2's analog and - * evolved as the prototype did - */ -int reiserfs_init_locked_inode(struct inode *inode, void *p) -{ - struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; - inode->i_ino = args->objectid; - INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); - return 0; -} - -/* - * looks for stat data in the tree, and fills up the fields of in-core - * inode stat data fields - */ -void reiserfs_read_locked_inode(struct inode *inode, - struct reiserfs_iget_args *args) -{ - INITIALIZE_PATH(path_to_sd); - struct cpu_key key; - unsigned long dirino; - int retval; - - dirino = args->dirid; - - /* - * set version 1, version 2 could be used too, because stat data - * key is the same in both versions - */ - _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3); - - /* look for the object's stat data */ - retval = search_item(inode->i_sb, &key, &path_to_sd); - if (retval == IO_ERROR) { - reiserfs_error(inode->i_sb, "vs-13070", - "i/o failure occurred trying to find " - "stat data of %K", &key); - reiserfs_make_bad_inode(inode); - return; - } - - /* a stale NFS handle can trigger this without it being an error */ - if (retval != ITEM_FOUND) { - pathrelse(&path_to_sd); - reiserfs_make_bad_inode(inode); - clear_nlink(inode); - return; - } - - init_inode(inode, &path_to_sd); - - /* - * It is possible that knfsd is trying to access inode of a file - * that is being removed from the disk by some other thread. As we - * update sd on unlink all that is required is to check for nlink - * here. This bug was first found by Sizif when debugging - * SquidNG/Butterfly, forgotten, and found again after Philippe - * Gramoulle <philippe.gramoulle@mmania.com> reproduced it. - - * More logical fix would require changes in fs/inode.c:iput() to - * remove inode from hash-table _after_ fs cleaned disk stuff up and - * in iget() to return NULL if I_FREEING inode is found in - * hash-table. - */ - - /* - * Currently there is one place where it's ok to meet inode with - * nlink==0: processing of open-unlinked and half-truncated files - * during mount (fs/reiserfs/super.c:finish_unfinished()). - */ - if ((inode->i_nlink == 0) && - !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { - reiserfs_warning(inode->i_sb, "vs-13075", - "dead inode read from disk %K. " - "This is likely to be race with knfsd. Ignore", - &key); - reiserfs_make_bad_inode(inode); - } - - /* init inode should be relsing */ - reiserfs_check_path(&path_to_sd); - - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - cache_no_acl(inode); -} - -/* - * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). - * - * @inode: inode from hash table to check - * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. - * - * This function is called by iget5_locked() to distinguish reiserfs inodes - * having the same inode numbers. Such inodes can only exist due to some - * error condition. One of them should be bad. Inodes with identical - * inode numbers (objectids) are distinguished by parent directory ids. 
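
The pairing described above is easy to see in isolation: the hash value handed to iget5_locked() is only the objectid, so the find actor defined below must also compare the parent directory id before two in-core inodes are treated as the same object. A minimal sketch of that comparison (the struct and field names here are illustrative, not the kernel types):

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-in for the (dirid, objectid) pair a reiserfs key carries */
struct obj_id {
	uint32_t dirid;
	uint32_t objectid;
};

/* the same test reiserfs_find_actor() performs: objectid alone is not enough */
static int same_object(const struct obj_id *a, const struct obj_id *b)
{
	return a->objectid == b->objectid && a->dirid == b->dirid;
}

int main(void)
{
	struct obj_id file_a = { .dirid = 2, .objectid = 100 };
	struct obj_id file_b = { .dirid = 7, .objectid = 100 };	/* objectid collides */

	printf("same inode number: %d, same object: %d\n",
	       file_a.objectid == file_b.objectid,
	       same_object(&file_a, &file_b));
	return 0;
}
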
- * - */ -int reiserfs_find_actor(struct inode *inode, void *opaque) -{ - struct reiserfs_iget_args *args; - - args = opaque; - /* args is already in CPU order */ - return (inode->i_ino == args->objectid) && - (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); -} - -struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) -{ - struct inode *inode; - struct reiserfs_iget_args args; - int depth; - - args.objectid = key->on_disk_key.k_objectid; - args.dirid = key->on_disk_key.k_dir_id; - depth = reiserfs_write_unlock_nested(s); - inode = iget5_locked(s, key->on_disk_key.k_objectid, - reiserfs_find_actor, reiserfs_init_locked_inode, - (void *)(&args)); - reiserfs_write_lock_nested(s, depth); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (inode->i_state & I_NEW) { - reiserfs_read_locked_inode(inode, &args); - unlock_new_inode(inode); - } - - if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { - /* either due to i/o error or a stale NFS handle */ - iput(inode); - inode = NULL; - } - return inode; -} - -static struct dentry *reiserfs_get_dentry(struct super_block *sb, - u32 objectid, u32 dir_id, u32 generation) - -{ - struct cpu_key key; - struct inode *inode; - - key.on_disk_key.k_objectid = objectid; - key.on_disk_key.k_dir_id = dir_id; - reiserfs_write_lock(sb); - inode = reiserfs_iget(sb, &key); - if (inode && !IS_ERR(inode) && generation != 0 && - generation != inode->i_generation) { - iput(inode); - inode = NULL; - } - reiserfs_write_unlock(sb); - - return d_obtain_alias(inode); -} - -struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - /* - * fhtype happens to reflect the number of u32s encoded. - * due to a bug in earlier code, fhtype might indicate there - * are more u32s then actually fitted. - * so if fhtype seems to be more than len, reduce fhtype. - * Valid types are: - * 2 - objectid + dir_id - legacy support - * 3 - objectid + dir_id + generation - * 4 - objectid + dir_id + objectid and dirid of parent - legacy - * 5 - objectid + dir_id + generation + objectid and dirid of parent - * 6 - as above plus generation of directory - * 6 does not fit in NFSv2 handles - */ - if (fh_type > fh_len) { - if (fh_type != 6 || fh_len != 5) - reiserfs_warning(sb, "reiserfs-13077", - "nfsd/reiserfs, fhtype=%d, len=%d - odd", - fh_type, fh_len); - fh_type = fh_len; - } - if (fh_len < 2) - return NULL; - - return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1], - (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0); -} - -struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - if (fh_type > fh_len) - fh_type = fh_len; - if (fh_type < 4) - return NULL; - - return reiserfs_get_dentry(sb, - (fh_type >= 5) ? fid->raw[3] : fid->raw[2], - (fh_type >= 5) ? fid->raw[4] : fid->raw[3], - (fh_type == 6) ? 
fid->raw[5] : 0); -} - -int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, - struct inode *parent) -{ - int maxlen = *lenp; - - if (parent && (maxlen < 5)) { - *lenp = 5; - return FILEID_INVALID; - } else if (maxlen < 3) { - *lenp = 3; - return FILEID_INVALID; - } - - data[0] = inode->i_ino; - data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); - data[2] = inode->i_generation; - *lenp = 3; - if (parent) { - data[3] = parent->i_ino; - data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id); - *lenp = 5; - if (maxlen >= 6) { - data[5] = parent->i_generation; - *lenp = 6; - } - } - return *lenp; -} - -/* - * looks for stat data, then copies fields to it, marks the buffer - * containing stat data as dirty - */ -/* - * reiserfs inodes are never really dirty, since the dirty inode call - * always logs them. This call allows the VFS inode marking routines - * to properly mark inodes for datasync and such, but only actually - * does something when called for a synchronous update. - */ -int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - struct reiserfs_transaction_handle th; - int jbegin_count = 1; - - if (sb_rdonly(inode->i_sb)) - return -EROFS; - /* - * memory pressure can sometimes initiate write_inode calls with - * sync == 1, - * these cases are just when the system needs ram, not when the - * inode needs to reach disk for safety, and they can safely be - * ignored because the altered inode has already been logged. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { - reiserfs_write_lock(inode->i_sb); - if (!journal_begin(&th, inode->i_sb, jbegin_count)) { - reiserfs_update_sd(&th, inode); - journal_end_sync(&th); - } - reiserfs_write_unlock(inode->i_sb); - } - return 0; -} - -/* - * stat data of new object is inserted already, this inserts the item - * containing "." and ".." entries - */ -static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct item_head *ih, struct treepath *path, - struct inode *dir) -{ - struct super_block *sb = th->t_super; - char empty_dir[EMPTY_DIR_SIZE]; - char *body = empty_dir; - struct cpu_key key; - int retval; - - BUG_ON(!th->t_trans_id); - - _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), - le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, - TYPE_DIRENTRY, 3 /*key length */ ); - - /* - * compose item head for new item. Directories consist of items of - * old type (ITEM_VERSION_1). 
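
Stepping back to the NFS file-handle helpers just above: the slot layout written by reiserfs_encode_fh() and read back by reiserfs_fh_to_dentry()/reiserfs_fh_to_parent() can be summarised in a few lines. The sketch below packs the 3/5/6-word variants into a plain array; the function name and error convention are illustrative, only the slot assignments come from the code above:

#include <stdio.h>
#include <stdint.h>

/* layout produced by reiserfs_encode_fh():
 *   [0] objectid        [1] dir_id            [2] generation
 *   [3] parent objectid [4] parent dir_id     [5] parent generation
 * fh_type is simply the number of valid 32-bit words (3, 5 or 6). */
static int fh_pack(uint32_t *raw, int maxlen,
		   uint32_t objectid, uint32_t dirid, uint32_t gen,
		   const uint32_t *parent /* objectid, dirid, gen, or NULL */)
{
	if (maxlen < (parent ? 5 : 3))
		return -1;		/* caller's buffer is too small */
	raw[0] = objectid;
	raw[1] = dirid;
	raw[2] = gen;
	if (!parent)
		return 3;
	raw[3] = parent[0];
	raw[4] = parent[1];
	if (maxlen >= 6) {
		raw[5] = parent[2];
		return 6;
	}
	return 5;
}

int main(void)
{
	uint32_t raw[6];
	uint32_t parent[3] = { 2, 1, 9 };
	int type = fh_pack(raw, 6, 100, 2, 7, parent);

	/* decoding mirrors the fh_to_* helpers: the child is raw[0..2],
	 * the parent starts at raw[3] for the 5- and 6-word variants */
	printf("fh_type %d: child objectid %u in dir %u, gen %u; parent objectid %u in dir %u\n",
	       type, raw[0], raw[1], raw[2], raw[3], raw[4]);
	return 0;
}
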
Do not set key (second arg is 0), it - * is done by reiserfs_new_inode - */ - if (old_format_only(sb)) { - make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, - TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); - - make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, - ih->ih_key.k_objectid, - INODE_PKEY(dir)->k_dir_id, - INODE_PKEY(dir)->k_objectid); - } else { - make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, - TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); - - make_empty_dir_item(body, ih->ih_key.k_dir_id, - ih->ih_key.k_objectid, - INODE_PKEY(dir)->k_dir_id, - INODE_PKEY(dir)->k_objectid); - } - - /* look for place in the tree for new item */ - retval = search_item(sb, &key, path); - if (retval == IO_ERROR) { - reiserfs_error(sb, "vs-13080", - "i/o failure occurred creating new directory"); - return -EIO; - } - if (retval == ITEM_FOUND) { - pathrelse(path); - reiserfs_warning(sb, "vs-13070", - "object with this key exists (%k)", - &(ih->ih_key)); - return -EEXIST; - } - - /* insert item, that is empty directory item */ - return reiserfs_insert_item(th, path, &key, ih, inode, body); -} - -/* - * stat data of object has been inserted, this inserts the item - * containing the body of symlink - */ -static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct item_head *ih, - struct treepath *path, const char *symname, - int item_len) -{ - struct super_block *sb = th->t_super; - struct cpu_key key; - int retval; - - BUG_ON(!th->t_trans_id); - - _make_cpu_key(&key, KEY_FORMAT_3_5, - le32_to_cpu(ih->ih_key.k_dir_id), - le32_to_cpu(ih->ih_key.k_objectid), - 1, TYPE_DIRECT, 3 /*key length */ ); - - make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, - 0 /*free_space */ ); - - /* look for place in the tree for new item */ - retval = search_item(sb, &key, path); - if (retval == IO_ERROR) { - reiserfs_error(sb, "vs-13080", - "i/o failure occurred creating new symlink"); - return -EIO; - } - if (retval == ITEM_FOUND) { - pathrelse(path); - reiserfs_warning(sb, "vs-13080", - "object with this key exists (%k)", - &(ih->ih_key)); - return -EEXIST; - } - - /* insert item, that is body of symlink */ - return reiserfs_insert_item(th, path, &key, ih, inode, symname); -} - -/* - * inserts the stat data into the tree, and then calls - * reiserfs_new_directory (to insert ".", ".." item if new object is - * directory) or reiserfs_new_symlink (to insert symlink body if new - * object is symlink) or nothing (if new object is regular file) - - * NOTE! uid and gid must already be set in the inode. If we return - * non-zero due to an error, we have to drop the quota previously allocated - * for the fresh inode. This can only be done outside a transaction, so - * if we return non-zero, we also end the transaction. 
- * - * @th: active transaction handle - * @dir: parent directory for new inode - * @mode: mode of new inode - * @symname: symlink contents if inode is symlink - * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for - * symlinks - * @inode: inode to be filled - * @security: optional security context to associate with this inode - */ -int reiserfs_new_inode(struct reiserfs_transaction_handle *th, - struct inode *dir, umode_t mode, const char *symname, - /* 0 for regular, EMTRY_DIR_SIZE for dirs, - strlen (symname) for symlinks) */ - loff_t i_size, struct dentry *dentry, - struct inode *inode, - struct reiserfs_security_handle *security) -{ - struct super_block *sb = dir->i_sb; - struct reiserfs_iget_args args; - INITIALIZE_PATH(path_to_key); - struct cpu_key key; - struct item_head ih; - struct stat_data sd; - int retval; - int err; - int depth; - - BUG_ON(!th->t_trans_id); - - depth = reiserfs_write_unlock_nested(sb); - err = dquot_alloc_inode(inode); - reiserfs_write_lock_nested(sb, depth); - if (err) - goto out_end_trans; - if (!dir->i_nlink) { - err = -EPERM; - goto out_bad_inode; - } - - /* item head of new item */ - ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); - ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); - if (!ih.ih_key.k_objectid) { - err = -ENOMEM; - goto out_bad_inode; - } - args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); - if (old_format_only(sb)) - make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, - TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); - else - make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, - TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE); - args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - - depth = reiserfs_write_unlock_nested(inode->i_sb); - err = insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args); - reiserfs_write_lock_nested(inode->i_sb, depth); - if (err) { - err = -EINVAL; - goto out_bad_inode; - } - - if (old_format_only(sb)) - /* - * not a perfect generation count, as object ids can be reused, - * but this is as good as reiserfs can do right now. - * note that the private part of inode isn't filled in yet, - * we have to use the directory. - */ - inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); - else -#if defined( USE_INODE_GENERATION_COUNTER ) - inode->i_generation = - le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); -#else - inode->i_generation = ++event; -#endif - - /* fill stat data */ - set_nlink(inode, (S_ISDIR(mode) ? 2 : 1)); - - /* uid and gid must already be set by the caller for quota init */ - - simple_inode_init_ts(inode); - inode->i_size = i_size; - inode->i_blocks = 0; - inode->i_bytes = 0; - REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : - U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; - - INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); - REISERFS_I(inode)->i_flags = 0; - REISERFS_I(inode)->i_prealloc_block = 0; - REISERFS_I(inode)->i_prealloc_count = 0; - REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_jl = NULL; - REISERFS_I(inode)->i_attrs = - REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; - sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); - reiserfs_init_xattr_rwsem(inode); - - /* key to search for correct place for new stat data */ - _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), - le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, - TYPE_STAT_DATA, 3 /*key length */ ); - - /* find proper place for inserting of stat data */ - retval = search_item(sb, &key, &path_to_key); - if (retval == IO_ERROR) { - err = -EIO; - goto out_bad_inode; - } - if (retval == ITEM_FOUND) { - pathrelse(&path_to_key); - err = -EEXIST; - goto out_bad_inode; - } - if (old_format_only(sb)) { - /* i_uid or i_gid is too big to be stored in stat data v3.5 */ - if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { - pathrelse(&path_to_key); - err = -EINVAL; - goto out_bad_inode; - } - inode2sd_v1(&sd, inode, inode->i_size); - } else { - inode2sd(&sd, inode, inode->i_size); - } - /* - * store in in-core inode the key of stat data and version all - * object items will have (directory items will have old offset - * format, other new objects will consist of new items) - */ - if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) - set_inode_item_key_version(inode, KEY_FORMAT_3_5); - else - set_inode_item_key_version(inode, KEY_FORMAT_3_6); - if (old_format_only(sb)) - set_inode_sd_version(inode, STAT_DATA_V1); - else - set_inode_sd_version(inode, STAT_DATA_V2); - - /* insert the stat data into the tree */ -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (REISERFS_I(dir)->new_packing_locality) - th->displace_new_blocks = 1; -#endif - retval = - reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, - (char *)(&sd)); - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key); - goto out_bad_inode; - } -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (!th->displace_new_blocks) - REISERFS_I(dir)->new_packing_locality = 0; -#endif - if (S_ISDIR(mode)) { - /* insert item with "." and ".." */ - retval = - reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); - } - - if (S_ISLNK(mode)) { - /* insert body of symlink */ - if (!old_format_only(sb)) - i_size = ROUND_UP(i_size); - retval = - reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, - i_size); - } - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key); - journal_end(th); - goto out_inserted_sd; - } - - /* - * Mark it private if we're creating the privroot - * or something under it. 
- */ - if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) - reiserfs_init_priv_inode(inode); - - if (reiserfs_posixacl(inode->i_sb)) { - reiserfs_write_unlock(inode->i_sb); - retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); - reiserfs_write_lock(inode->i_sb); - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key); - journal_end(th); - goto out_inserted_sd; - } - } else if (inode->i_sb->s_flags & SB_POSIXACL) { - reiserfs_warning(inode->i_sb, "jdm-13090", - "ACLs aren't enabled in the fs, " - "but vfs thinks they are!"); - } - - if (security->name) { - reiserfs_write_unlock(inode->i_sb); - retval = reiserfs_security_write(th, inode, security); - reiserfs_write_lock(inode->i_sb); - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key); - retval = journal_end(th); - if (retval) - err = retval; - goto out_inserted_sd; - } - } - - reiserfs_update_sd(th, inode); - reiserfs_check_path(&path_to_key); - - return 0; - -out_bad_inode: - /* Invalidate the object, nothing was inserted yet */ - INODE_PKEY(inode)->k_objectid = 0; - - /* Quota change must be inside a transaction for journaling */ - depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_free_inode(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); - -out_end_trans: - journal_end(th); - /* - * Drop can be outside and it needs more credits so it's better - * to have it outside - */ - depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_drop(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); - inode->i_flags |= S_NOQUOTA; - make_bad_inode(inode); - -out_inserted_sd: - clear_nlink(inode); - th->t_trans_id = 0; /* so the caller can't use this handle later */ - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - iput(inode); - return err; -} - -/* - * finds the tail page in the page cache, - * reads the last block in. - * - * On success, page_result is set to a locked, pinned page, and bh_result - * is set to an up to date buffer for the last block in the file. returns 0. - * - * tail conversion is not done, so bh_result might not be valid for writing - * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before - * trying to write the block. - * - * on failure, nonzero is returned, page_result and bh_result are untouched. - */ -static int grab_tail_page(struct inode *inode, - struct page **page_result, - struct buffer_head **bh_result) -{ - - /* - * we want the page with the last byte in the file, - * not the page that will hold the next byte for appending - */ - unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT; - unsigned long pos = 0; - unsigned long start = 0; - unsigned long blocksize = inode->i_sb->s_blocksize; - unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1); - struct buffer_head *bh; - struct buffer_head *head; - struct folio *folio; - int error; - - /* - * we know that we are only called with inode->i_size > 0. - * we also know that a file tail can never be as big as a block - * If i_size % blocksize == 0, our file is currently block aligned - * and it won't need converting or zeroing after a truncate. 
- */ - if ((offset & (blocksize - 1)) == 0) { - return -ENOENT; - } - folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, - mapping_gfp_mask(inode->i_mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); - /* start within the page of the last block in the file */ - start = (offset / blocksize) * blocksize; - - error = __block_write_begin(folio, start, offset - start, - reiserfs_get_block_create_0); - if (error) - goto unlock; - - head = folio_buffers(folio); - bh = head; - do { - if (pos >= start) { - break; - } - bh = bh->b_this_page; - pos += blocksize; - } while (bh != head); - - if (!buffer_uptodate(bh)) { - /* - * note, this should never happen, prepare_write should be - * taking care of this for us. If the buffer isn't up to - * date, I've screwed up the code to find the buffer, or the - * code to call prepare_write - */ - reiserfs_error(inode->i_sb, "clm-6000", - "error reading block %lu", bh->b_blocknr); - error = -EIO; - goto unlock; - } - *bh_result = bh; - *page_result = &folio->page; - - return error; - -unlock: - folio_unlock(folio); - folio_put(folio); - return error; -} - -/* - * vfs version of truncate file. Must NOT be called with - * a transaction already started. - * - * some code taken from block_truncate_page - */ -int reiserfs_truncate_file(struct inode *inode, int update_timestamps) -{ - struct reiserfs_transaction_handle th; - /* we want the offset for the first byte after the end of the file */ - unsigned long offset = inode->i_size & (PAGE_SIZE - 1); - unsigned blocksize = inode->i_sb->s_blocksize; - unsigned length; - struct page *page = NULL; - int error; - struct buffer_head *bh = NULL; - int err2; - - reiserfs_write_lock(inode->i_sb); - - if (inode->i_size > 0) { - error = grab_tail_page(inode, &page, &bh); - if (error) { - /* - * -ENOENT means we truncated past the end of the - * file, and get_block_create_0 could not find a - * block to read in, which is ok. - */ - if (error != -ENOENT) - reiserfs_error(inode->i_sb, "clm-6001", - "grab_tail_page failed %d", - error); - page = NULL; - bh = NULL; - } - } - - /* - * so, if page != NULL, we have a buffer head for the offset at - * the end of the file. if the bh is mapped, and bh->b_blocknr != 0, - * then we have an unformatted node. Otherwise, we have a direct item, - * and no zeroing is required on disk. We zero after the truncate, - * because the truncate might pack the item anyway - * (it will unmap bh if it packs). - * - * it is enough to reserve space in transaction for 2 balancings: - * one for "save" link adding and another for the first - * cut_from_item. 
1 is for update_sd - */ - error = journal_begin(&th, inode->i_sb, - JOURNAL_PER_BALANCE_CNT * 2 + 1); - if (error) - goto out; - reiserfs_update_inode_transaction(inode); - if (update_timestamps) - /* - * we are doing real truncate: if the system crashes - * before the last transaction of truncating gets committed - * - on reboot the file either appears truncated properly - * or not truncated at all - */ - add_save_link(&th, inode, 1); - err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); - error = journal_end(&th); - if (error) - goto out; - - /* check reiserfs_do_truncate after ending the transaction */ - if (err2) { - error = err2; - goto out; - } - - if (update_timestamps) { - error = remove_save_link(inode, 1 /* truncate */); - if (error) - goto out; - } - - if (page) { - length = offset & (blocksize - 1); - /* if we are not on a block boundary */ - if (length) { - length = blocksize - length; - zero_user(page, offset, length); - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - mark_buffer_dirty(bh); - } - } - unlock_page(page); - put_page(page); - } - - reiserfs_write_unlock(inode->i_sb); - - return 0; -out: - if (page) { - unlock_page(page); - put_page(page); - } - - reiserfs_write_unlock(inode->i_sb); - - return error; -} - -static int map_block_for_writepage(struct inode *inode, - struct buffer_head *bh_result, - unsigned long block) -{ - struct reiserfs_transaction_handle th; - int fs_gen; - struct item_head tmp_ih; - struct item_head *ih; - struct buffer_head *bh; - __le32 *item; - struct cpu_key key; - INITIALIZE_PATH(path); - int pos_in_item; - int jbegin_count = JOURNAL_PER_BALANCE_CNT; - loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1; - int retval; - int use_get_block = 0; - int bytes_copied = 0; - int copy_size; - int trans_running = 0; - - /* - * catch places below that try to log something without - * starting a trans - */ - th.t_trans_id = 0; - - if (!buffer_uptodate(bh_result)) { - return -EIO; - } - - kmap(bh_result->b_page); -start_over: - reiserfs_write_lock(inode->i_sb); - make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); - -research: - retval = search_for_position_by_key(inode->i_sb, &key, &path); - if (retval != POSITION_FOUND) { - use_get_block = 1; - goto out; - } - - bh = get_last_bh(&path); - ih = tp_item_head(&path); - item = tp_item_body(&path); - pos_in_item = path.pos_in_item; - - /* we've found an unformatted node */ - if (indirect_item_found(retval, ih)) { - if (bytes_copied > 0) { - reiserfs_warning(inode->i_sb, "clm-6002", - "bytes_copied %d", bytes_copied); - } - if (!get_block_num(item, pos_in_item)) { - /* crap, we are writing to a hole */ - use_get_block = 1; - goto out; - } - set_block_dev_mapped(bh_result, - get_block_num(item, pos_in_item), inode); - } else if (is_direct_le_ih(ih)) { - char *p; - p = page_address(bh_result->b_page); - p += (byte_offset - 1) & (PAGE_SIZE - 1); - copy_size = ih_item_len(ih) - pos_in_item; - - fs_gen = get_generation(inode->i_sb); - copy_item_head(&tmp_ih, ih); - - if (!trans_running) { - /* vs-3050 is gone, no need to drop the path */ - retval = journal_begin(&th, inode->i_sb, jbegin_count); - if (retval) - goto out; - reiserfs_update_inode_transaction(inode); - trans_running = 1; - if (fs_changed(fs_gen, inode->i_sb) - && item_moved(&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, - bh); - goto research; - } - } - - reiserfs_prepare_for_journal(inode->i_sb, bh, 1); - - if (fs_changed(fs_gen, inode->i_sb) - && item_moved(&tmp_ih, &path)) { - 
reiserfs_restore_prepared_buffer(inode->i_sb, bh); - goto research; - } - - memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied, - copy_size); - - journal_mark_dirty(&th, bh); - bytes_copied += copy_size; - set_block_dev_mapped(bh_result, 0, inode); - - /* are there still bytes left? */ - if (bytes_copied < bh_result->b_size && - (byte_offset + bytes_copied) < inode->i_size) { - set_cpu_key_k_offset(&key, - cpu_key_k_offset(&key) + - copy_size); - goto research; - } - } else { - reiserfs_warning(inode->i_sb, "clm-6003", - "bad item inode %lu", inode->i_ino); - retval = -EIO; - goto out; - } - retval = 0; - -out: - pathrelse(&path); - if (trans_running) { - int err = journal_end(&th); - if (err) - retval = err; - trans_running = 0; - } - reiserfs_write_unlock(inode->i_sb); - - /* this is where we fill in holes in the file. */ - if (use_get_block) { - retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX - | GET_BLOCK_NO_DANGLE); - if (!retval) { - if (!buffer_mapped(bh_result) - || bh_result->b_blocknr == 0) { - /* get_block failed to find a mapped unformatted node. */ - use_get_block = 0; - goto start_over; - } - } - } - kunmap(bh_result->b_page); - - if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* - * we've copied data from the page into the direct item, so the - * buffer in the page is now clean, mark it to reflect that. - */ - lock_buffer(bh_result); - clear_buffer_dirty(bh_result); - unlock_buffer(bh_result); - } - return retval; -} - -/* - * mason@suse.com: updated in 2.5.54 to follow the same general io - * start/recovery path as __block_write_full_folio, along with special - * code to handle reiserfs tails. - */ -static int reiserfs_write_folio(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - struct inode *inode = folio->mapping->host; - unsigned long end_index = inode->i_size >> PAGE_SHIFT; - int error = 0; - unsigned long block; - sector_t last_block; - struct buffer_head *head, *bh; - int partial = 0; - int nr = 0; - int checked = folio_test_checked(folio); - struct reiserfs_transaction_handle th; - struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_SIZE / s->s_blocksize; - th.t_trans_id = 0; - - /* no logging allowed when nonblocking or from PF_MEMALLOC */ - if (checked && (current->flags & PF_MEMALLOC)) { - folio_redirty_for_writepage(wbc, folio); - folio_unlock(folio); - return 0; - } - - /* - * The folio dirty bit is cleared before writepage is called, which - * means we have to tell create_empty_buffers to make dirty buffers - * The folio really should be up to date at this point, so tossing - * in the BH_Uptodate is just a sanity check. 
- */ - head = folio_buffers(folio); - if (!head) - head = create_empty_buffers(folio, s->s_blocksize, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - - /* - * last folio in the file, zero out any contents past the - * last byte in the file - */ - if (folio->index >= end_index) { - unsigned last_offset; - - last_offset = inode->i_size & (PAGE_SIZE - 1); - /* no file contents in this folio */ - if (folio->index >= end_index + 1 || !last_offset) { - folio_unlock(folio); - return 0; - } - folio_zero_segment(folio, last_offset, folio_size(folio)); - } - bh = head; - block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits); - last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; - /* first map all the buffers, logging any direct items we find */ - do { - if (block > last_block) { - /* - * This can happen when the block size is less than - * the folio size. The corresponding bytes in the folio - * were zero filled above - */ - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - } else if ((checked || buffer_dirty(bh)) && - (!buffer_mapped(bh) || bh->b_blocknr == 0)) { - /* - * not mapped yet, or it points to a direct item, search - * the btree for the mapping info, and log any direct - * items found - */ - if ((error = map_block_for_writepage(inode, bh, block))) { - goto fail; - } - } - bh = bh->b_this_page; - block++; - } while (bh != head); - - /* - * we start the transaction after map_block_for_writepage, - * because it can create holes in the file (an unbounded operation). - * starting it here, we can make a reliable estimate for how many - * blocks we're going to log - */ - if (checked) { - folio_clear_checked(folio); - reiserfs_write_lock(s); - error = journal_begin(&th, s, bh_per_page + 1); - if (error) { - reiserfs_write_unlock(s); - goto fail; - } - reiserfs_update_inode_transaction(inode); - } - /* now go through and lock any dirty buffers on the folio */ - do { - get_bh(bh); - if (!buffer_mapped(bh)) - continue; - if (buffer_mapped(bh) && bh->b_blocknr == 0) - continue; - - if (checked) { - reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, bh); - continue; - } - /* - * from this point on, we know the buffer is mapped to a - * real block and not a direct item - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - lock_buffer(bh); - } else { - if (!trylock_buffer(bh)) { - folio_redirty_for_writepage(wbc, folio); - continue; - } - } - if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } - } while ((bh = bh->b_this_page) != head); - - if (checked) { - error = journal_end(&th); - reiserfs_write_unlock(s); - if (error) - goto fail; - } - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - - /* - * since any buffer might be the only dirty buffer on the folio, - * the first submit_bh can bring the folio out of writeback. - * be careful with the buffers. - */ - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(REQ_OP_WRITE, bh); - nr++; - } - put_bh(bh); - bh = next; - } while (bh != head); - - error = 0; -done: - if (nr == 0) { - /* - * if this folio only had a direct item, it is very possible for - * no io to be required without there being an error. 
Or, - * someone else could have locked them and sent them down the - * pipe without locking the folio - */ - bh = head; - do { - if (!buffer_uptodate(bh)) { - partial = 1; - break; - } - bh = bh->b_this_page; - } while (bh != head); - if (!partial) - folio_mark_uptodate(folio); - folio_end_writeback(folio); - } - return error; - -fail: - /* - * catches various errors, we need to make sure any valid dirty blocks - * get to the media. The folio is currently locked and not marked for - * writeback - */ - folio_clear_uptodate(folio); - bh = head; - do { - get_bh(bh); - if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { - lock_buffer(bh); - mark_buffer_async_write(bh); - } else { - /* - * clear any dirty bits that might have come from - * getting attached to a dirty folio - */ - clear_buffer_dirty(bh); - } - bh = bh->b_this_page; - } while (bh != head); - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - folio_unlock(folio); - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - clear_buffer_dirty(bh); - submit_bh(REQ_OP_WRITE, bh); - nr++; - } - put_bh(bh); - bh = next; - } while (bh != head); - goto done; -} - -static int reiserfs_read_folio(struct file *f, struct folio *folio) -{ - return block_read_full_folio(folio, reiserfs_get_block); -} - -static int reiserfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - reiserfs_wait_on_write_block(mapping->host->i_sb); - return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL); -} - -static void reiserfs_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - reiserfs_truncate_file(inode, 0); -} - -static int reiserfs_write_begin(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - struct inode *inode; - struct folio *folio; - pgoff_t index; - int ret; - int old_ref = 0; - - inode = mapping->host; - index = pos >> PAGE_SHIFT; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); - *foliop = folio; - - reiserfs_wait_on_write_block(inode->i_sb); - fix_tail_page_for_writing(&folio->page); - if (reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th; - th = (struct reiserfs_transaction_handle *)current-> - journal_info; - BUG_ON(!th->t_refcount); - BUG_ON(!th->t_trans_id); - old_ref = th->t_refcount; - th->t_refcount++; - } - ret = __block_write_begin(folio, pos, len, reiserfs_get_block); - if (ret && reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th = current->journal_info; - /* - * this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close - * it, and we've got to free handle if it was a persistent - * transaction. - * - * But, if we had nested into an existing transaction, we need - * to just drop the ref count on the handle. - * - * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested - * above. 
- */ - if (th->t_refcount > old_ref) { - if (old_ref) - th->t_refcount--; - else { - int err; - reiserfs_write_lock(inode->i_sb); - err = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); - if (err) - ret = err; - } - } - } - if (ret) { - folio_unlock(folio); - folio_put(folio); - /* Truncate allocated blocks */ - reiserfs_truncate_failed_write(inode); - } - return ret; -} - -int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) -{ - struct inode *inode = page->mapping->host; - int ret; - int old_ref = 0; - int depth; - - depth = reiserfs_write_unlock_nested(inode->i_sb); - reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock_nested(inode->i_sb, depth); - - fix_tail_page_for_writing(page); - if (reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th; - th = (struct reiserfs_transaction_handle *)current-> - journal_info; - BUG_ON(!th->t_refcount); - BUG_ON(!th->t_trans_id); - old_ref = th->t_refcount; - th->t_refcount++; - } - - ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block); - if (ret && reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th = current->journal_info; - /* - * this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close - * it, and we've got to free handle if it was a persistent - * transaction. - * - * But, if we had nested into an existing transaction, we need - * to just drop the ref count on the handle. - * - * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested - * above. - */ - if (th->t_refcount > old_ref) { - if (old_ref) - th->t_refcount--; - else { - int err; - reiserfs_write_lock(inode->i_sb); - err = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); - if (err) - ret = err; - } - } - } - return ret; - -} - -static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) -{ - return generic_block_bmap(as, block, reiserfs_bmap); -} - -static int reiserfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct inode *inode = folio->mapping->host; - int ret = 0; - int update_sd = 0; - struct reiserfs_transaction_handle *th; - unsigned start; - bool locked = false; - - reiserfs_wait_on_write_block(inode->i_sb); - if (reiserfs_transaction_running(inode->i_sb)) - th = current->journal_info; - else - th = NULL; - - start = pos & (PAGE_SIZE - 1); - if (unlikely(copied < len)) { - if (!folio_test_uptodate(folio)) - copied = 0; - - folio_zero_new_buffers(folio, start + copied, start + len); - } - flush_dcache_folio(folio); - - reiserfs_commit_page(inode, &folio->page, start, start + copied); - - /* - * generic_commit_write does this for us, but does not update the - * transaction tracking stuff when the size changes. So, we have - * to do the i_size updates here. 
- */ - if (pos + copied > inode->i_size) { - struct reiserfs_transaction_handle myth; - reiserfs_write_lock(inode->i_sb); - locked = true; - /* - * If the file have grown beyond the border where it - * can have a tail, unmark it as needing a tail - * packing - */ - if ((have_large_tails(inode->i_sb) - && inode->i_size > i_block_size(inode) * 4) - || (have_small_tails(inode->i_sb) - && inode->i_size > i_block_size(inode))) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - - ret = journal_begin(&myth, inode->i_sb, 1); - if (ret) - goto journal_error; - - reiserfs_update_inode_transaction(inode); - inode->i_size = pos + copied; - /* - * this will just nest into our transaction. It's important - * to use mark_inode_dirty so the inode gets pushed around on - * the dirty lists, and so that O_SYNC works as expected - */ - mark_inode_dirty(inode); - reiserfs_update_sd(&myth, inode); - update_sd = 1; - ret = journal_end(&myth); - if (ret) - goto journal_error; - } - if (th) { - if (!locked) { - reiserfs_write_lock(inode->i_sb); - locked = true; - } - if (!update_sd) - mark_inode_dirty(inode); - ret = reiserfs_end_persistent_transaction(th); - if (ret) - goto out; - } - -out: - if (locked) - reiserfs_write_unlock(inode->i_sb); - folio_unlock(folio); - folio_put(folio); - - if (pos + len > inode->i_size) - reiserfs_truncate_failed_write(inode); - - return ret == 0 ? copied : ret; - -journal_error: - reiserfs_write_unlock(inode->i_sb); - locked = false; - if (th) { - if (!update_sd) - reiserfs_update_sd(th, inode); - ret = reiserfs_end_persistent_transaction(th); - } - goto out; -} - -int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to; - int ret = 0; - int update_sd = 0; - struct reiserfs_transaction_handle *th = NULL; - int depth; - - depth = reiserfs_write_unlock_nested(inode->i_sb); - reiserfs_wait_on_write_block(inode->i_sb); - reiserfs_write_lock_nested(inode->i_sb, depth); - - if (reiserfs_transaction_running(inode->i_sb)) { - th = current->journal_info; - } - reiserfs_commit_page(inode, page, from, to); - - /* - * generic_commit_write does this for us, but does not update the - * transaction tracking stuff when the size changes. So, we have - * to do the i_size updates here. - */ - if (pos > inode->i_size) { - struct reiserfs_transaction_handle myth; - /* - * If the file have grown beyond the border where it - * can have a tail, unmark it as needing a tail - * packing - */ - if ((have_large_tails(inode->i_sb) - && inode->i_size > i_block_size(inode) * 4) - || (have_small_tails(inode->i_sb) - && inode->i_size > i_block_size(inode))) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - - ret = journal_begin(&myth, inode->i_sb, 1); - if (ret) - goto journal_error; - - reiserfs_update_inode_transaction(inode); - inode->i_size = pos; - /* - * this will just nest into our transaction. 
It's important - * to use mark_inode_dirty so the inode gets pushed around - * on the dirty lists, and so that O_SYNC works as expected - */ - mark_inode_dirty(inode); - reiserfs_update_sd(&myth, inode); - update_sd = 1; - ret = journal_end(&myth); - if (ret) - goto journal_error; - } - if (th) { - if (!update_sd) - mark_inode_dirty(inode); - ret = reiserfs_end_persistent_transaction(th); - if (ret) - goto out; - } - -out: - return ret; - -journal_error: - if (th) { - if (!update_sd) - reiserfs_update_sd(th, inode); - ret = reiserfs_end_persistent_transaction(th); - } - - return ret; -} - -void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) -{ - if (reiserfs_attrs(inode->i_sb)) { - if (sd_attrs & REISERFS_SYNC_FL) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (sd_attrs & REISERFS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (sd_attrs & REISERFS_APPEND_FL) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (sd_attrs & REISERFS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - if (sd_attrs & REISERFS_NOTAIL_FL) - REISERFS_I(inode)->i_flags |= i_nopack_mask; - else - REISERFS_I(inode)->i_flags &= ~i_nopack_mask; - } -} - -/* - * decide if this buffer needs to stay around for data logging or ordered - * write purposes - */ -static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh) -{ - int ret = 1; - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); - - lock_buffer(bh); - spin_lock(&j->j_dirty_buffers_lock); - if (!buffer_mapped(bh)) { - goto free_jh; - } - /* - * the page is locked, and the only places that log a data buffer - * also lock the page. - */ - if (reiserfs_file_data_log(inode)) { - /* - * very conservative, leave the buffer pinned if - * anyone might need it. - */ - if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { - ret = 0; - } - } else if (buffer_dirty(bh)) { - struct reiserfs_journal_list *jl; - struct reiserfs_jh *jh = bh->b_private; - - /* - * why is this safe? - * reiserfs_setattr updates i_size in the on disk - * stat data before allowing vmtruncate to be called. - * - * If buffer was put onto the ordered list for this - * transaction, we know for sure either this transaction - * or an older one already has updated i_size on disk, - * and this ordered data won't be referenced in the file - * if we crash. - * - * if the buffer was put onto the ordered list for an older - * transaction, we need to leave it around - */ - if (jh && (jl = jh->jl) - && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) - ret = 0; - } -free_jh: - if (ret && bh->b_private) { - reiserfs_free_jh(bh); - } - spin_unlock(&j->j_dirty_buffers_lock); - unlock_buffer(bh); - return ret; -} - -/* clm -- taken from fs/buffer.c:block_invalidate_folio */ -static void reiserfs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) -{ - struct buffer_head *head, *bh, *next; - struct inode *inode = folio->mapping->host; - unsigned int curr_off = 0; - unsigned int stop = offset + length; - int partial_page = (offset || length < folio_size(folio)); - int ret = 1; - - BUG_ON(!folio_test_locked(folio)); - - if (!partial_page) - folio_clear_checked(folio); - - head = folio_buffers(folio); - if (!head) - goto out; - - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - next = bh->b_this_page; - - if (next_off > stop) - goto out; - - /* - * is this block fully invalidated? 
- */ - if (offset <= curr_off) { - if (invalidate_folio_can_drop(inode, bh)) - reiserfs_unmap_buffer(bh); - else - ret = 0; - } - curr_off = next_off; - bh = next; - } while (bh != head); - - /* - * We release buffers only if the entire page is being invalidated. - * The get_block cached value has been unconditionally invalidated, - * so real IO is not possible anymore. - */ - if (!partial_page && ret) { - ret = filemap_release_folio(folio, 0); - /* maybe should BUG_ON(!ret); - neilb */ - } -out: - return; -} - -static bool reiserfs_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - if (reiserfs_file_data_log(mapping->host)) { - folio_set_checked(folio); - return filemap_dirty_folio(mapping, folio); - } - return block_dirty_folio(mapping, folio); -} - -/* - * Returns true if the folio's buffers were dropped. The folio is locked. - * - * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads - * in the buffers at folio_buffers(folio). - * - * even in -o notail mode, we can't be sure an old mount without -o notail - * didn't create files with tails. - */ -static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) -{ - struct inode *inode = folio->mapping->host; - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); - struct buffer_head *head; - struct buffer_head *bh; - bool ret = true; - - WARN_ON(folio_test_checked(folio)); - spin_lock(&j->j_dirty_buffers_lock); - head = folio_buffers(folio); - bh = head; - do { - if (bh->b_private) { - if (!buffer_dirty(bh) && !buffer_locked(bh)) { - reiserfs_free_jh(bh); - } else { - ret = false; - break; - } - } - bh = bh->b_this_page; - } while (bh != head); - if (ret) - ret = try_to_free_buffers(folio); - spin_unlock(&j->j_dirty_buffers_lock); - return ret; -} - -/* - * We thank Mingming Cao for helping us understand in great detail what - * to do in this section of the code. - */ -static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; - - ret = blockdev_direct_IO(iocb, inode, iter, - reiserfs_get_blocks_direct_io); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. 
- */ - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = iocb->ki_pos + count; - - if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { - truncate_setsize(inode, isize); - reiserfs_vfs_truncate_file(inode); - } - } - - return ret; -} - -int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - unsigned int ia_valid; - int error; - - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (error) - return error; - - /* must be turned off for recursive notify_change calls */ - ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - - if (is_quota_modification(&nop_mnt_idmap, inode, attr)) { - error = dquot_initialize(inode); - if (error) - return error; - } - reiserfs_write_lock(inode->i_sb); - if (attr->ia_valid & ATTR_SIZE) { - /* - * version 2 items will be caught by the s_maxbytes check - * done for us in vmtruncate - */ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && - attr->ia_size > MAX_NON_LFS) { - reiserfs_write_unlock(inode->i_sb); - error = -EFBIG; - goto out; - } - - inode_dio_wait(inode); - - /* fill in hole pointers in the expanding truncate case. */ - if (attr->ia_size > inode->i_size) { - loff_t pos = attr->ia_size; - - if ((pos & (inode->i_sb->s_blocksize - 1)) == 0) - pos++; - error = generic_cont_expand_simple(inode, pos); - if (REISERFS_I(inode)->i_prealloc_count > 0) { - int err; - struct reiserfs_transaction_handle th; - /* we're changing at most 2 bitmaps, inode + super */ - err = journal_begin(&th, inode->i_sb, 4); - if (!err) { - reiserfs_discard_prealloc(&th, inode); - err = journal_end(&th); - } - if (err) - error = err; - } - if (error) { - reiserfs_write_unlock(inode->i_sb); - goto out; - } - /* - * file size is changed, ctime and mtime are - * to be updated - */ - attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME); - } - } - reiserfs_write_unlock(inode->i_sb); - - if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) || - ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) && - (get_inode_sd_version(inode) == STAT_DATA_V1)) { - /* stat data of format v3.5 has 16 bit uid and gid */ - error = -EINVAL; - goto out; - } - - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { - struct reiserfs_transaction_handle th; - int jbegin_count = - 2 * - (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + - REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + - 2; - - error = reiserfs_chown_xattrs(inode, attr); - - if (error) - return error; - - /* - * (user+group)*(old+new) structure - we count quota - * info and , inode write (sb, inode) - */ - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, jbegin_count); - reiserfs_write_unlock(inode->i_sb); - if (error) - goto out; - error = dquot_transfer(&nop_mnt_idmap, inode, attr); - reiserfs_write_lock(inode->i_sb); - if (error) { - journal_end(&th); - reiserfs_write_unlock(inode->i_sb); - goto out; - } - - /* - * Update corresponding info in inode so that everything - * is in one transaction - */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; - mark_inode_dirty(inode); - error = journal_end(&th); - reiserfs_write_unlock(inode->i_sb); - if (error) - goto out; - } - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) 
{ - error = inode_newsize_ok(inode, attr->ia_size); - if (!error) { - /* - * Could race against reiserfs_file_release - * if called from NFS, so take tailpack mutex. - */ - mutex_lock(&REISERFS_I(inode)->tailpack); - truncate_setsize(inode, attr->ia_size); - reiserfs_truncate_file(inode, 1); - mutex_unlock(&REISERFS_I(inode)->tailpack); - } - } - - if (!error) { - setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); - } - - if (!error && reiserfs_posixacl(inode->i_sb)) { - if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod(dentry); - } - -out: - return error; -} - -const struct address_space_operations reiserfs_address_space_operations = { - .writepages = reiserfs_writepages, - .read_folio = reiserfs_read_folio, - .readahead = reiserfs_readahead, - .release_folio = reiserfs_release_folio, - .invalidate_folio = reiserfs_invalidate_folio, - .write_begin = reiserfs_write_begin, - .write_end = reiserfs_write_end, - .bmap = reiserfs_aop_bmap, - .direct_IO = reiserfs_direct_IO, - .dirty_folio = reiserfs_dirty_folio, - .migrate_folio = buffer_migrate_folio, -}; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c deleted file mode 100644 index dd33f8cc6eda..000000000000 --- a/fs/reiserfs/ioctl.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/mount.h> -#include "reiserfs.h" -#include <linux/time.h> -#include <linux/uaccess.h> -#include <linux/pagemap.h> -#include <linux/compat.h> -#include <linux/fileattr.h> - -int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - - if (!reiserfs_attrs(inode->i_sb)) - return -ENOTTY; - - fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs); - - return 0; -} - -int reiserfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - unsigned int flags = fa->flags; - int err; - - reiserfs_write_lock(inode->i_sb); - - err = -ENOTTY; - if (!reiserfs_attrs(inode->i_sb)) - goto unlock; - - err = -EOPNOTSUPP; - if (fileattr_has_fsx(fa)) - goto unlock; - - /* - * Is it quota file? Do not allow user to mess with it - */ - err = -EPERM; - if (IS_NOQUOTA(inode)) - goto unlock; - - if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) { - err = reiserfs_unpack(inode); - if (err) - goto unlock; - } - sd_attrs_to_i_attrs(flags, inode); - REISERFS_I(inode)->i_attrs = flags; - inode_set_ctime_current(inode); - mark_inode_dirty(inode); - err = 0; -unlock: - reiserfs_write_unlock(inode->i_sb); - - return err; -} - -/* - * reiserfs_ioctl - handler for ioctl for inode - * supported commands: - * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect - * and prevent packing file (argument arg has t - * be non-zero) - * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION - * 3) That's all for a while ... 
- */ -long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - int err = 0; - - reiserfs_write_lock(inode->i_sb); - - switch (cmd) { - case REISERFS_IOC_UNPACK: - if (S_ISREG(inode->i_mode)) { - if (arg) - err = reiserfs_unpack(inode); - } else - err = -ENOTTY; - break; - /* - * following two cases are taken from fs/ext2/ioctl.c by Remy - * Card (card@masi.ibp.fr) - */ - case REISERFS_IOC_GETVERSION: - err = put_user(inode->i_generation, (int __user *)arg); - break; - case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) { - err = -EPERM; - break; - } - err = mnt_want_write_file(filp); - if (err) - break; - if (get_user(inode->i_generation, (int __user *)arg)) { - err = -EFAULT; - goto setversion_out; - } - inode_set_ctime_current(inode); - mark_inode_dirty(inode); -setversion_out: - mnt_drop_write_file(filp); - break; - default: - err = -ENOTTY; - } - - reiserfs_write_unlock(inode->i_sb); - - return err; -} - -#ifdef CONFIG_COMPAT -long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - /* - * These are just misnamed, they actually - * get/put from/to user an int - */ - switch (cmd) { - case REISERFS_IOC32_UNPACK: - cmd = REISERFS_IOC_UNPACK; - break; - case REISERFS_IOC32_GETVERSION: - cmd = REISERFS_IOC_GETVERSION; - break; - case REISERFS_IOC32_SETVERSION: - cmd = REISERFS_IOC_SETVERSION; - break; - default: - return -ENOIOCTLCMD; - } - - return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif - -int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to); -/* - * reiserfs_unpack - * Function try to convert tail from direct item into indirect. - * It set up nopack attribute in the REISERFS_I(inode)->nopack - */ -int reiserfs_unpack(struct inode *inode) -{ - int retval = 0; - int index; - struct page *page; - struct address_space *mapping; - unsigned long write_from; - unsigned long blocksize = inode->i_sb->s_blocksize; - - if (inode->i_size == 0) { - REISERFS_I(inode)->i_flags |= i_nopack_mask; - return 0; - } - /* ioctl already done */ - if (REISERFS_I(inode)->i_flags & i_nopack_mask) { - return 0; - } - - /* we need to make sure nobody is changing the file size beneath us */ - { - int depth = reiserfs_write_unlock_nested(inode->i_sb); - - inode_lock(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); - } - - reiserfs_write_lock(inode->i_sb); - - write_from = inode->i_size & (blocksize - 1); - /* if we are on a block boundary, we are already unpacked. */ - if (write_from == 0) { - REISERFS_I(inode)->i_flags |= i_nopack_mask; - goto out; - } - - /* - * we unpack by finding the page with the tail, and calling - * __reiserfs_write_begin on that page. This will force a - * reiserfs_get_block to unpack the tail for us. 
- */ - index = inode->i_size >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = grab_cache_page(mapping, index); - retval = -ENOMEM; - if (!page) { - goto out; - } - retval = __reiserfs_write_begin(page, write_from, 0); - if (retval) - goto out_unlock; - - /* conversion can change page contents, must flush */ - flush_dcache_page(page); - retval = reiserfs_commit_write(NULL, page, write_from, write_from); - REISERFS_I(inode)->i_flags |= i_nopack_mask; - -out_unlock: - unlock_page(page); - put_page(page); - -out: - inode_unlock(inode); - reiserfs_write_unlock(inode->i_sb); - return retval; -} diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c deleted file mode 100644 index 5011c10287c6..000000000000 --- a/fs/reiserfs/item_ops.c +++ /dev/null @@ -1,737 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/time.h> -#include "reiserfs.h" - -/* - * this contains item handlers for old item types: sd, direct, - * indirect, directory - */ - -/* - * and where are the comments? how about saying where we can find an - * explanation of each item handler method? -Hans - */ - -/* stat data functions */ -static int sd_bytes_number(struct item_head *ih, int block_size) -{ - return 0; -} - -static void sd_decrement_key(struct cpu_key *key) -{ - key->on_disk_key.k_objectid--; - set_cpu_key_k_type(key, TYPE_ANY); - set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); -} - -static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) -{ - return 0; -} - -static void sd_print_item(struct item_head *ih, char *item) -{ - printk("\tmode | size | nlinks | first direct | mtime\n"); - if (stat_data_v1(ih)) { - struct stat_data_v1 *sd = (struct stat_data_v1 *)item; - - printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd), - sd_v1_size(sd), sd_v1_nlink(sd), - sd_v1_first_direct_byte(sd), - sd_v1_mtime(sd)); - } else { - struct stat_data *sd = (struct stat_data *)item; - - printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd), - (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), - sd_v2_rdev(sd), sd_v2_mtime(sd)); - } -} - -static void sd_check_item(struct item_head *ih, char *item) -{ - /* unused */ -} - -static int sd_create_vi(struct virtual_node *vn, - struct virtual_item *vi, - int is_affected, int insert_size) -{ - vi->vi_index = TYPE_STAT_DATA; - return 0; -} - -static int sd_check_left(struct virtual_item *vi, int free, - int start_skip, int end_skip) -{ - BUG_ON(start_skip || end_skip); - return -1; -} - -static int sd_check_right(struct virtual_item *vi, int free) -{ - return -1; -} - -static int sd_part_size(struct virtual_item *vi, int first, int count) -{ - BUG_ON(count); - return 0; -} - -static int sd_unit_num(struct virtual_item *vi) -{ - return vi->vi_item_len - IH_SIZE; -} - -static void sd_print_vi(struct virtual_item *vi) -{ - reiserfs_warning(NULL, "reiserfs-16100", - "STATDATA, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); -} - -static struct item_operations stat_data_ops = { - .bytes_number = sd_bytes_number, - .decrement_key = sd_decrement_key, - .is_left_mergeable = sd_is_left_mergeable, - .print_item = sd_print_item, - .check_item = sd_check_item, - - .create_vi = sd_create_vi, - .check_left = sd_check_left, - .check_right = sd_check_right, - .part_size = sd_part_size, - .unit_num = sd_unit_num, - .print_vi = sd_print_vi -}; - -/* direct item functions */ -static int direct_bytes_number(struct item_head *ih, int block_size) -{ - return ih_item_len(ih); -} - -/* FIXME: this should 
probably switch to indirect as well */ -static void direct_decrement_key(struct cpu_key *key) -{ - cpu_key_k_offset_dec(key); - if (cpu_key_k_offset(key) == 0) - set_cpu_key_k_type(key, TYPE_STAT_DATA); -} - -static int direct_is_left_mergeable(struct reiserfs_key *key, - unsigned long bsize) -{ - int version = le_key_version(key); - return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); -} - -static void direct_print_item(struct item_head *ih, char *item) -{ - int j = 0; - -/* return; */ - printk("\""); - while (j < ih_item_len(ih)) - printk("%c", item[j++]); - printk("\"\n"); -} - -static void direct_check_item(struct item_head *ih, char *item) -{ - /* unused */ -} - -static int direct_create_vi(struct virtual_node *vn, - struct virtual_item *vi, - int is_affected, int insert_size) -{ - vi->vi_index = TYPE_DIRECT; - return 0; -} - -static int direct_check_left(struct virtual_item *vi, int free, - int start_skip, int end_skip) -{ - int bytes; - - bytes = free - free % 8; - return bytes ? : -1; -} - -static int direct_check_right(struct virtual_item *vi, int free) -{ - return direct_check_left(vi, free, 0, 0); -} - -static int direct_part_size(struct virtual_item *vi, int first, int count) -{ - return count; -} - -static int direct_unit_num(struct virtual_item *vi) -{ - return vi->vi_item_len - IH_SIZE; -} - -static void direct_print_vi(struct virtual_item *vi) -{ - reiserfs_warning(NULL, "reiserfs-16101", - "DIRECT, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); -} - -static struct item_operations direct_ops = { - .bytes_number = direct_bytes_number, - .decrement_key = direct_decrement_key, - .is_left_mergeable = direct_is_left_mergeable, - .print_item = direct_print_item, - .check_item = direct_check_item, - - .create_vi = direct_create_vi, - .check_left = direct_check_left, - .check_right = direct_check_right, - .part_size = direct_part_size, - .unit_num = direct_unit_num, - .print_vi = direct_print_vi -}; - -/* indirect item functions */ -static int indirect_bytes_number(struct item_head *ih, int block_size) -{ - return ih_item_len(ih) / UNFM_P_SIZE * block_size; -} - -/* decrease offset, if it becomes 0, change type to stat data */ -static void indirect_decrement_key(struct cpu_key *key) -{ - cpu_key_k_offset_dec(key); - if (cpu_key_k_offset(key) == 0) - set_cpu_key_k_type(key, TYPE_STAT_DATA); -} - -/* if it is not first item of the body, then it is mergeable */ -static int indirect_is_left_mergeable(struct reiserfs_key *key, - unsigned long bsize) -{ - int version = le_key_version(key); - return (le_key_k_offset(version, key) != 1); -} - -/* printing of indirect item */ -static void start_new_sequence(__u32 * start, int *len, __u32 new) -{ - *start = new; - *len = 1; -} - -static int sequence_finished(__u32 start, int *len, __u32 new) -{ - if (start == INT_MAX) - return 1; - - if (start == 0 && new == 0) { - (*len)++; - return 0; - } - if (start != 0 && (start + *len) == new) { - (*len)++; - return 0; - } - return 1; -} - -static void print_sequence(__u32 start, int len) -{ - if (start == INT_MAX) - return; - - if (len == 1) - printk(" %d", start); - else - printk(" %d(%d)", start, len); -} - -static void indirect_print_item(struct item_head *ih, char *item) -{ - int j; - __le32 *unp; - __u32 prev = INT_MAX; - int num = 0; - - unp = (__le32 *) item; - - if (ih_item_len(ih) % UNFM_P_SIZE) - reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); - - printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); - for (j = 0; j < I_UNFM_NUM(ih); j++) { - if 
(sequence_finished(prev, &num, get_block_num(unp, j))) { - print_sequence(prev, num); - start_new_sequence(&prev, &num, get_block_num(unp, j)); - } - } - print_sequence(prev, num); - printk("]\n"); -} - -static void indirect_check_item(struct item_head *ih, char *item) -{ - /* unused */ -} - -static int indirect_create_vi(struct virtual_node *vn, - struct virtual_item *vi, - int is_affected, int insert_size) -{ - vi->vi_index = TYPE_INDIRECT; - return 0; -} - -static int indirect_check_left(struct virtual_item *vi, int free, - int start_skip, int end_skip) -{ - int bytes; - - bytes = free - free % UNFM_P_SIZE; - return bytes ? : -1; -} - -static int indirect_check_right(struct virtual_item *vi, int free) -{ - return indirect_check_left(vi, free, 0, 0); -} - -/* - * return size in bytes of 'units' units. If first == 0 - calculate - * from the head (left), otherwise - from tail (right) - */ -static int indirect_part_size(struct virtual_item *vi, int first, int units) -{ - /* unit of indirect item is byte (yet) */ - return units; -} - -static int indirect_unit_num(struct virtual_item *vi) -{ - /* unit of indirect item is byte (yet) */ - return vi->vi_item_len - IH_SIZE; -} - -static void indirect_print_vi(struct virtual_item *vi) -{ - reiserfs_warning(NULL, "reiserfs-16103", - "INDIRECT, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); -} - -static struct item_operations indirect_ops = { - .bytes_number = indirect_bytes_number, - .decrement_key = indirect_decrement_key, - .is_left_mergeable = indirect_is_left_mergeable, - .print_item = indirect_print_item, - .check_item = indirect_check_item, - - .create_vi = indirect_create_vi, - .check_left = indirect_check_left, - .check_right = indirect_check_right, - .part_size = indirect_part_size, - .unit_num = indirect_unit_num, - .print_vi = indirect_print_vi -}; - -/* direntry functions */ -static int direntry_bytes_number(struct item_head *ih, int block_size) -{ - reiserfs_warning(NULL, "vs-16090", - "bytes number is asked for direntry"); - return 0; -} - -static void direntry_decrement_key(struct cpu_key *key) -{ - cpu_key_k_offset_dec(key); - if (cpu_key_k_offset(key) == 0) - set_cpu_key_k_type(key, TYPE_STAT_DATA); -} - -static int direntry_is_left_mergeable(struct reiserfs_key *key, - unsigned long bsize) -{ - if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) - return 0; - return 1; - -} - -static void direntry_print_item(struct item_head *ih, char *item) -{ - int i; - int namelen; - struct reiserfs_de_head *deh; - char *name; - static char namebuf[80]; - - printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", - "Key of pointed object", "Hash", "Gen number", "Status"); - - deh = (struct reiserfs_de_head *)item; - - for (i = 0; i < ih_entry_count(ih); i++, deh++) { - namelen = - (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - - deh_location(deh); - name = item + deh_location(deh); - if (name[namelen - 1] == 0) - namelen = strlen(name); - - scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"", - (int)sizeof(namebuf)-3, name); - - printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", - i, namebuf, - deh_dir_id(deh), deh_objectid(deh), - GET_HASH_VALUE(deh_offset(deh)), - GET_GENERATION_NUMBER((deh_offset(deh))), - (de_hidden(deh)) ? 
"HIDDEN" : "VISIBLE"); - } -} - -static void direntry_check_item(struct item_head *ih, char *item) -{ - int i; - struct reiserfs_de_head *deh; - - /* unused */ - deh = (struct reiserfs_de_head *)item; - for (i = 0; i < ih_entry_count(ih); i++, deh++) { - ; - } -} - -#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 - -/* - * function returns old entry number in directory item in real node - * using new entry number in virtual item in virtual node - */ -static inline int old_entry_num(int is_affected, int virtual_entry_num, - int pos_in_item, int mode) -{ - if (mode == M_INSERT || mode == M_DELETE) - return virtual_entry_num; - - if (!is_affected) - /* cut or paste is applied to another item */ - return virtual_entry_num; - - if (virtual_entry_num < pos_in_item) - return virtual_entry_num; - - if (mode == M_CUT) - return virtual_entry_num + 1; - - RFALSE(mode != M_PASTE || virtual_entry_num == 0, - "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", - mode); - - return virtual_entry_num - 1; -} - -/* - * Create an array of sizes of directory entries for virtual - * item. Return space used by an item. FIXME: no control over - * consuming of space used by this item handler - */ -static int direntry_create_vi(struct virtual_node *vn, - struct virtual_item *vi, - int is_affected, int insert_size) -{ - struct direntry_uarea *dir_u = vi->vi_uarea; - int i, j; - int size = sizeof(struct direntry_uarea); - struct reiserfs_de_head *deh; - - vi->vi_index = TYPE_DIRENTRY; - - BUG_ON(!(vi->vi_ih) || !vi->vi_item); - - dir_u->flags = 0; - if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) - dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; - - deh = (struct reiserfs_de_head *)(vi->vi_item); - - /* virtual directory item have this amount of entry after */ - dir_u->entry_count = ih_entry_count(vi->vi_ih) + - ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : - (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); - - for (i = 0; i < dir_u->entry_count; i++) { - j = old_entry_num(is_affected, i, vn->vn_pos_in_item, - vn->vn_mode); - dir_u->entry_sizes[i] = - (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - - deh_location(&deh[j]) + DEH_SIZE; - } - - size += (dir_u->entry_count * sizeof(short)); - - /* set size of pasted entry */ - if (is_affected && vn->vn_mode == M_PASTE) - dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; - -#ifdef CONFIG_REISERFS_CHECK - /* compare total size of entries with item length */ - { - int k, l; - - l = 0; - for (k = 0; k < dir_u->entry_count; k++) - l += dir_u->entry_sizes[k]; - - if (l + IH_SIZE != vi->vi_item_len + - ((is_affected - && (vn->vn_mode == M_PASTE - || vn->vn_mode == M_CUT)) ? insert_size : 0)) { - reiserfs_panic(NULL, "vs-8025", "(mode==%c, " - "insert_size==%d), invalid length of " - "directory item", - vn->vn_mode, insert_size); - } - } -#endif - - return size; - -} - -/* - * return number of entries which may fit into specified amount of - * free space, or -1 if free space is not enough even for 1 entry - */ -static int direntry_check_left(struct virtual_item *vi, int free, - int start_skip, int end_skip) -{ - int i; - int entries = 0; - struct direntry_uarea *dir_u = vi->vi_uarea; - - for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { - /* i-th entry doesn't fit into the remaining free space */ - if (dir_u->entry_sizes[i] > free) - break; - - free -= dir_u->entry_sizes[i]; - entries++; - } - - if (entries == dir_u->entry_count) { - reiserfs_panic(NULL, "item_ops-1", - "free space %d, entry_count %d", free, - dir_u->entry_count); - } - - /* "." and ".." 
can not be separated from each other */ - if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) - && entries < 2) - entries = 0; - - return entries ? : -1; -} - -static int direntry_check_right(struct virtual_item *vi, int free) -{ - int i; - int entries = 0; - struct direntry_uarea *dir_u = vi->vi_uarea; - - for (i = dir_u->entry_count - 1; i >= 0; i--) { - /* i-th entry doesn't fit into the remaining free space */ - if (dir_u->entry_sizes[i] > free) - break; - - free -= dir_u->entry_sizes[i]; - entries++; - } - BUG_ON(entries == dir_u->entry_count); - - /* "." and ".." can not be separated from each other */ - if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) - && entries > dir_u->entry_count - 2) - entries = dir_u->entry_count - 2; - - return entries ? : -1; -} - -/* sum of entry sizes between from-th and to-th entries including both edges */ -static int direntry_part_size(struct virtual_item *vi, int first, int count) -{ - int i, retval; - int from, to; - struct direntry_uarea *dir_u = vi->vi_uarea; - - retval = 0; - if (first == 0) - from = 0; - else - from = dir_u->entry_count - count; - to = from + count - 1; - - for (i = from; i <= to; i++) - retval += dir_u->entry_sizes[i]; - - return retval; -} - -static int direntry_unit_num(struct virtual_item *vi) -{ - struct direntry_uarea *dir_u = vi->vi_uarea; - - return dir_u->entry_count; -} - -static void direntry_print_vi(struct virtual_item *vi) -{ - int i; - struct direntry_uarea *dir_u = vi->vi_uarea; - - reiserfs_warning(NULL, "reiserfs-16104", - "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", - vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); - printk("%d entries: ", dir_u->entry_count); - for (i = 0; i < dir_u->entry_count; i++) - printk("%d ", dir_u->entry_sizes[i]); - printk("\n"); -} - -static struct item_operations direntry_ops = { - .bytes_number = direntry_bytes_number, - .decrement_key = direntry_decrement_key, - .is_left_mergeable = direntry_is_left_mergeable, - .print_item = direntry_print_item, - .check_item = direntry_check_item, - - .create_vi = direntry_create_vi, - .check_left = direntry_check_left, - .check_right = direntry_check_right, - .part_size = direntry_part_size, - .unit_num = direntry_unit_num, - .print_vi = direntry_print_vi -}; - -/* Error catching functions to catch errors caused by incorrect item types. 
*/ -static int errcatch_bytes_number(struct item_head *ih, int block_size) -{ - reiserfs_warning(NULL, "green-16001", - "Invalid item type observed, run fsck ASAP"); - return 0; -} - -static void errcatch_decrement_key(struct cpu_key *key) -{ - reiserfs_warning(NULL, "green-16002", - "Invalid item type observed, run fsck ASAP"); -} - -static int errcatch_is_left_mergeable(struct reiserfs_key *key, - unsigned long bsize) -{ - reiserfs_warning(NULL, "green-16003", - "Invalid item type observed, run fsck ASAP"); - return 0; -} - -static void errcatch_print_item(struct item_head *ih, char *item) -{ - reiserfs_warning(NULL, "green-16004", - "Invalid item type observed, run fsck ASAP"); -} - -static void errcatch_check_item(struct item_head *ih, char *item) -{ - reiserfs_warning(NULL, "green-16005", - "Invalid item type observed, run fsck ASAP"); -} - -static int errcatch_create_vi(struct virtual_node *vn, - struct virtual_item *vi, - int is_affected, int insert_size) -{ - reiserfs_warning(NULL, "green-16006", - "Invalid item type observed, run fsck ASAP"); - /* - * We might return -1 here as well, but it won't help as - * create_virtual_node() from where this operation is called - * from is of return type void. - */ - return 0; -} - -static int errcatch_check_left(struct virtual_item *vi, int free, - int start_skip, int end_skip) -{ - reiserfs_warning(NULL, "green-16007", - "Invalid item type observed, run fsck ASAP"); - return -1; -} - -static int errcatch_check_right(struct virtual_item *vi, int free) -{ - reiserfs_warning(NULL, "green-16008", - "Invalid item type observed, run fsck ASAP"); - return -1; -} - -static int errcatch_part_size(struct virtual_item *vi, int first, int count) -{ - reiserfs_warning(NULL, "green-16009", - "Invalid item type observed, run fsck ASAP"); - return 0; -} - -static int errcatch_unit_num(struct virtual_item *vi) -{ - reiserfs_warning(NULL, "green-16010", - "Invalid item type observed, run fsck ASAP"); - return 0; -} - -static void errcatch_print_vi(struct virtual_item *vi) -{ - reiserfs_warning(NULL, "green-16011", - "Invalid item type observed, run fsck ASAP"); -} - -static struct item_operations errcatch_ops = { - .bytes_number = errcatch_bytes_number, - .decrement_key = errcatch_decrement_key, - .is_left_mergeable = errcatch_is_left_mergeable, - .print_item = errcatch_print_item, - .check_item = errcatch_check_item, - - .create_vi = errcatch_create_vi, - .check_left = errcatch_check_left, - .check_right = errcatch_check_right, - .part_size = errcatch_part_size, - .unit_num = errcatch_unit_num, - .print_vi = errcatch_print_vi -}; - -#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) -#error Item types must use disk-format assigned values. -#endif - -struct item_operations *item_ops[TYPE_ANY + 1] = { - &stat_data_ops, - &indirect_ops, - &direct_ops, - &direntry_ops, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ -}; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c deleted file mode 100644 index e477ee0ff35d..000000000000 --- a/fs/reiserfs/journal.c +++ /dev/null @@ -1,4404 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Write ahead logging implementation copyright Chris Mason 2000 - * - * The background commits make this code very interrelated, and - * overly complex. I need to rethink things a bit....The major players: - * - * journal_begin -- call with the number of blocks you expect to log. 
- * If the current transaction is too - * old, it will block until the current transaction is - * finished, and then start a new one. - * Usually, your transaction will get joined in with - * previous ones for speed. - * - * journal_join -- same as journal_begin, but won't block on the current - * transaction regardless of age. Don't ever call - * this. Ever. There are only two places it should be - * called from, and they are both inside this file. - * - * journal_mark_dirty -- adds blocks into this transaction. clears any flags - * that might make them get sent to disk - * and then marks them BH_JDirty. Puts the buffer head - * into the current transaction hash. - * - * journal_end -- if the current transaction is batchable, it does nothing - * otherwise, it could do an async/synchronous commit, or - * a full flush of all log and real blocks in the - * transaction. - * - * flush_old_commits -- if the current transaction is too old, it is ended and - * commit blocks are sent to disk. Forces commit blocks - * to disk for all backgrounded commits that have been - * around too long. - * -- Note, if you call this as an immediate flush from - * within kupdate, it will ignore the immediate flag - */ - -#include <linux/time.h> -#include <linux/semaphore.h> -#include <linux/vmalloc.h> -#include "reiserfs.h" -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/workqueue.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/uaccess.h> -#include <linux/slab.h> - - -/* gets a struct reiserfs_journal_list * from a list head */ -#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ - j_list)) - -/* must be correct to keep the desc and commit structs at 4k */ -#define JOURNAL_TRANS_HALF 1018 -#define BUFNR 64 /*read ahead */ - -/* cnode stat bits. Move these into reiserfs_fs.h */ - -/* this block was freed, and can't be written. 
*/ -#define BLOCK_FREED 2 -/* this block was freed during this transaction, and can't be written */ -#define BLOCK_FREED_HOLDER 3 - -/* used in flush_journal_list */ -#define BLOCK_NEEDS_FLUSH 4 -#define BLOCK_DIRTIED 5 - -/* journal list state bits */ -#define LIST_TOUCHED 1 -#define LIST_DIRTY 2 -#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ - -/* flags for do_journal_end */ -#define FLUSH_ALL 1 /* flush commit and real blocks */ -#define COMMIT_NOW 2 /* end and commit this transaction */ -#define WAIT 4 /* wait for the log blocks to hit the disk */ - -static int do_journal_end(struct reiserfs_transaction_handle *, int flags); -static int flush_journal_list(struct super_block *s, - struct reiserfs_journal_list *jl, int flushall); -static int flush_commit_list(struct super_block *s, - struct reiserfs_journal_list *jl, int flushall); -static int can_dirty(struct reiserfs_journal_cnode *cn); -static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb); -static void release_journal_dev(struct reiserfs_journal *journal); -static void dirty_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl); -static void flush_async_commits(struct work_struct *work); -static void queue_log_writer(struct super_block *s); - -/* values for join in do_journal_begin_r */ -enum { - JBEGIN_REG = 0, /* regular journal begin */ - /* join the running transaction if at all possible */ - JBEGIN_JOIN = 1, - /* called from cleanup code, ignores aborted flag */ - JBEGIN_ABORT = 2, -}; - -static int do_journal_begin_r(struct reiserfs_transaction_handle *th, - struct super_block *sb, - unsigned long nblocks, int join); - -static void init_journal_hash(struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - memset(journal->j_hash_table, 0, - JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); -} - -/* - * clears BH_Dirty and sticks the buffer on the clean list. Called because - * I can't allow refile_buffer to make schedule happen after I've freed a - * block. Look at remove_from_transaction and journal_mark_freed for - * more details. 
- */ -static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) -{ - if (bh) { - clear_buffer_dirty(bh); - clear_buffer_journal_test(bh); - } - return 0; -} - -static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block - *sb) -{ - struct reiserfs_bitmap_node *bn; - static int id; - - bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); - if (!bn) { - return NULL; - } - bn->data = kzalloc(sb->s_blocksize, GFP_NOFS); - if (!bn->data) { - kfree(bn); - return NULL; - } - bn->id = id++; - INIT_LIST_HEAD(&bn->list); - return bn; -} - -static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_bitmap_node *bn = NULL; - struct list_head *entry = journal->j_bitmap_nodes.next; - - journal->j_used_bitmap_nodes++; -repeat: - - if (entry != &journal->j_bitmap_nodes) { - bn = list_entry(entry, struct reiserfs_bitmap_node, list); - list_del(entry); - memset(bn->data, 0, sb->s_blocksize); - journal->j_free_bitmap_nodes--; - return bn; - } - bn = allocate_bitmap_node(sb); - if (!bn) { - yield(); - goto repeat; - } - return bn; -} -static inline void free_bitmap_node(struct super_block *sb, - struct reiserfs_bitmap_node *bn) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - journal->j_used_bitmap_nodes--; - if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { - kfree(bn->data); - kfree(bn); - } else { - list_add(&bn->list, &journal->j_bitmap_nodes); - journal->j_free_bitmap_nodes++; - } -} - -static void allocate_bitmap_nodes(struct super_block *sb) -{ - int i; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_bitmap_node *bn = NULL; - for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { - bn = allocate_bitmap_node(sb); - if (bn) { - list_add(&bn->list, &journal->j_bitmap_nodes); - journal->j_free_bitmap_nodes++; - } else { - /* this is ok, we'll try again when more are needed */ - break; - } - } -} - -static int set_bit_in_list_bitmap(struct super_block *sb, - b_blocknr_t block, - struct reiserfs_list_bitmap *jb) -{ - unsigned int bmap_nr = block / (sb->s_blocksize << 3); - unsigned int bit_nr = block % (sb->s_blocksize << 3); - - if (!jb->bitmaps[bmap_nr]) { - jb->bitmaps[bmap_nr] = get_bitmap_node(sb); - } - set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); - return 0; -} - -static void cleanup_bitmap_list(struct super_block *sb, - struct reiserfs_list_bitmap *jb) -{ - int i; - if (jb->bitmaps == NULL) - return; - - for (i = 0; i < reiserfs_bmap_count(sb); i++) { - if (jb->bitmaps[i]) { - free_bitmap_node(sb, jb->bitmaps[i]); - jb->bitmaps[i] = NULL; - } - } -} - -/* - * only call this on FS unmount. 
- */ -static int free_list_bitmaps(struct super_block *sb, - struct reiserfs_list_bitmap *jb_array) -{ - int i; - struct reiserfs_list_bitmap *jb; - for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { - jb = jb_array + i; - jb->journal_list = NULL; - cleanup_bitmap_list(sb, jb); - vfree(jb->bitmaps); - jb->bitmaps = NULL; - } - return 0; -} - -static int free_bitmap_nodes(struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct list_head *next = journal->j_bitmap_nodes.next; - struct reiserfs_bitmap_node *bn; - - while (next != &journal->j_bitmap_nodes) { - bn = list_entry(next, struct reiserfs_bitmap_node, list); - list_del(next); - kfree(bn->data); - kfree(bn); - next = journal->j_bitmap_nodes.next; - journal->j_free_bitmap_nodes--; - } - - return 0; -} - -/* - * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. - * jb_array is the array to be filled in. - */ -int reiserfs_allocate_list_bitmaps(struct super_block *sb, - struct reiserfs_list_bitmap *jb_array, - unsigned int bmap_nr) -{ - int i; - int failed = 0; - struct reiserfs_list_bitmap *jb; - int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); - - for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { - jb = jb_array + i; - jb->journal_list = NULL; - jb->bitmaps = vzalloc(mem); - if (!jb->bitmaps) { - reiserfs_warning(sb, "clm-2000", "unable to " - "allocate bitmaps for journal lists"); - failed = 1; - break; - } - } - if (failed) { - free_list_bitmaps(sb, jb_array); - return -1; - } - return 0; -} - -/* - * find an available list bitmap. If you can't find one, flush a commit list - * and try again - */ -static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, - struct reiserfs_journal_list - *jl) -{ - int i, j; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_list_bitmap *jb = NULL; - - for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { - i = journal->j_list_bitmap_index; - journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; - jb = journal->j_list_bitmap + i; - if (journal->j_list_bitmap[i].journal_list) { - flush_commit_list(sb, - journal->j_list_bitmap[i]. - journal_list, 1); - if (!journal->j_list_bitmap[i].journal_list) { - break; - } - } else { - break; - } - } - /* double check to make sure if flushed correctly */ - if (jb->journal_list) - return NULL; - jb->journal_list = jl; - return jb; -} - -/* - * allocates a new chunk of X nodes, and links them all together as a list. 
- * Uses the cnode->next and cnode->prev pointers - * returns NULL on failure - */ -static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) -{ - struct reiserfs_journal_cnode *head; - int i; - if (num_cnodes <= 0) { - return NULL; - } - head = vzalloc(array_size(num_cnodes, - sizeof(struct reiserfs_journal_cnode))); - if (!head) { - return NULL; - } - head[0].prev = NULL; - head[0].next = head + 1; - for (i = 1; i < num_cnodes; i++) { - head[i].prev = head + (i - 1); - head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ - } - head[num_cnodes - 1].next = NULL; - return head; -} - -/* pulls a cnode off the free list, or returns NULL on failure */ -static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) -{ - struct reiserfs_journal_cnode *cn; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - reiserfs_check_lock_depth(sb, "get_cnode"); - - if (journal->j_cnode_free <= 0) { - return NULL; - } - journal->j_cnode_used++; - journal->j_cnode_free--; - cn = journal->j_cnode_free_list; - if (!cn) { - return cn; - } - if (cn->next) { - cn->next->prev = NULL; - } - journal->j_cnode_free_list = cn->next; - memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); - return cn; -} - -/* - * returns a cnode to the free list - */ -static void free_cnode(struct super_block *sb, - struct reiserfs_journal_cnode *cn) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - reiserfs_check_lock_depth(sb, "free_cnode"); - - journal->j_cnode_used--; - journal->j_cnode_free++; - /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ - cn->next = journal->j_cnode_free_list; - if (journal->j_cnode_free_list) { - journal->j_cnode_free_list->prev = cn; - } - cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */ - journal->j_cnode_free_list = cn; -} - -static void clear_prepared_bits(struct buffer_head *bh) -{ - clear_buffer_journal_prepared(bh); - clear_buffer_journal_restore_dirty(bh); -} - -/* - * return a cnode with same dev, block number and size in table, - * or null if not found - */ -static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct - super_block - *sb, - struct - reiserfs_journal_cnode - **table, - long bl) -{ - struct reiserfs_journal_cnode *cn; - cn = journal_hash(table, sb, bl); - while (cn) { - if (cn->blocknr == bl && cn->sb == sb) - return cn; - cn = cn->hnext; - } - return (struct reiserfs_journal_cnode *)0; -} - -/* - * this actually means 'can this block be reallocated yet?'. If you set - * search_all, a block can only be allocated if it is not in the current - * transaction, was not freed by the current transaction, and has no chance - * of ever being overwritten by a replay after crashing. - * - * If you don't set search_all, a block can only be allocated if it is not - * in the current transaction. Since deleting a block removes it from the - * current transaction, this case should never happen. If you don't set - * search_all, make sure you never write the block without logging it. - * - * next_zero_bit is a suggestion about the next block to try for find_forward. - * when bl is rejected because it is set in a journal list bitmap, we search - * for the next zero bit in the bitmap that rejected bl. Then, we return - * that through next_zero_bit for find_forward to try. 
- * - * Just because we return something in next_zero_bit does not mean we won't - * reject it on the next call to reiserfs_in_journal - */ -int reiserfs_in_journal(struct super_block *sb, - unsigned int bmap_nr, int bit_nr, int search_all, - b_blocknr_t * next_zero_bit) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_list_bitmap *jb; - int i; - unsigned long bl; - - *next_zero_bit = 0; /* always start this at zero. */ - - PROC_INFO_INC(sb, journal.in_journal); - /* - * If we aren't doing a search_all, this is a metablock, and it - * will be logged before use. if we crash before the transaction - * that freed it commits, this transaction won't have committed - * either, and the block will never be written - */ - if (search_all) { - for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { - PROC_INFO_INC(sb, journal.in_journal_bitmap); - jb = journal->j_list_bitmap + i; - if (jb->journal_list && jb->bitmaps[bmap_nr] && - test_bit(bit_nr, - (unsigned long *)jb->bitmaps[bmap_nr]-> - data)) { - *next_zero_bit = - find_next_zero_bit((unsigned long *) - (jb->bitmaps[bmap_nr]-> - data), - sb->s_blocksize << 3, - bit_nr + 1); - return 1; - } - } - } - - bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr; - /* is it in any old transactions? */ - if (search_all - && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) { - return 1; - } - - /* is it in the current transaction. This should never happen */ - if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) { - BUG(); - return 1; - } - - PROC_INFO_INC(sb, journal.in_journal_reusable); - /* safe for reuse */ - return 0; -} - -/* insert cn into table */ -static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, - struct reiserfs_journal_cnode *cn) -{ - struct reiserfs_journal_cnode *cn_orig; - - cn_orig = journal_hash(table, cn->sb, cn->blocknr); - cn->hnext = cn_orig; - cn->hprev = NULL; - if (cn_orig) { - cn_orig->hprev = cn; - } - journal_hash(table, cn->sb, cn->blocknr) = cn; -} - -/* lock the current transaction */ -static inline void lock_journal(struct super_block *sb) -{ - PROC_INFO_INC(sb, journal.lock_journal); - - reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); -} - -/* unlock the current transaction */ -static inline void unlock_journal(struct super_block *sb) -{ - mutex_unlock(&SB_JOURNAL(sb)->j_mutex); -} - -static inline void get_journal_list(struct reiserfs_journal_list *jl) -{ - jl->j_refcount++; -} - -static inline void put_journal_list(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - if (jl->j_refcount < 1) { - reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d", - jl->j_trans_id, jl->j_refcount); - } - if (--jl->j_refcount == 0) - kfree(jl); -} - -/* - * this used to be much more involved, and I'm keeping it just in case - * things get ugly again. it gets called by flush_commit_list, and - * cleans up any data stored about blocks freed during a transaction. 
- */ -static void cleanup_freed_for_journal_list(struct super_block *sb, - struct reiserfs_journal_list *jl) -{ - - struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; - if (jb) { - cleanup_bitmap_list(sb, jb); - } - jl->j_list_bitmap->journal_list = NULL; - jl->j_list_bitmap = NULL; -} - -static int journal_list_still_alive(struct super_block *s, - unsigned int trans_id) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - struct list_head *entry = &journal->j_journal_list; - struct reiserfs_journal_list *jl; - - if (!list_empty(entry)) { - jl = JOURNAL_LIST_ENTRY(entry->next); - if (jl->j_trans_id <= trans_id) { - return 1; - } - } - return 0; -} - -/* - * If page->mapping was null, we failed to truncate this page for - * some reason. Most likely because it was truncated after being - * logged via data=journal. - * - * This does a check to see if the buffer belongs to one of these - * lost pages before doing the final put_bh. If page->mapping was - * null, it tries to free buffers on the page, which should make the - * final put_page drop the page from the lru. - */ -static void release_buffer_page(struct buffer_head *bh) -{ - struct folio *folio = bh->b_folio; - if (!folio->mapping && folio_trylock(folio)) { - folio_get(folio); - put_bh(bh); - if (!folio->mapping) - try_to_free_buffers(folio); - folio_unlock(folio); - folio_put(folio); - } else { - put_bh(bh); - } -} - -static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) -{ - if (buffer_journaled(bh)) { - reiserfs_warning(NULL, "clm-2084", - "pinned buffer %lu:%pg sent to disk", - bh->b_blocknr, bh->b_bdev); - } - if (uptodate) - set_buffer_uptodate(bh); - else - clear_buffer_uptodate(bh); - - unlock_buffer(bh); - release_buffer_page(bh); -} - -static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) -{ - if (uptodate) - set_buffer_uptodate(bh); - else - clear_buffer_uptodate(bh); - unlock_buffer(bh); - put_bh(bh); -} - -static void submit_logged_buffer(struct buffer_head *bh) -{ - get_bh(bh); - bh->b_end_io = reiserfs_end_buffer_io_sync; - clear_buffer_journal_new(bh); - clear_buffer_dirty(bh); - if (!test_clear_buffer_journal_test(bh)) - BUG(); - if (!buffer_uptodate(bh)) - BUG(); - submit_bh(REQ_OP_WRITE, bh); -} - -static void submit_ordered_buffer(struct buffer_head *bh) -{ - get_bh(bh); - bh->b_end_io = reiserfs_end_ordered_io; - clear_buffer_dirty(bh); - if (!buffer_uptodate(bh)) - BUG(); - submit_bh(REQ_OP_WRITE, bh); -} - -#define CHUNK_SIZE 32 -struct buffer_chunk { - struct buffer_head *bh[CHUNK_SIZE]; - int nr; -}; - -static void write_chunk(struct buffer_chunk *chunk) -{ - int i; - for (i = 0; i < chunk->nr; i++) { - submit_logged_buffer(chunk->bh[i]); - } - chunk->nr = 0; -} - -static void write_ordered_chunk(struct buffer_chunk *chunk) -{ - int i; - for (i = 0; i < chunk->nr; i++) { - submit_ordered_buffer(chunk->bh[i]); - } - chunk->nr = 0; -} - -static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, - spinlock_t * lock, void (fn) (struct buffer_chunk *)) -{ - int ret = 0; - BUG_ON(chunk->nr >= CHUNK_SIZE); - chunk->bh[chunk->nr++] = bh; - if (chunk->nr >= CHUNK_SIZE) { - ret = 1; - if (lock) { - spin_unlock(lock); - fn(chunk); - spin_lock(lock); - } else { - fn(chunk); - } - } - return ret; -} - -static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); -static struct reiserfs_jh *alloc_jh(void) -{ - struct reiserfs_jh *jh; - while (1) { - jh = kmalloc(sizeof(*jh), GFP_NOFS); - if (jh) { - atomic_inc(&nr_reiserfs_jh); - return jh; - } - yield(); - } -} - -/* - 
* we want to free the jh when the buffer has been written - * and waited on - */ -void reiserfs_free_jh(struct buffer_head *bh) -{ - struct reiserfs_jh *jh; - - jh = bh->b_private; - if (jh) { - bh->b_private = NULL; - jh->bh = NULL; - list_del_init(&jh->list); - kfree(jh); - if (atomic_read(&nr_reiserfs_jh) <= 0) - BUG(); - atomic_dec(&nr_reiserfs_jh); - put_bh(bh); - } -} - -static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, - int tail) -{ - struct reiserfs_jh *jh; - - if (bh->b_private) { - spin_lock(&j->j_dirty_buffers_lock); - if (!bh->b_private) { - spin_unlock(&j->j_dirty_buffers_lock); - goto no_jh; - } - jh = bh->b_private; - list_del_init(&jh->list); - } else { -no_jh: - get_bh(bh); - jh = alloc_jh(); - spin_lock(&j->j_dirty_buffers_lock); - /* - * buffer must be locked for __add_jh, should be able to have - * two adds at the same time - */ - BUG_ON(bh->b_private); - jh->bh = bh; - bh->b_private = jh; - } - jh->jl = j->j_current_jl; - if (tail) - list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); - else { - list_add_tail(&jh->list, &jh->jl->j_bh_list); - } - spin_unlock(&j->j_dirty_buffers_lock); - return 0; -} - -int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) -{ - return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); -} -int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) -{ - return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); -} - -#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) -static int write_ordered_buffers(spinlock_t * lock, - struct reiserfs_journal *j, - struct reiserfs_journal_list *jl, - struct list_head *list) -{ - struct buffer_head *bh; - struct reiserfs_jh *jh; - int ret = j->j_errno; - struct buffer_chunk chunk; - struct list_head tmp; - INIT_LIST_HEAD(&tmp); - - chunk.nr = 0; - spin_lock(lock); - while (!list_empty(list)) { - jh = JH_ENTRY(list->next); - bh = jh->bh; - get_bh(bh); - if (!trylock_buffer(bh)) { - if (!buffer_dirty(bh)) { - list_move(&jh->list, &tmp); - goto loop_next; - } - spin_unlock(lock); - if (chunk.nr) - write_ordered_chunk(&chunk); - wait_on_buffer(bh); - cond_resched(); - spin_lock(lock); - goto loop_next; - } - /* - * in theory, dirty non-uptodate buffers should never get here, - * but the upper layer io error paths still have a few quirks. - * Handle them here as gracefully as we can - */ - if (!buffer_uptodate(bh) && buffer_dirty(bh)) { - clear_buffer_dirty(bh); - ret = -EIO; - } - if (buffer_dirty(bh)) { - list_move(&jh->list, &tmp); - add_to_chunk(&chunk, bh, lock, write_ordered_chunk); - } else { - reiserfs_free_jh(bh); - unlock_buffer(bh); - } -loop_next: - put_bh(bh); - cond_resched_lock(lock); - } - if (chunk.nr) { - spin_unlock(lock); - write_ordered_chunk(&chunk); - spin_lock(lock); - } - while (!list_empty(&tmp)) { - jh = JH_ENTRY(tmp.prev); - bh = jh->bh; - get_bh(bh); - reiserfs_free_jh(bh); - - if (buffer_locked(bh)) { - spin_unlock(lock); - wait_on_buffer(bh); - spin_lock(lock); - } - if (!buffer_uptodate(bh)) { - ret = -EIO; - } - /* - * ugly interaction with invalidate_folio here. - * reiserfs_invalidate_folio will pin any buffer that has a - * valid journal head from an older transaction. If someone - * else sets our buffer dirty after we write it in the first - * loop, and then someone truncates the page away, nobody - * will ever write the buffer. We're safe if we write the - * page one last time after freeing the journal header. 
- */ - if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) { - spin_unlock(lock); - write_dirty_buffer(bh, 0); - spin_lock(lock); - } - put_bh(bh); - cond_resched_lock(lock); - } - spin_unlock(lock); - return ret; -} - -static int flush_older_commits(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - struct reiserfs_journal_list *other_jl; - struct reiserfs_journal_list *first_jl; - struct list_head *entry; - unsigned int trans_id = jl->j_trans_id; - unsigned int other_trans_id; - -find_first: - /* - * first we walk backwards to find the oldest uncommitted transation - */ - first_jl = jl; - entry = jl->j_list.prev; - while (1) { - other_jl = JOURNAL_LIST_ENTRY(entry); - if (entry == &journal->j_journal_list || - atomic_read(&other_jl->j_older_commits_done)) - break; - - first_jl = other_jl; - entry = other_jl->j_list.prev; - } - - /* if we didn't find any older uncommitted transactions, return now */ - if (first_jl == jl) { - return 0; - } - - entry = &first_jl->j_list; - while (1) { - other_jl = JOURNAL_LIST_ENTRY(entry); - other_trans_id = other_jl->j_trans_id; - - if (other_trans_id < trans_id) { - if (atomic_read(&other_jl->j_commit_left) != 0) { - flush_commit_list(s, other_jl, 0); - - /* list we were called with is gone, return */ - if (!journal_list_still_alive(s, trans_id)) - return 1; - - /* - * the one we just flushed is gone, this means - * all older lists are also gone, so first_jl - * is no longer valid either. Go back to the - * beginning. - */ - if (!journal_list_still_alive - (s, other_trans_id)) { - goto find_first; - } - } - entry = entry->next; - if (entry == &journal->j_journal_list) - return 0; - } else { - return 0; - } - } - return 0; -} - -static int reiserfs_async_progress_wait(struct super_block *s) -{ - struct reiserfs_journal *j = SB_JOURNAL(s); - - if (atomic_read(&j->j_async_throttle)) { - int depth; - - depth = reiserfs_write_unlock_nested(s); - wait_var_event_timeout(&j->j_async_throttle, - atomic_read(&j->j_async_throttle) == 0, - HZ / 10); - reiserfs_write_lock_nested(s, depth); - } - - return 0; -} - -/* - * if this journal list still has commit blocks unflushed, send them to disk. - * - * log areas must be flushed in order (transaction 2 can't commit before - * transaction 1) Before the commit block can by written, every other log - * block must be safely on disk - */ -static int flush_commit_list(struct super_block *s, - struct reiserfs_journal_list *jl, int flushall) -{ - int i; - b_blocknr_t bn; - struct buffer_head *tbh = NULL; - unsigned int trans_id = jl->j_trans_id; - struct reiserfs_journal *journal = SB_JOURNAL(s); - int retval = 0; - int write_len; - int depth; - - reiserfs_check_lock_depth(s, "flush_commit_list"); - - if (atomic_read(&jl->j_older_commits_done)) { - return 0; - } - - /* - * before we can put our commit blocks on disk, we have to make - * sure everyone older than us is on disk too - */ - BUG_ON(jl->j_len <= 0); - BUG_ON(trans_id == journal->j_trans_id); - - get_journal_list(jl); - if (flushall) { - if (flush_older_commits(s, jl) == 1) { - /* - * list disappeared during flush_older_commits. 
- * return - */ - goto put_jl; - } - } - - /* make sure nobody is trying to flush this one at the same time */ - reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); - - if (!journal_list_still_alive(s, trans_id)) { - mutex_unlock(&jl->j_commit_mutex); - goto put_jl; - } - BUG_ON(jl->j_trans_id == 0); - - /* this commit is done, exit */ - if (atomic_read(&jl->j_commit_left) <= 0) { - if (flushall) { - atomic_set(&jl->j_older_commits_done, 1); - } - mutex_unlock(&jl->j_commit_mutex); - goto put_jl; - } - - if (!list_empty(&jl->j_bh_list)) { - int ret; - - /* - * We might sleep in numerous places inside - * write_ordered_buffers. Relax the write lock. - */ - depth = reiserfs_write_unlock_nested(s); - ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_bh_list); - if (ret < 0 && retval == 0) - retval = ret; - reiserfs_write_lock_nested(s, depth); - } - BUG_ON(!list_empty(&jl->j_bh_list)); - /* - * for the description block and all the log blocks, submit any buffers - * that haven't already reached the disk. Try to write at least 256 - * log blocks. later on, we will only wait on blocks that correspond - * to this transaction, but while we're unplugging we might as well - * get a chunk of data on there. - */ - atomic_inc(&journal->j_async_throttle); - write_len = jl->j_len + 1; - if (write_len < 256) - write_len = 256; - for (i = 0 ; i < write_len ; i++) { - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % - SB_ONDISK_JOURNAL_SIZE(s); - tbh = journal_find_get_block(s, bn); - if (tbh) { - if (buffer_dirty(tbh)) { - depth = reiserfs_write_unlock_nested(s); - write_dirty_buffer(tbh, 0); - reiserfs_write_lock_nested(s, depth); - } - put_bh(tbh) ; - } - } - if (atomic_dec_and_test(&journal->j_async_throttle)) - wake_up_var(&journal->j_async_throttle); - - for (i = 0; i < (jl->j_len + 1); i++) { - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + - (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); - tbh = journal_find_get_block(s, bn); - - depth = reiserfs_write_unlock_nested(s); - __wait_on_buffer(tbh); - reiserfs_write_lock_nested(s, depth); - /* - * since we're using ll_rw_blk above, it might have skipped - * over a locked buffer. Double check here - */ - /* redundant, sync_dirty_buffer() checks */ - if (buffer_dirty(tbh)) { - depth = reiserfs_write_unlock_nested(s); - sync_dirty_buffer(tbh); - reiserfs_write_lock_nested(s, depth); - } - if (unlikely(!buffer_uptodate(tbh))) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-601", - "buffer write failed"); -#endif - retval = -EIO; - } - /* once for journal_find_get_block */ - put_bh(tbh); - /* once due to original getblk in do_journal_end */ - put_bh(tbh); - atomic_dec(&jl->j_commit_left); - } - - BUG_ON(atomic_read(&jl->j_commit_left) != 1); - - /* - * If there was a write error in the journal - we can't commit - * this transaction - it will be invalid and, if successful, - * will just end up propagating the write error out to - * the file system. 
- */ - if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { - if (buffer_dirty(jl->j_commit_bh)) - BUG(); - mark_buffer_dirty(jl->j_commit_bh) ; - depth = reiserfs_write_unlock_nested(s); - if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA); - else - sync_dirty_buffer(jl->j_commit_bh); - reiserfs_write_lock_nested(s, depth); - } - - /* - * If there was a write error in the journal - we can't commit this - * transaction - it will be invalid and, if successful, will just end - * up propagating the write error out to the filesystem. - */ - if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-615", "buffer write failed"); -#endif - retval = -EIO; - } - bforget(jl->j_commit_bh); - if (journal->j_last_commit_id != 0 && - (jl->j_trans_id - journal->j_last_commit_id) != 1) { - reiserfs_warning(s, "clm-2200", "last commit %lu, current %lu", - journal->j_last_commit_id, jl->j_trans_id); - } - journal->j_last_commit_id = jl->j_trans_id; - - /* - * now, every commit block is on the disk. It is safe to allow - * blocks freed during this transaction to be reallocated - */ - cleanup_freed_for_journal_list(s, jl); - - retval = retval ? retval : journal->j_errno; - - /* mark the metadata dirty */ - if (!retval) - dirty_one_transaction(s, jl); - atomic_dec(&jl->j_commit_left); - - if (flushall) { - atomic_set(&jl->j_older_commits_done, 1); - } - mutex_unlock(&jl->j_commit_mutex); -put_jl: - put_journal_list(s, jl); - - if (retval) - reiserfs_abort(s, retval, "Journal write error in %s", - __func__); - return retval; -} - -/* - * flush_journal_list frequently needs to find a newer transaction for a - * given block. This does that, or returns NULL if it can't find anything - */ -static struct reiserfs_journal_list *find_newer_jl_for_cn(struct - reiserfs_journal_cnode - *cn) -{ - struct super_block *sb = cn->sb; - b_blocknr_t blocknr = cn->blocknr; - - cn = cn->hprev; - while (cn) { - if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { - return cn->jlist; - } - cn = cn->hprev; - } - return NULL; -} - -static void remove_journal_hash(struct super_block *, - struct reiserfs_journal_cnode **, - struct reiserfs_journal_list *, unsigned long, - int); - -/* - * once all the real blocks have been flushed, it is safe to remove them - * from the journal list for this transaction. Aside from freeing the - * cnode, this also allows the block to be reallocated for data blocks - * if it had been deleted. - */ -static void remove_all_from_journal_list(struct super_block *sb, - struct reiserfs_journal_list *jl, - int debug) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_cnode *cn, *last; - cn = jl->j_realblock; - - /* - * which is better, to lock once around the whole loop, or - * to lock for each call to remove_journal_hash? - */ - while (cn) { - if (cn->blocknr != 0) { - if (debug) { - reiserfs_warning(sb, "reiserfs-2201", - "block %u, bh is %d, state %ld", - cn->blocknr, cn->bh ? 1 : 0, - cn->state); - } - cn->state = 0; - remove_journal_hash(sb, journal->j_list_hash_table, - jl, cn->blocknr, 1); - } - last = cn; - cn = cn->next; - free_cnode(sb, last); - } - jl->j_realblock = NULL; -} - -/* - * if this timestamp is greater than the timestamp we wrote last to the - * header block, write it to the header block. 
once this is done, I can - * safely say the log area for this transaction won't ever be replayed, - * and I can start releasing blocks in this transaction for reuse as data - * blocks. called by flush_journal_list, before it calls - * remove_all_from_journal_list - */ -static int _update_journal_header_block(struct super_block *sb, - unsigned long offset, - unsigned int trans_id) -{ - struct reiserfs_journal_header *jh; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - int depth; - - if (reiserfs_is_journal_aborted(journal)) - return -EIO; - - if (trans_id >= journal->j_last_flush_trans_id) { - if (buffer_locked((journal->j_header_bh))) { - depth = reiserfs_write_unlock_nested(sb); - __wait_on_buffer(journal->j_header_bh); - reiserfs_write_lock_nested(sb, depth); - if (unlikely(!buffer_uptodate(journal->j_header_bh))) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(sb, "journal-699", - "buffer write failed"); -#endif - return -EIO; - } - } - journal->j_last_flush_trans_id = trans_id; - journal->j_first_unflushed_offset = offset; - jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> - b_data); - jh->j_last_flush_trans_id = cpu_to_le32(trans_id); - jh->j_first_unflushed_offset = cpu_to_le32(offset); - jh->j_mount_id = cpu_to_le32(journal->j_mount_id); - - set_buffer_dirty(journal->j_header_bh); - depth = reiserfs_write_unlock_nested(sb); - - if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA); - else - sync_dirty_buffer(journal->j_header_bh); - - reiserfs_write_lock_nested(sb, depth); - if (!buffer_uptodate(journal->j_header_bh)) { - reiserfs_warning(sb, "journal-837", - "IO error during journal replay"); - return -EIO; - } - } - return 0; -} - -static int update_journal_header_block(struct super_block *sb, - unsigned long offset, - unsigned int trans_id) -{ - return _update_journal_header_block(sb, offset, trans_id); -} - -/* -** flush any and all journal lists older than you are -** can only be called from flush_journal_list -*/ -static int flush_older_journal_lists(struct super_block *sb, - struct reiserfs_journal_list *jl) -{ - struct list_head *entry; - struct reiserfs_journal_list *other_jl; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - unsigned int trans_id = jl->j_trans_id; - - /* - * we know we are the only ones flushing things, no extra race - * protection is required. - */ -restart: - entry = journal->j_journal_list.next; - /* Did we wrap? */ - if (entry == &journal->j_journal_list) - return 0; - other_jl = JOURNAL_LIST_ENTRY(entry); - if (other_jl->j_trans_id < trans_id) { - BUG_ON(other_jl->j_refcount <= 0); - /* do not flush all */ - flush_journal_list(sb, other_jl, 0); - - /* other_jl is now deleted from the list */ - goto restart; - } - return 0; -} - -static void del_from_work_list(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - if (!list_empty(&jl->j_working_list)) { - list_del_init(&jl->j_working_list); - journal->j_num_work_lists--; - } -} - -/* - * flush a journal list, both commit and real blocks - * - * always set flushall to 1, unless you are calling from inside - * flush_journal_list - * - * IMPORTANT. This can only be called while there are no journal writers, - * and the journal is locked. 
That means it can only be called from - * do_journal_end, or by journal_release - */ -static int flush_journal_list(struct super_block *s, - struct reiserfs_journal_list *jl, int flushall) -{ - struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn; - int count; - int was_jwait = 0; - int was_dirty = 0; - struct buffer_head *saved_bh; - unsigned long j_len_saved = jl->j_len; - struct reiserfs_journal *journal = SB_JOURNAL(s); - int err = 0; - int depth; - - BUG_ON(j_len_saved <= 0); - - if (atomic_read(&journal->j_wcount) != 0) { - reiserfs_warning(s, "clm-2048", "called with wcount %d", - atomic_read(&journal->j_wcount)); - } - - /* if flushall == 0, the lock is already held */ - if (flushall) { - reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); - } else if (mutex_trylock(&journal->j_flush_mutex)) { - BUG(); - } - - count = 0; - if (j_len_saved > journal->j_trans_max) { - reiserfs_panic(s, "journal-715", "length is %lu, trans id %lu", - j_len_saved, jl->j_trans_id); - return 0; - } - - /* if all the work is already done, get out of here */ - if (atomic_read(&jl->j_nonzerolen) <= 0 && - atomic_read(&jl->j_commit_left) <= 0) { - goto flush_older_and_return; - } - - /* - * start by putting the commit list on disk. This will also flush - * the commit lists of any olders transactions - */ - flush_commit_list(s, jl, 1); - - if (!(jl->j_state & LIST_DIRTY) - && !reiserfs_is_journal_aborted(journal)) - BUG(); - - /* are we done now? */ - if (atomic_read(&jl->j_nonzerolen) <= 0 && - atomic_read(&jl->j_commit_left) <= 0) { - goto flush_older_and_return; - } - - /* - * loop through each cnode, see if we need to write it, - * or wait on a more recent transaction, or just ignore it - */ - if (atomic_read(&journal->j_wcount) != 0) { - reiserfs_panic(s, "journal-844", "journal list is flushing, " - "wcount is not 0"); - } - cn = jl->j_realblock; - while (cn) { - was_jwait = 0; - was_dirty = 0; - saved_bh = NULL; - /* blocknr of 0 is no longer in the hash, ignore it */ - if (cn->blocknr == 0) { - goto free_cnode; - } - - /* - * This transaction failed commit. - * Don't write out to the disk - */ - if (!(jl->j_state & LIST_DIRTY)) - goto free_cnode; - - pjl = find_newer_jl_for_cn(cn); - /* - * the order is important here. We check pjl to make sure we - * don't clear BH_JDirty_wait if we aren't the one writing this - * block to disk - */ - if (!pjl && cn->bh) { - saved_bh = cn->bh; - - /* - * we do this to make sure nobody releases the - * buffer while we are working with it - */ - get_bh(saved_bh); - - if (buffer_journal_dirty(saved_bh)) { - BUG_ON(!can_dirty(cn)); - was_jwait = 1; - was_dirty = 1; - } else if (can_dirty(cn)) { - /* - * everything with !pjl && jwait - * should be writable - */ - BUG(); - } - } - - /* - * if someone has this block in a newer transaction, just make - * sure they are committed, and don't try writing it to disk - */ - if (pjl) { - if (atomic_read(&pjl->j_commit_left)) - flush_commit_list(s, pjl, 1); - goto free_cnode; - } - - /* - * bh == NULL when the block got to disk on its own, OR, - * the block got freed in a future transaction - */ - if (saved_bh == NULL) { - goto free_cnode; - } - - /* - * this should never happen. kupdate_one_transaction has - * this list locked while it works, so we should never see a - * buffer here that is not marked JDirty_wait - */ - if ((!was_jwait) && !buffer_locked(saved_bh)) { - reiserfs_warning(s, "journal-813", - "BAD! 
buffer %llu %cdirty %cjwait, " - "not in a newer transaction", - (unsigned long long)saved_bh-> - b_blocknr, was_dirty ? ' ' : '!', - was_jwait ? ' ' : '!'); - } - if (was_dirty) { - /* - * we inc again because saved_bh gets decremented - * at free_cnode - */ - get_bh(saved_bh); - set_bit(BLOCK_NEEDS_FLUSH, &cn->state); - lock_buffer(saved_bh); - BUG_ON(cn->blocknr != saved_bh->b_blocknr); - if (buffer_dirty(saved_bh)) - submit_logged_buffer(saved_bh); - else - unlock_buffer(saved_bh); - count++; - } else { - reiserfs_warning(s, "clm-2082", - "Unable to flush buffer %llu in %s", - (unsigned long long)saved_bh-> - b_blocknr, __func__); - } -free_cnode: - cn = cn->next; - if (saved_bh) { - /* - * we incremented this to keep others from - * taking the buffer head away - */ - put_bh(saved_bh); - if (atomic_read(&saved_bh->b_count) < 0) { - reiserfs_warning(s, "journal-945", - "saved_bh->b_count < 0"); - } - } - } - if (count > 0) { - cn = jl->j_realblock; - while (cn) { - if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { - if (!cn->bh) { - reiserfs_panic(s, "journal-1011", - "cn->bh is NULL"); - } - - depth = reiserfs_write_unlock_nested(s); - __wait_on_buffer(cn->bh); - reiserfs_write_lock_nested(s, depth); - - if (!cn->bh) { - reiserfs_panic(s, "journal-1012", - "cn->bh is NULL"); - } - if (unlikely(!buffer_uptodate(cn->bh))) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-949", - "buffer write failed"); -#endif - err = -EIO; - } - /* - * note, we must clear the JDirty_wait bit - * after the up to date check, otherwise we - * race against our flushpage routine - */ - BUG_ON(!test_clear_buffer_journal_dirty - (cn->bh)); - - /* drop one ref for us */ - put_bh(cn->bh); - /* drop one ref for journal_mark_dirty */ - release_buffer_page(cn->bh); - } - cn = cn->next; - } - } - - if (err) - reiserfs_abort(s, -EIO, - "Write error while pushing transaction to disk in %s", - __func__); -flush_older_and_return: - - /* - * before we can update the journal header block, we _must_ flush all - * real blocks from all older transactions to disk. This is because - * once the header block is updated, this transaction will not be - * replayed after a crash - */ - if (flushall) { - flush_older_journal_lists(s, jl); - } - - err = journal->j_errno; - /* - * before we can remove everything from the hash tables for this - * transaction, we must make sure it can never be replayed - * - * since we are only called from do_journal_end, we know for sure there - * are no allocations going on while we are flushing journal lists. 
So, - * we only need to update the journal header block for the last list - * being flushed - */ - if (!err && flushall) { - err = - update_journal_header_block(s, - (jl->j_start + jl->j_len + - 2) % SB_ONDISK_JOURNAL_SIZE(s), - jl->j_trans_id); - if (err) - reiserfs_abort(s, -EIO, - "Write error while updating journal header in %s", - __func__); - } - remove_all_from_journal_list(s, jl, 0); - list_del_init(&jl->j_list); - journal->j_num_lists--; - del_from_work_list(s, jl); - - if (journal->j_last_flush_id != 0 && - (jl->j_trans_id - journal->j_last_flush_id) != 1) { - reiserfs_warning(s, "clm-2201", "last flush %lu, current %lu", - journal->j_last_flush_id, jl->j_trans_id); - } - journal->j_last_flush_id = jl->j_trans_id; - - /* - * not strictly required since we are freeing the list, but it should - * help find code using dead lists later on - */ - jl->j_len = 0; - atomic_set(&jl->j_nonzerolen, 0); - jl->j_start = 0; - jl->j_realblock = NULL; - jl->j_commit_bh = NULL; - jl->j_trans_id = 0; - jl->j_state = 0; - put_journal_list(s, jl); - if (flushall) - mutex_unlock(&journal->j_flush_mutex); - return err; -} - -static int write_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl, - struct buffer_chunk *chunk) -{ - struct reiserfs_journal_cnode *cn; - int ret = 0; - - jl->j_state |= LIST_TOUCHED; - del_from_work_list(s, jl); - if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { - return 0; - } - - cn = jl->j_realblock; - while (cn) { - /* - * if the blocknr == 0, this has been cleared from the hash, - * skip it - */ - if (cn->blocknr == 0) { - goto next; - } - if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { - struct buffer_head *tmp_bh; - /* - * we can race against journal_mark_freed when we try - * to lock_buffer(cn->bh), so we have to inc the buffer - * count, and recheck things after locking - */ - tmp_bh = cn->bh; - get_bh(tmp_bh); - lock_buffer(tmp_bh); - if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { - if (!buffer_journal_dirty(tmp_bh) || - buffer_journal_prepared(tmp_bh)) - BUG(); - add_to_chunk(chunk, tmp_bh, NULL, write_chunk); - ret++; - } else { - /* note, cn->bh might be null now */ - unlock_buffer(tmp_bh); - } - put_bh(tmp_bh); - } -next: - cn = cn->next; - cond_resched(); - } - return ret; -} - -/* used by flush_commit_list */ -static void dirty_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - struct reiserfs_journal_cnode *cn; - struct reiserfs_journal_list *pjl; - - jl->j_state |= LIST_DIRTY; - cn = jl->j_realblock; - while (cn) { - /* - * look for a more recent transaction that logged this - * buffer. Only the most recent transaction with a buffer in - * it is allowed to send that buffer to disk - */ - pjl = find_newer_jl_for_cn(cn); - if (!pjl && cn->blocknr && cn->bh - && buffer_journal_dirty(cn->bh)) { - BUG_ON(!can_dirty(cn)); - /* - * if the buffer is prepared, it will either be logged - * or restored. 
If restored, we need to make sure - * it actually gets marked dirty - */ - clear_buffer_journal_new(cn->bh); - if (buffer_journal_prepared(cn->bh)) { - set_buffer_journal_restore_dirty(cn->bh); - } else { - set_buffer_journal_test(cn->bh); - mark_buffer_dirty(cn->bh); - } - } - cn = cn->next; - } -} - -static int kupdate_transactions(struct super_block *s, - struct reiserfs_journal_list *jl, - struct reiserfs_journal_list **next_jl, - unsigned int *next_trans_id, - int num_blocks, int num_trans) -{ - int ret = 0; - int written = 0; - int transactions_flushed = 0; - unsigned int orig_trans_id = jl->j_trans_id; - struct buffer_chunk chunk; - struct list_head *entry; - struct reiserfs_journal *journal = SB_JOURNAL(s); - chunk.nr = 0; - - reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); - if (!journal_list_still_alive(s, orig_trans_id)) { - goto done; - } - - /* - * we've got j_flush_mutex held, nobody is going to delete any - * of these lists out from underneath us - */ - while ((num_trans && transactions_flushed < num_trans) || - (!num_trans && written < num_blocks)) { - - if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || - atomic_read(&jl->j_commit_left) - || !(jl->j_state & LIST_DIRTY)) { - del_from_work_list(s, jl); - break; - } - ret = write_one_transaction(s, jl, &chunk); - - if (ret < 0) - goto done; - transactions_flushed++; - written += ret; - entry = jl->j_list.next; - - /* did we wrap? */ - if (entry == &journal->j_journal_list) { - break; - } - jl = JOURNAL_LIST_ENTRY(entry); - - /* don't bother with older transactions */ - if (jl->j_trans_id <= orig_trans_id) - break; - } - if (chunk.nr) { - write_chunk(&chunk); - } - -done: - mutex_unlock(&journal->j_flush_mutex); - return ret; -} - -/* - * for o_sync and fsync heavy applications, they tend to use - * all the journa list slots with tiny transactions. These - * trigger lots and lots of calls to update the header block, which - * adds seeks and slows things down. 
- * - * This function tries to clear out a large chunk of the journal lists - * at once, which makes everything faster since only the newest journal - * list updates the header block - */ -static int flush_used_journal_lists(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - unsigned long len = 0; - unsigned long cur_len; - int i; - int limit = 256; - struct reiserfs_journal_list *tjl; - struct reiserfs_journal_list *flush_jl; - unsigned int trans_id; - struct reiserfs_journal *journal = SB_JOURNAL(s); - - flush_jl = tjl = jl; - - /* in data logging mode, try harder to flush a lot of blocks */ - if (reiserfs_data_log(s)) - limit = 1024; - /* flush for 256 transactions or limit blocks, whichever comes first */ - for (i = 0; i < 256 && len < limit; i++) { - if (atomic_read(&tjl->j_commit_left) || - tjl->j_trans_id < jl->j_trans_id) { - break; - } - cur_len = atomic_read(&tjl->j_nonzerolen); - if (cur_len > 0) { - tjl->j_state &= ~LIST_TOUCHED; - } - len += cur_len; - flush_jl = tjl; - if (tjl->j_list.next == &journal->j_journal_list) - break; - tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); - } - get_journal_list(jl); - get_journal_list(flush_jl); - - /* - * try to find a group of blocks we can flush across all the - * transactions, but only bother if we've actually spanned - * across multiple lists - */ - if (flush_jl != jl) - kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - - flush_journal_list(s, flush_jl, 1); - put_journal_list(s, flush_jl); - put_journal_list(s, jl); - return 0; -} - -/* - * removes any nodes in table with name block and dev as bh. - * only touchs the hnext and hprev pointers. - */ -static void remove_journal_hash(struct super_block *sb, - struct reiserfs_journal_cnode **table, - struct reiserfs_journal_list *jl, - unsigned long block, int remove_freed) -{ - struct reiserfs_journal_cnode *cur; - struct reiserfs_journal_cnode **head; - - head = &(journal_hash(table, sb, block)); - if (!head) { - return; - } - cur = *head; - while (cur) { - if (cur->blocknr == block && cur->sb == sb - && (jl == NULL || jl == cur->jlist) - && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { - if (cur->hnext) { - cur->hnext->hprev = cur->hprev; - } - if (cur->hprev) { - cur->hprev->hnext = cur->hnext; - } else { - *head = cur->hnext; - } - cur->blocknr = 0; - cur->sb = NULL; - cur->state = 0; - /* - * anybody who clears the cur->bh will also - * dec the nonzerolen - */ - if (cur->bh && cur->jlist) - atomic_dec(&cur->jlist->j_nonzerolen); - cur->bh = NULL; - cur->jlist = NULL; - } - cur = cur->hnext; - } -} - -static void free_journal_ram(struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - kfree(journal->j_current_jl); - journal->j_num_lists--; - - vfree(journal->j_cnode_free_orig); - free_list_bitmaps(sb, journal->j_list_bitmap); - free_bitmap_nodes(sb); /* must be after free_list_bitmaps */ - if (journal->j_header_bh) { - brelse(journal->j_header_bh); - } - /* - * j_header_bh is on the journal dev, make sure - * not to release the journal dev until we brelse j_header_bh - */ - release_journal_dev(journal); - vfree(journal); -} - -/* - * call on unmount. Only set error to 1 if you haven't made your way out - * of read_super() yet. Any other caller must keep error at 0. 
- */ -static int do_journal_release(struct reiserfs_transaction_handle *th, - struct super_block *sb, int error) -{ - struct reiserfs_transaction_handle myth; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - /* - * we only want to flush out transactions if we were - * called with error == 0 - */ - if (!error && !sb_rdonly(sb)) { - /* end the current trans */ - BUG_ON(!th->t_trans_id); - do_journal_end(th, FLUSH_ALL); - - /* - * make sure something gets logged to force - * our way into the flush code - */ - if (!journal_join(&myth, sb)) { - reiserfs_prepare_for_journal(sb, - SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, FLUSH_ALL); - } - } - - /* this also catches errors during the do_journal_end above */ - if (!error && reiserfs_is_journal_aborted(journal)) { - memset(&myth, 0, sizeof(myth)); - if (!journal_join_abort(&myth, sb)) { - reiserfs_prepare_for_journal(sb, - SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, FLUSH_ALL); - } - } - - - /* - * We must release the write lock here because - * the workqueue job (flush_async_commit) needs this lock - */ - reiserfs_write_unlock(sb); - - /* - * Cancel flushing of old commits. Note that neither of these works - * will be requeued because superblock is being shutdown and doesn't - * have SB_ACTIVE set. - */ - reiserfs_cancel_old_flush(sb); - /* wait for all commits to finish */ - cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); - - free_journal_ram(sb); - - reiserfs_write_lock(sb); - - return 0; -} - -/* * call on unmount. flush all journal trans, release all alloc'd ram */ -int journal_release(struct reiserfs_transaction_handle *th, - struct super_block *sb) -{ - return do_journal_release(th, sb, 0); -} - -/* only call from an error condition inside reiserfs_read_super! */ -int journal_release_error(struct reiserfs_transaction_handle *th, - struct super_block *sb) -{ - return do_journal_release(th, sb, 1); -} - -/* - * compares description block with commit block. 
- * returns 1 if they differ, 0 if they are the same - */ -static int journal_compare_desc_commit(struct super_block *sb, - struct reiserfs_journal_desc *desc, - struct reiserfs_journal_commit *commit) -{ - if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || - get_commit_trans_len(commit) != get_desc_trans_len(desc) || - get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max || - get_commit_trans_len(commit) <= 0) { - return 1; - } - return 0; -} - -/* - * returns 0 if it did not find a description block - * returns -1 if it found a corrupt commit block - * returns 1 if both desc and commit were valid - * NOTE: only called during fs mount - */ -static int journal_transaction_is_valid(struct super_block *sb, - struct buffer_head *d_bh, - unsigned int *oldest_invalid_trans_id, - unsigned long *newest_mount_id) -{ - struct reiserfs_journal_desc *desc; - struct reiserfs_journal_commit *commit; - struct buffer_head *c_bh; - unsigned long offset; - - if (!d_bh) - return 0; - - desc = (struct reiserfs_journal_desc *)d_bh->b_data; - if (get_desc_trans_len(desc) > 0 - && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { - if (oldest_invalid_trans_id && *oldest_invalid_trans_id - && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-986: transaction " - "is valid returning because trans_id %d is greater than " - "oldest_invalid %lu", - get_desc_trans_id(desc), - *oldest_invalid_trans_id); - return 0; - } - if (newest_mount_id - && *newest_mount_id > get_desc_mount_id(desc)) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1087: transaction " - "is valid returning because mount_id %d is less than " - "newest_mount_id %lu", - get_desc_mount_id(desc), - *newest_mount_id); - return -1; - } - if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) { - reiserfs_warning(sb, "journal-2018", - "Bad transaction length %d " - "encountered, ignoring transaction", - get_desc_trans_len(desc)); - return -1; - } - offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); - - /* - * ok, we have a journal description block, - * let's see if the transaction was valid - */ - c_bh = - journal_bread(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - ((offset + get_desc_trans_len(desc) + - 1) % SB_ONDISK_JOURNAL_SIZE(sb))); - if (!c_bh) - return 0; - commit = (struct reiserfs_journal_commit *)c_bh->b_data; - if (journal_compare_desc_commit(sb, desc, commit)) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal_transaction_is_valid, commit offset %ld had bad " - "time %d or length %d", - c_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - get_commit_trans_id(commit), - get_commit_trans_len(commit)); - brelse(c_bh); - if (oldest_invalid_trans_id) { - *oldest_invalid_trans_id = - get_desc_trans_id(desc); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1004: " - "transaction_is_valid setting oldest invalid trans_id " - "to %d", - get_desc_trans_id(desc)); - } - return -1; - } - brelse(c_bh); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1006: found valid " - "transaction start offset %llu, len %d id %d", - d_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - get_desc_trans_len(desc), - get_desc_trans_id(desc)); - return 1; - } else { - return 0; - } -} - -static void brelse_array(struct buffer_head **heads, int num) -{ - int i; - for (i = 0; i < num; i++) { - brelse(heads[i]); - } -} - -/* - * given the start, and values for the oldest acceptable transactions, - * this either reads in a replays a transaction, or returns 
because the - * transaction is invalid, or too old. - * NOTE: only called during fs mount - */ -static int journal_read_transaction(struct super_block *sb, - unsigned long cur_dblock, - unsigned long oldest_start, - unsigned int oldest_trans_id, - unsigned long newest_mount_id) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_desc *desc; - struct reiserfs_journal_commit *commit; - unsigned int trans_id = 0; - struct buffer_head *c_bh; - struct buffer_head *d_bh; - struct buffer_head **log_blocks = NULL; - struct buffer_head **real_blocks = NULL; - unsigned int trans_offset; - int i; - int trans_half; - - d_bh = journal_bread(sb, cur_dblock); - if (!d_bh) - return 1; - desc = (struct reiserfs_journal_desc *)d_bh->b_data; - trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: " - "journal_read_transaction, offset %llu, len %d mount_id %d", - d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - get_desc_trans_len(desc), get_desc_mount_id(desc)); - if (get_desc_trans_id(desc) < oldest_trans_id) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: " - "journal_read_trans skipping because %lu is too old", - cur_dblock - - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); - brelse(d_bh); - return 1; - } - if (get_desc_mount_id(desc) != newest_mount_id) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: " - "journal_read_trans skipping because %d is != " - "newest_mount_id %lu", get_desc_mount_id(desc), - newest_mount_id); - brelse(d_bh); - return 1; - } - c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - ((trans_offset + get_desc_trans_len(desc) + 1) % - SB_ONDISK_JOURNAL_SIZE(sb))); - if (!c_bh) { - brelse(d_bh); - return 1; - } - commit = (struct reiserfs_journal_commit *)c_bh->b_data; - if (journal_compare_desc_commit(sb, desc, commit)) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal_read_transaction, " - "commit offset %llu had bad time %d or length %d", - c_bh->b_blocknr - - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - get_commit_trans_id(commit), - get_commit_trans_len(commit)); - brelse(c_bh); - brelse(d_bh); - return 1; - } - - if (bdev_read_only(sb->s_bdev)) { - reiserfs_warning(sb, "clm-2076", - "device is readonly, unable to replay log"); - brelse(c_bh); - brelse(d_bh); - return -EROFS; - } - - trans_id = get_desc_trans_id(desc); - /* - * now we know we've got a good transaction, and it was - * inside the valid time ranges - */ - log_blocks = kmalloc_array(get_desc_trans_len(desc), - sizeof(struct buffer_head *), - GFP_NOFS); - real_blocks = kmalloc_array(get_desc_trans_len(desc), - sizeof(struct buffer_head *), - GFP_NOFS); - if (!log_blocks || !real_blocks) { - brelse(c_bh); - brelse(d_bh); - kfree(log_blocks); - kfree(real_blocks); - reiserfs_warning(sb, "journal-1169", - "kmalloc failed, unable to mount FS"); - return -1; - } - /* get all the buffer heads */ - trans_half = journal_trans_half(sb->s_blocksize); - for (i = 0; i < get_desc_trans_len(desc); i++) { - log_blocks[i] = - journal_getblk(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - (trans_offset + 1 + - i) % SB_ONDISK_JOURNAL_SIZE(sb)); - if (i < trans_half) { - real_blocks[i] = - sb_getblk(sb, - le32_to_cpu(desc->j_realblock[i])); - } else { - real_blocks[i] = - sb_getblk(sb, - le32_to_cpu(commit-> - j_realblock[i - trans_half])); - } - if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) { - reiserfs_warning(sb, "journal-1207", - "REPLAY FAILURE fsck required! 
" - "Block to replay is outside of " - "filesystem"); - goto abort_replay; - } - /* make sure we don't try to replay onto log or reserved area */ - if (is_block_in_log_or_reserved_area - (sb, real_blocks[i]->b_blocknr)) { - reiserfs_warning(sb, "journal-1204", - "REPLAY FAILURE fsck required! " - "Trying to replay onto a log block"); -abort_replay: - brelse_array(log_blocks, i); - brelse_array(real_blocks, i); - brelse(c_bh); - brelse(d_bh); - kfree(log_blocks); - kfree(real_blocks); - return -1; - } - } - /* read in the log blocks, memcpy to the corresponding real block */ - bh_read_batch(get_desc_trans_len(desc), log_blocks); - for (i = 0; i < get_desc_trans_len(desc); i++) { - - wait_on_buffer(log_blocks[i]); - if (!buffer_uptodate(log_blocks[i])) { - reiserfs_warning(sb, "journal-1212", - "REPLAY FAILURE fsck required! " - "buffer write failed"); - brelse_array(log_blocks + i, - get_desc_trans_len(desc) - i); - brelse_array(real_blocks, get_desc_trans_len(desc)); - brelse(c_bh); - brelse(d_bh); - kfree(log_blocks); - kfree(real_blocks); - return -1; - } - memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, - real_blocks[i]->b_size); - set_buffer_uptodate(real_blocks[i]); - brelse(log_blocks[i]); - } - /* flush out the real blocks */ - for (i = 0; i < get_desc_trans_len(desc); i++) { - set_buffer_dirty(real_blocks[i]); - write_dirty_buffer(real_blocks[i], 0); - } - for (i = 0; i < get_desc_trans_len(desc); i++) { - wait_on_buffer(real_blocks[i]); - if (!buffer_uptodate(real_blocks[i])) { - reiserfs_warning(sb, "journal-1226", - "REPLAY FAILURE, fsck required! " - "buffer write failed"); - brelse_array(real_blocks + i, - get_desc_trans_len(desc) - i); - brelse(c_bh); - brelse(d_bh); - kfree(log_blocks); - kfree(real_blocks); - return -1; - } - brelse(real_blocks[i]); - } - cur_dblock = - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - ((trans_offset + get_desc_trans_len(desc) + - 2) % SB_ONDISK_JOURNAL_SIZE(sb)); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1095: setting journal " "start to offset %ld", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); - - /* - * init starting values for the first transaction, in case - * this is the last transaction to be replayed. - */ - journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); - journal->j_last_flush_trans_id = trans_id; - journal->j_trans_id = trans_id + 1; - /* check for trans_id overflow */ - if (journal->j_trans_id == 0) - journal->j_trans_id = 10; - brelse(c_bh); - brelse(d_bh); - kfree(log_blocks); - kfree(real_blocks); - return 0; -} - -/* - * This function reads blocks starting from block and to max_block of bufsize - * size (but no more than BUFNR blocks at a time). This proved to improve - * mounting speed on self-rebuilding raid5 arrays at least. - * Right now it is only used from journal code. But later we might use it - * from other places. - * Note: Do not use journal_getblk/sb_getblk functions here! 
- */ -static struct buffer_head *reiserfs_breada(struct block_device *dev, - b_blocknr_t block, int bufsize, - b_blocknr_t max_block) -{ - struct buffer_head *bhlist[BUFNR]; - unsigned int blocks = BUFNR; - struct buffer_head *bh; - int i, j; - - bh = __getblk(dev, block, bufsize); - if (!bh || buffer_uptodate(bh)) - return (bh); - - if (block + BUFNR > max_block) { - blocks = max_block - block; - } - bhlist[0] = bh; - j = 1; - for (i = 1; i < blocks; i++) { - bh = __getblk(dev, block + i, bufsize); - if (!bh) - break; - if (buffer_uptodate(bh)) { - brelse(bh); - break; - } else - bhlist[j++] = bh; - } - bh = bhlist[0]; - bh_read_nowait(bh, 0); - bh_readahead_batch(j - 1, &bhlist[1], 0); - for (i = 1; i < j; i++) - brelse(bhlist[i]); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - brelse(bh); - return NULL; -} - -/* - * read and replay the log - * on a clean unmount, the journal header's next unflushed pointer will be - * to an invalid transaction. This tests that before finding all the - * transactions in the log, which makes normal mount times fast. - * - * After a crash, this starts with the next unflushed transaction, and - * replays until it finds one too old, or invalid. - * - * On exit, it sets things up so the first transaction will work correctly. - * NOTE: only called during fs mount - */ -static int journal_read(struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_desc *desc; - unsigned int oldest_trans_id = 0; - unsigned int oldest_invalid_trans_id = 0; - time64_t start; - unsigned long oldest_start = 0; - unsigned long cur_dblock = 0; - unsigned long newest_mount_id = 9; - struct buffer_head *d_bh; - struct reiserfs_journal_header *jh; - int valid_journal_header = 0; - int replay_count = 0; - int continue_replay = 1; - int ret; - - cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); - reiserfs_info(sb, "checking transaction log (%pg)\n", - file_bdev(journal->j_bdev_file)); - start = ktime_get_seconds(); - - /* - * step 1, read in the journal header block. Check the transaction - * it says is the first unflushed, and if that transaction is not - * valid, replay is done - */ - journal->j_header_bh = journal_bread(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) - + SB_ONDISK_JOURNAL_SIZE(sb)); - if (!journal->j_header_bh) { - return 1; - } - jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); - if (le32_to_cpu(jh->j_first_unflushed_offset) < - SB_ONDISK_JOURNAL_SIZE(sb) - && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { - oldest_start = - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - le32_to_cpu(jh->j_first_unflushed_offset); - oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; - newest_mount_id = le32_to_cpu(jh->j_mount_id); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1153: found in " - "header: first_unflushed_offset %d, last_flushed_trans_id " - "%lu", le32_to_cpu(jh->j_first_unflushed_offset), - le32_to_cpu(jh->j_last_flush_trans_id)); - valid_journal_header = 1; - - /* - * now, we try to read the first unflushed offset. If it - * is not valid, there is nothing more we can do, and it - * makes no sense to read through the whole log. - */ - d_bh = - journal_bread(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - le32_to_cpu(jh->j_first_unflushed_offset)); - ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL); - if (!ret) { - continue_replay = 0; - } - brelse(d_bh); - goto start_log_replay; - } - - /* - * ok, there are transactions that need to be replayed. 
start - * with the first log block, find all the valid transactions, and - * pick out the oldest. - */ - while (continue_replay - && cur_dblock < - (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - SB_ONDISK_JOURNAL_SIZE(sb))) { - /* - * Note that it is required for blocksize of primary fs - * device and journal device to be the same - */ - d_bh = - reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock, - sb->s_blocksize, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - SB_ONDISK_JOURNAL_SIZE(sb)); - ret = - journal_transaction_is_valid(sb, d_bh, - &oldest_invalid_trans_id, - &newest_mount_id); - if (ret == 1) { - desc = (struct reiserfs_journal_desc *)d_bh->b_data; - if (oldest_start == 0) { /* init all oldest_ values */ - oldest_trans_id = get_desc_trans_id(desc); - oldest_start = d_bh->b_blocknr; - newest_mount_id = get_desc_mount_id(desc); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1179: Setting " - "oldest_start to offset %llu, trans_id %lu", - oldest_start - - SB_ONDISK_JOURNAL_1st_BLOCK - (sb), oldest_trans_id); - } else if (oldest_trans_id > get_desc_trans_id(desc)) { - /* one we just read was older */ - oldest_trans_id = get_desc_trans_id(desc); - oldest_start = d_bh->b_blocknr; - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1180: Resetting " - "oldest_start to offset %lu, trans_id %lu", - oldest_start - - SB_ONDISK_JOURNAL_1st_BLOCK - (sb), oldest_trans_id); - } - if (newest_mount_id < get_desc_mount_id(desc)) { - newest_mount_id = get_desc_mount_id(desc); - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1299: Setting " - "newest_mount_id to %d", - get_desc_mount_id(desc)); - } - cur_dblock += get_desc_trans_len(desc) + 2; - } else { - cur_dblock++; - } - brelse(d_bh); - } - -start_log_replay: - cur_dblock = oldest_start; - if (oldest_trans_id) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1206: Starting replay " - "from offset %llu, trans_id %lu", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - oldest_trans_id); - - } - replay_count = 0; - while (continue_replay && oldest_trans_id > 0) { - ret = - journal_read_transaction(sb, cur_dblock, oldest_start, - oldest_trans_id, newest_mount_id); - if (ret < 0) { - return ret; - } else if (ret != 0) { - break; - } - cur_dblock = - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start; - replay_count++; - if (cur_dblock == oldest_start) - break; - } - - if (oldest_trans_id == 0) { - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "journal-1225: No valid " "transactions found"); - } - /* - * j_start does not get set correctly if we don't replay any - * transactions. 
if we had a valid journal_header, set j_start - * to the first unflushed transaction value, copy the trans_id - * from the header - */ - if (valid_journal_header && replay_count == 0) { - journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); - journal->j_trans_id = - le32_to_cpu(jh->j_last_flush_trans_id) + 1; - /* check for trans_id overflow */ - if (journal->j_trans_id == 0) - journal->j_trans_id = 10; - journal->j_last_flush_trans_id = - le32_to_cpu(jh->j_last_flush_trans_id); - journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; - } else { - journal->j_mount_id = newest_mount_id + 1; - } - reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " - "newest_mount_id to %lu", journal->j_mount_id); - journal->j_first_unflushed_offset = journal->j_start; - if (replay_count > 0) { - reiserfs_info(sb, - "replayed %d transactions in %lu seconds\n", - replay_count, ktime_get_seconds() - start); - } - /* needed to satisfy the locking in _update_journal_header_block */ - reiserfs_write_lock(sb); - if (!bdev_read_only(sb->s_bdev) && - _update_journal_header_block(sb, journal->j_start, - journal->j_last_flush_trans_id)) { - reiserfs_write_unlock(sb); - /* - * replay failed, caller must call free_journal_ram and abort - * the mount - */ - return -1; - } - reiserfs_write_unlock(sb); - return 0; -} - -static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) -{ - struct reiserfs_journal_list *jl; - jl = kzalloc(sizeof(struct reiserfs_journal_list), - GFP_NOFS | __GFP_NOFAIL); - INIT_LIST_HEAD(&jl->j_list); - INIT_LIST_HEAD(&jl->j_working_list); - INIT_LIST_HEAD(&jl->j_tail_bh_list); - INIT_LIST_HEAD(&jl->j_bh_list); - mutex_init(&jl->j_commit_mutex); - SB_JOURNAL(s)->j_num_lists++; - get_journal_list(jl); - return jl; -} - -static void journal_list_init(struct super_block *sb) -{ - SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); -} - -static void release_journal_dev(struct reiserfs_journal *journal) -{ - if (journal->j_bdev_file) { - bdev_fput(journal->j_bdev_file); - journal->j_bdev_file = NULL; - } -} - -static int journal_init_dev(struct super_block *super, - struct reiserfs_journal *journal, - const char *jdev_name) -{ - blk_mode_t blkdev_mode = BLK_OPEN_READ; - void *holder = journal; - int result; - dev_t jdev; - - result = 0; - - journal->j_bdev_file = NULL; - jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? 
- new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; - - if (!bdev_read_only(super->s_bdev)) - blkdev_mode |= BLK_OPEN_WRITE; - - /* there is no "jdev" option and journal is on separate device */ - if ((!jdev_name || !jdev_name[0])) { - if (jdev == super->s_dev) - holder = NULL; - journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode, - holder, NULL); - if (IS_ERR(journal->j_bdev_file)) { - result = PTR_ERR(journal->j_bdev_file); - journal->j_bdev_file = NULL; - reiserfs_warning(super, "sh-458", - "cannot init journal device unknown-block(%u,%u): %i", - MAJOR(jdev), MINOR(jdev), result); - return result; - } else if (jdev != super->s_dev) - set_blocksize(journal->j_bdev_file, super->s_blocksize); - - return 0; - } - - journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode, - holder, NULL); - if (IS_ERR(journal->j_bdev_file)) { - result = PTR_ERR(journal->j_bdev_file); - journal->j_bdev_file = NULL; - reiserfs_warning(super, "sh-457", - "journal_init_dev: Cannot open '%s': %i", - jdev_name, result); - return result; - } - - set_blocksize(journal->j_bdev_file, super->s_blocksize); - reiserfs_info(super, - "journal_init_dev: journal device: %pg\n", - file_bdev(journal->j_bdev_file)); - return 0; -} - -/* - * When creating/tuning a file system user can assign some - * journal params within boundaries which depend on the ratio - * blocksize/standard_blocksize. - * - * For blocks >= standard_blocksize transaction size should - * be not less then JOURNAL_TRANS_MIN_DEFAULT, and not more - * then JOURNAL_TRANS_MAX_DEFAULT. - * - * For blocks < standard_blocksize these boundaries should be - * decreased proportionally. - */ -#define REISERFS_STANDARD_BLKSIZE (4096) - -static int check_advise_trans_params(struct super_block *sb, - struct reiserfs_journal *journal) -{ - if (journal->j_trans_max) { - /* Non-default journal params. Do sanity check for them. */ - int ratio = 1; - if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) - ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; - - if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio || - journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio || - SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max < - JOURNAL_MIN_RATIO) { - reiserfs_warning(sb, "sh-462", - "bad transaction max size (%u). " - "FSCK?", journal->j_trans_max); - return 1; - } - if (journal->j_max_batch != (journal->j_trans_max) * - JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) { - reiserfs_warning(sb, "sh-463", - "bad transaction max batch (%u). " - "FSCK?", journal->j_max_batch); - return 1; - } - } else { - /* - * Default journal params. - * The file system was created by old version - * of mkreiserfs, so some fields contain zeros, - * and we need to advise proper values for them - */ - if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { - reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", - sb->s_blocksize); - return 1; - } - journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; - journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; - journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; - } - return 0; -} - -/* must be called once on fs mount. 
calls journal_read for you */ -int journal_init(struct super_block *sb, const char *j_dev_name, - int old_format, unsigned int commit_max_age) -{ - int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2; - struct buffer_head *bhjh; - struct reiserfs_super_block *rs; - struct reiserfs_journal_header *jh; - struct reiserfs_journal *journal; - struct reiserfs_journal_list *jl; - int ret; - - journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); - if (!journal) { - reiserfs_warning(sb, "journal-1256", - "unable to get memory for journal structure"); - return 1; - } - INIT_LIST_HEAD(&journal->j_bitmap_nodes); - INIT_LIST_HEAD(&journal->j_prealloc_list); - INIT_LIST_HEAD(&journal->j_working_list); - INIT_LIST_HEAD(&journal->j_journal_list); - journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb))) - goto free_and_return; - - allocate_bitmap_nodes(sb); - - /* reserved for journal area support */ - SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ? - REISERFS_OLD_DISK_OFFSET_IN_BYTES - / sb->s_blocksize + - reiserfs_bmap_count(sb) + - 1 : - REISERFS_DISK_OFFSET_IN_BYTES / - sb->s_blocksize + 2); - - /* - * Sanity check to see is the standard journal fitting - * within first bitmap (actual for small blocksizes) - */ - if (!SB_ONDISK_JOURNAL_DEVICE(sb) && - (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + - SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { - reiserfs_warning(sb, "journal-1393", - "journal does not fit for area addressed " - "by first of bitmap blocks. It starts at " - "%u and its size is %u. Block size %ld", - SB_JOURNAL_1st_RESERVED_BLOCK(sb), - SB_ONDISK_JOURNAL_SIZE(sb), - sb->s_blocksize); - goto free_and_return; - } - - /* - * Sanity check to see if journal first block is correct. - * If journal first block is invalid it can cause - * zeroing important superblock members. 
- */ - if (!SB_ONDISK_JOURNAL_DEVICE(sb) && - SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) { - reiserfs_warning(sb, "journal-1393", - "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d", - SB_JOURNAL_1st_RESERVED_BLOCK(sb), - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); - goto free_and_return; - } - - if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_warning(sb, "sh-462", - "unable to initialize journal device"); - goto free_and_return; - } - - rs = SB_DISK_SUPER_BLOCK(sb); - - /* read journal header */ - bhjh = journal_bread(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - SB_ONDISK_JOURNAL_SIZE(sb)); - if (!bhjh) { - reiserfs_warning(sb, "sh-459", - "unable to read journal header"); - goto free_and_return; - } - jh = (struct reiserfs_journal_header *)(bhjh->b_data); - - /* make sure that journal matches to the super block */ - if (is_reiserfs_jr(rs) - && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != - sb_jp_journal_magic(rs))) { - reiserfs_warning(sb, "sh-460", - "journal header magic %x (device %pg) does " - "not match to magic found in super block %x", - jh->jh_journal.jp_journal_magic, - file_bdev(journal->j_bdev_file), - sb_jp_journal_magic(rs)); - brelse(bhjh); - goto free_and_return; - } - - journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); - journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); - journal->j_max_commit_age = - le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); - journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; - - if (check_advise_trans_params(sb, journal) != 0) - goto free_and_return; - journal->j_default_max_commit_age = journal->j_max_commit_age; - - if (commit_max_age != 0) { - journal->j_max_commit_age = commit_max_age; - journal->j_max_trans_age = commit_max_age; - } - - reiserfs_info(sb, "journal params: device %pg, size %u, " - "journal first block %u, max trans len %u, max batch %u, " - "max commit age %u, max trans age %u\n", - file_bdev(journal->j_bdev_file), - SB_ONDISK_JOURNAL_SIZE(sb), - SB_ONDISK_JOURNAL_1st_BLOCK(sb), - journal->j_trans_max, - journal->j_max_batch, - journal->j_max_commit_age, journal->j_max_trans_age); - - brelse(bhjh); - - journal->j_list_bitmap_index = 0; - journal_list_init(sb); - - memset(journal->j_list_hash_table, 0, - JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); - - INIT_LIST_HEAD(&journal->j_dirty_buffers); - spin_lock_init(&journal->j_dirty_buffers_lock); - - journal->j_start = 0; - journal->j_len = 0; - journal->j_len_alloc = 0; - atomic_set(&journal->j_wcount, 0); - atomic_set(&journal->j_async_throttle, 0); - journal->j_bcount = 0; - journal->j_trans_start_time = 0; - journal->j_last = NULL; - journal->j_first = NULL; - init_waitqueue_head(&journal->j_join_wait); - mutex_init(&journal->j_mutex); - mutex_init(&journal->j_flush_mutex); - - journal->j_trans_id = 10; - journal->j_mount_id = 10; - journal->j_state = 0; - atomic_set(&journal->j_jlock, 0); - journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - journal->j_cnode_free_orig = journal->j_cnode_free_list; - journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; - journal->j_cnode_used = 0; - journal->j_must_wait = 0; - - if (journal->j_cnode_free == 0) { - reiserfs_warning(sb, "journal-2004", "Journal cnode memory " - "allocation failed (%ld bytes). Journal is " - "too large for available memory. 
Usually " - "this is due to a journal that is too large.", - sizeof (struct reiserfs_journal_cnode) * num_cnodes); - goto free_and_return; - } - - init_journal_hash(sb); - jl = journal->j_current_jl; - - /* - * get_list_bitmap() may call flush_commit_list() which - * requires the lock. Calling flush_commit_list() shouldn't happen - * this early but I like to be paranoid. - */ - reiserfs_write_lock(sb); - jl->j_list_bitmap = get_list_bitmap(sb, jl); - reiserfs_write_unlock(sb); - if (!jl->j_list_bitmap) { - reiserfs_warning(sb, "journal-2005", - "get_list_bitmap failed for journal list 0"); - goto free_and_return; - } - - ret = journal_read(sb); - if (ret < 0) { - reiserfs_warning(sb, "reiserfs-2006", - "Replay Failure, unable to mount"); - goto free_and_return; - } - - INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); - journal->j_work_sb = sb; - return 0; -free_and_return: - free_journal_ram(sb); - return 1; -} - -/* - * test for a polite end of the current transaction. Used by file_write, - * and should be used by delete to make sure they don't write more than - * can fit inside a single transaction - */ -int journal_transaction_should_end(struct reiserfs_transaction_handle *th, - int new_alloc) -{ - struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); - time64_t now = ktime_get_seconds(); - /* cannot restart while nested */ - BUG_ON(!th->t_trans_id); - if (th->t_refcount > 1) - return 0; - if (journal->j_must_wait > 0 || - (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || - atomic_read(&journal->j_jlock) || - (now - journal->j_trans_start_time) > journal->j_max_trans_age || - journal->j_cnode_free < (journal->j_trans_max * 3)) { - return 1; - } - - journal->j_len_alloc += new_alloc; - th->t_blocks_allocated += new_alloc ; - return 0; -} - -/* this must be called inside a transaction */ -void reiserfs_block_writes(struct reiserfs_transaction_handle *th) -{ - struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); - BUG_ON(!th->t_trans_id); - journal->j_must_wait = 1; - set_bit(J_WRITERS_BLOCKED, &journal->j_state); - return; -} - -/* this must be called without a transaction started */ -void reiserfs_allow_writes(struct super_block *s) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - clear_bit(J_WRITERS_BLOCKED, &journal->j_state); - wake_up(&journal->j_join_wait); -} - -/* this must be called without a transaction started */ -void reiserfs_wait_on_write_block(struct super_block *s) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - wait_event(journal->j_join_wait, - !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); -} - -static void queue_log_writer(struct super_block *s) -{ - wait_queue_entry_t wait; - struct reiserfs_journal *journal = SB_JOURNAL(s); - set_bit(J_WRITERS_QUEUED, &journal->j_state); - - /* - * we don't want to use wait_event here because - * we only want to wait once. 
- */ - init_waitqueue_entry(&wait, current); - add_wait_queue(&journal->j_join_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { - int depth = reiserfs_write_unlock_nested(s); - schedule(); - reiserfs_write_lock_nested(s, depth); - } - __set_current_state(TASK_RUNNING); - remove_wait_queue(&journal->j_join_wait, &wait); -} - -static void wake_queued_writers(struct super_block *s) -{ - struct reiserfs_journal *journal = SB_JOURNAL(s); - if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) - wake_up(&journal->j_join_wait); -} - -static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - unsigned long bcount = journal->j_bcount; - while (1) { - int depth; - - depth = reiserfs_write_unlock_nested(sb); - schedule_timeout_uninterruptible(1); - reiserfs_write_lock_nested(sb, depth); - - journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; - while ((atomic_read(&journal->j_wcount) > 0 || - atomic_read(&journal->j_jlock)) && - journal->j_trans_id == trans_id) { - queue_log_writer(sb); - } - if (journal->j_trans_id != trans_id) - break; - if (bcount == journal->j_bcount) - break; - bcount = journal->j_bcount; - } -} - -/* - * join == true if you must join an existing transaction. - * join == false if you can deal with waiting for others to finish - * - * this will block until the transaction is joinable. send the number of - * blocks you expect to use in nblocks. -*/ -static int do_journal_begin_r(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks, - int join) -{ - time64_t now = ktime_get_seconds(); - unsigned int old_trans_id; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_transaction_handle myth; - int retval; - int depth; - - reiserfs_check_lock_depth(sb, "journal_begin"); - BUG_ON(nblocks > journal->j_trans_max); - - PROC_INFO_INC(sb, journal.journal_being); - /* set here for journal_join */ - th->t_refcount = 1; - th->t_super = sb; - -relock: - lock_journal(sb); - if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { - unlock_journal(sb); - retval = journal->j_errno; - goto out_fail; - } - journal->j_bcount++; - - if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { - unlock_journal(sb); - depth = reiserfs_write_unlock_nested(sb); - reiserfs_wait_on_write_block(sb); - reiserfs_write_lock_nested(sb, depth); - PROC_INFO_INC(sb, journal.journal_relock_writers); - goto relock; - } - now = ktime_get_seconds(); - - /* - * if there is no room in the journal OR - * if this transaction is too old, and we weren't called joinable, - * wait for it to finish before beginning we don't sleep if there - * aren't other writers - */ - - if ((!join && journal->j_must_wait > 0) || - (!join - && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) - || (!join && atomic_read(&journal->j_wcount) > 0 - && journal->j_trans_start_time > 0 - && (now - journal->j_trans_start_time) > - journal->j_max_trans_age) || (!join - && atomic_read(&journal->j_jlock)) - || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { - - old_trans_id = journal->j_trans_id; - /* allow others to finish this transaction */ - unlock_journal(sb); - - if (!join && (journal->j_len_alloc + nblocks + 2) >= - journal->j_max_batch && - ((journal->j_len + nblocks + 2) * 100) < - (journal->j_len_alloc * 75)) { - if (atomic_read(&journal->j_wcount) > 10) { - queue_log_writer(sb); - goto relock; - } - } - /* - * 
don't mess with joining the transaction if all we - * have to do is wait for someone else to do a commit - */ - if (atomic_read(&journal->j_jlock)) { - while (journal->j_trans_id == old_trans_id && - atomic_read(&journal->j_jlock)) { - queue_log_writer(sb); - } - goto relock; - } - retval = journal_join(&myth, sb); - if (retval) - goto out_fail; - - /* someone might have ended the transaction while we joined */ - if (old_trans_id != journal->j_trans_id) { - retval = do_journal_end(&myth, 0); - } else { - retval = do_journal_end(&myth, COMMIT_NOW); - } - - if (retval) - goto out_fail; - - PROC_INFO_INC(sb, journal.journal_relock_wcount); - goto relock; - } - /* we are the first writer, set trans_id */ - if (journal->j_trans_start_time == 0) { - journal->j_trans_start_time = ktime_get_seconds(); - } - atomic_inc(&journal->j_wcount); - journal->j_len_alloc += nblocks; - th->t_blocks_logged = 0; - th->t_blocks_allocated = nblocks; - th->t_trans_id = journal->j_trans_id; - unlock_journal(sb); - INIT_LIST_HEAD(&th->t_list); - return 0; - -out_fail: - memset(th, 0, sizeof(*th)); - /* - * Re-set th->t_super, so we can properly keep track of how many - * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later - */ - th->t_super = sb; - return retval; -} - -struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct - super_block - *s, - int nblocks) -{ - int ret; - struct reiserfs_transaction_handle *th; - - /* - * if we're nesting into an existing transaction. It will be - * persistent on its own - */ - if (reiserfs_transaction_running(s)) { - th = current->journal_info; - th->t_refcount++; - BUG_ON(th->t_refcount < 2); - - return th; - } - th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); - if (!th) - return NULL; - ret = journal_begin(th, s, nblocks); - if (ret) { - kfree(th); - return NULL; - } - - SB_JOURNAL(s)->j_persistent_trans++; - return th; -} - -int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) -{ - struct super_block *s = th->t_super; - int ret = 0; - if (th->t_trans_id) - ret = journal_end(th); - else - ret = -EIO; - if (th->t_refcount == 0) { - SB_JOURNAL(s)->j_persistent_trans--; - kfree(th); - } - return ret; -} - -static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb) -{ - struct reiserfs_transaction_handle *cur_th = current->journal_info; - - /* - * this keeps do_journal_end from NULLing out the - * current->journal_info pointer - */ - th->t_handle_save = cur_th; - BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN); -} - -int journal_join_abort(struct reiserfs_transaction_handle *th, - struct super_block *sb) -{ - struct reiserfs_transaction_handle *cur_th = current->journal_info; - - /* - * this keeps do_journal_end from NULLing out the - * current->journal_info pointer - */ - th->t_handle_save = cur_th; - BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT); -} - -int journal_begin(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) -{ - struct reiserfs_transaction_handle *cur_th = current->journal_info; - int ret; - - th->t_handle_save = NULL; - if (cur_th) { - /* we are nesting into the current transaction */ - if (cur_th->t_super == sb) { - BUG_ON(!cur_th->t_refcount); - cur_th->t_refcount++; - memcpy(th, cur_th, sizeof(*th)); - if (th->t_refcount <= 1) - reiserfs_warning(sb, 
"reiserfs-2005", - "BAD: refcount <= 1, but " - "journal_info != 0"); - return 0; - } else { - /* - * we've ended up with a handle from a different - * filesystem. save it and restore on journal_end. - * This should never really happen... - */ - reiserfs_warning(sb, "clm-2100", - "nesting info a different FS"); - th->t_handle_save = current->journal_info; - current->journal_info = th; - } - } else { - current->journal_info = th; - } - ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); - BUG_ON(current->journal_info != th); - - /* - * I guess this boils down to being the reciprocal of clm-2100 above. - * If do_journal_begin_r fails, we need to put it back, since - * journal_end won't be called to do it. */ - if (ret) - current->journal_info = th->t_handle_save; - else - BUG_ON(!th->t_refcount); - - return ret; -} - -/* - * puts bh into the current transaction. If it was already there, reorders - * removes the old pointers from the hash, and puts new ones in (to make - * sure replay happen in the right order). - * - * if it was dirty, cleans and files onto the clean list. I can't let it - * be dirty again until the transaction is committed. - * - * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. - */ -int journal_mark_dirty(struct reiserfs_transaction_handle *th, - struct buffer_head *bh) -{ - struct super_block *sb = th->t_super; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_cnode *cn = NULL; - int count_already_incd = 0; - int prepared = 0; - BUG_ON(!th->t_trans_id); - - PROC_INFO_INC(sb, journal.mark_dirty); - if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, "journal-1577", - "handle trans id %ld != current trans id %ld", - th->t_trans_id, journal->j_trans_id); - } - - prepared = test_clear_buffer_journal_prepared(bh); - clear_buffer_journal_restore_dirty(bh); - /* already in this transaction, we are done */ - if (buffer_journaled(bh)) { - PROC_INFO_INC(sb, journal.mark_dirty_already); - return 0; - } - - /* - * this must be turned into a panic instead of a warning. We can't - * allow a dirty or journal_dirty or locked buffer to be logged, as - * some changes could get to disk too early. NOT GOOD. - */ - if (!prepared || buffer_dirty(bh)) { - reiserfs_warning(sb, "journal-1777", - "buffer %llu bad state " - "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", - (unsigned long long)bh->b_blocknr, - prepared ? ' ' : '!', - buffer_locked(bh) ? ' ' : '!', - buffer_dirty(bh) ? ' ' : '!', - buffer_journal_dirty(bh) ? ' ' : '!'); - } - - if (atomic_read(&journal->j_wcount) <= 0) { - reiserfs_warning(sb, "journal-1409", - "returning because j_wcount was %d", - atomic_read(&journal->j_wcount)); - return 1; - } - /* - * this error means I've screwed up, and we've overflowed - * the transaction. Nothing can be done here, except make the - * FS readonly or panic. 
- */ - if (journal->j_len >= journal->j_trans_max) { - reiserfs_panic(th->t_super, "journal-1413", - "j_len (%lu) is too big", - journal->j_len); - } - - if (buffer_journal_dirty(bh)) { - count_already_incd = 1; - PROC_INFO_INC(sb, journal.mark_dirty_notjournal); - clear_buffer_journal_dirty(bh); - } - - if (journal->j_len > journal->j_len_alloc) { - journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; - } - - set_buffer_journaled(bh); - - /* now put this guy on the end */ - if (!cn) { - cn = get_cnode(sb); - if (!cn) { - reiserfs_panic(sb, "journal-4", "get_cnode failed!"); - } - - if (th->t_blocks_logged == th->t_blocks_allocated) { - th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; - journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; - } - th->t_blocks_logged++; - journal->j_len++; - - cn->bh = bh; - cn->blocknr = bh->b_blocknr; - cn->sb = sb; - cn->jlist = NULL; - insert_journal_hash(journal->j_hash_table, cn); - if (!count_already_incd) { - get_bh(bh); - } - } - cn->next = NULL; - cn->prev = journal->j_last; - cn->bh = bh; - if (journal->j_last) { - journal->j_last->next = cn; - journal->j_last = cn; - } else { - journal->j_first = cn; - journal->j_last = cn; - } - reiserfs_schedule_old_flush(sb); - return 0; -} - -int journal_end(struct reiserfs_transaction_handle *th) -{ - struct super_block *sb = th->t_super; - if (!current->journal_info && th->t_refcount > 1) - reiserfs_warning(sb, "REISER-NESTING", - "th NULL, refcount %d", th->t_refcount); - - if (!th->t_trans_id) { - WARN_ON(1); - return -EIO; - } - - th->t_refcount--; - if (th->t_refcount > 0) { - struct reiserfs_transaction_handle *cur_th = - current->journal_info; - - /* - * we aren't allowed to close a nested transaction on a - * different filesystem from the one in the task struct - */ - BUG_ON(cur_th->t_super != th->t_super); - - if (th != cur_th) { - memcpy(current->journal_info, th, sizeof(*th)); - th->t_trans_id = 0; - } - return 0; - } else { - return do_journal_end(th, 0); - } -} - -/* - * removes from the current transaction, relsing and descrementing any counters. - * also files the removed buffer directly onto the clean list - * - * called by journal_mark_freed when a block has been deleted - * - * returns 1 if it cleaned and relsed the buffer. 0 otherwise - */ -static int remove_from_transaction(struct super_block *sb, - b_blocknr_t blocknr, int already_cleaned) -{ - struct buffer_head *bh; - struct reiserfs_journal_cnode *cn; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - int ret = 0; - - cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); - if (!cn || !cn->bh) { - return ret; - } - bh = cn->bh; - if (cn->prev) { - cn->prev->next = cn->next; - } - if (cn->next) { - cn->next->prev = cn->prev; - } - if (cn == journal->j_first) { - journal->j_first = cn->next; - } - if (cn == journal->j_last) { - journal->j_last = cn->prev; - } - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); - clear_buffer_journaled(bh); /* don't log this one */ - - if (!already_cleaned) { - clear_buffer_journal_dirty(bh); - clear_buffer_dirty(bh); - clear_buffer_journal_test(bh); - put_bh(bh); - if (atomic_read(&bh->b_count) < 0) { - reiserfs_warning(sb, "journal-1752", - "b_count < 0"); - } - ret = 1; - } - journal->j_len--; - journal->j_len_alloc--; - free_cnode(sb, cn); - return ret; -} - -/* - * for any cnode in a journal list, it can only be dirtied of all the - * transactions that include it are committed to disk. 
- * this checks through each transaction, and returns 1 if you are allowed - * to dirty, and 0 if you aren't - * - * it is called by dirty_journal_list, which is called after - * flush_commit_list has gotten all the log blocks for a given - * transaction on disk - * - */ -static int can_dirty(struct reiserfs_journal_cnode *cn) -{ - struct super_block *sb = cn->sb; - b_blocknr_t blocknr = cn->blocknr; - struct reiserfs_journal_cnode *cur = cn->hprev; - int can_dirty = 1; - - /* - * first test hprev. These are all newer than cn, so any node here - * with the same block number and dev means this node can't be sent - * to disk right now. - */ - while (cur && can_dirty) { - if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && - cur->blocknr == blocknr) { - can_dirty = 0; - } - cur = cur->hprev; - } - /* - * then test hnext. These are all older than cn. As long as they - * are committed to the log, it is safe to write cn to disk - */ - cur = cn->hnext; - while (cur && can_dirty) { - if (cur->jlist && cur->jlist->j_len > 0 && - atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh && - cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { - can_dirty = 0; - } - cur = cur->hnext; - } - return can_dirty; -} - -/* - * syncs the commit blocks, but does not force the real buffers to disk - * will wait until the current transaction is done/committed before returning - */ -int journal_end_sync(struct reiserfs_transaction_handle *th) -{ - struct super_block *sb = th->t_super; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - BUG_ON(!th->t_trans_id); - /* you can sync while nested, very, very bad */ - BUG_ON(th->t_refcount > 1); - if (journal->j_len == 0) { - reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); - } - return do_journal_end(th, COMMIT_NOW | WAIT); -} - -/* writeback the pending async commits to disk */ -static void flush_async_commits(struct work_struct *work) -{ - struct reiserfs_journal *journal = - container_of(work, struct reiserfs_journal, j_work.work); - struct super_block *sb = journal->j_work_sb; - struct reiserfs_journal_list *jl; - struct list_head *entry; - - reiserfs_write_lock(sb); - if (!list_empty(&journal->j_journal_list)) { - /* last entry is the youngest, commit it and you get everything */ - entry = journal->j_journal_list.prev; - jl = JOURNAL_LIST_ENTRY(entry); - flush_commit_list(sb, jl, 1); - } - reiserfs_write_unlock(sb); -} - -/* - * flushes any old transactions to disk - * ends the current transaction if it is too old - */ -void reiserfs_flush_old_commits(struct super_block *sb) -{ - time64_t now; - struct reiserfs_transaction_handle th; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - now = ktime_get_seconds(); - /* - * safety check so we don't flush while we are replaying the log during - * mount - */ - if (list_empty(&journal->j_journal_list)) - return; - - /* - * check the current transaction. 
If there are no writers, and it is - * too old, finish it, and force the commit blocks to disk - */ - if (atomic_read(&journal->j_wcount) <= 0 && - journal->j_trans_start_time > 0 && - journal->j_len > 0 && - (now - journal->j_trans_start_time) > journal->j_max_trans_age) { - if (!journal_join(&th, sb)) { - reiserfs_prepare_for_journal(sb, - SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); - - /* - * we're only being called from kreiserfsd, it makes - * no sense to do an async commit so that kreiserfsd - * can do it later - */ - do_journal_end(&th, COMMIT_NOW | WAIT); - } - } -} - -/* - * returns 0 if do_journal_end should return right away, returns 1 if - * do_journal_end should finish the commit - * - * if the current transaction is too old, but still has writers, this will - * wait on j_join_wait until all the writers are done. By the time it - * wakes up, the transaction it was called has already ended, so it just - * flushes the commit list and returns 0. - * - * Won't batch when flush or commit_now is set. Also won't batch when - * others are waiting on j_join_wait. - * - * Note, we can't allow the journal_end to proceed while there are still - * writers in the log. - */ -static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) -{ - - time64_t now; - int flush = flags & FLUSH_ALL; - int commit_now = flags & COMMIT_NOW; - int wait_on_commit = flags & WAIT; - struct reiserfs_journal_list *jl; - struct super_block *sb = th->t_super; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - - BUG_ON(!th->t_trans_id); - - if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, "journal-1577", - "handle trans id %ld != current trans id %ld", - th->t_trans_id, journal->j_trans_id); - } - - journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); - /* <= 0 is allowed. unmounting might not call begin */ - if (atomic_read(&journal->j_wcount) > 0) - atomic_dec(&journal->j_wcount); - - /* - * BUG, deal with case where j_len is 0, but people previously - * freed blocks need to be released will be dealt with by next - * transaction that actually writes something, but should be taken - * care of in this trans - */ - BUG_ON(journal->j_len == 0); - - /* - * if wcount > 0, and we are called to with flush or commit_now, - * we wait on j_join_wait. We will wake up when the last writer has - * finished the transaction, and started it on its way to the disk. - * Then, we flush the commit or journal list, and just return 0 - * because the rest of journal end was already done for this - * transaction. 
- */ - if (atomic_read(&journal->j_wcount) > 0) { - if (flush || commit_now) { - unsigned trans_id; - - jl = journal->j_current_jl; - trans_id = jl->j_trans_id; - if (wait_on_commit) - jl->j_state |= LIST_COMMIT_PENDING; - atomic_set(&journal->j_jlock, 1); - if (flush) { - journal->j_next_full_flush = 1; - } - unlock_journal(sb); - - /* - * sleep while the current transaction is - * still j_jlocked - */ - while (journal->j_trans_id == trans_id) { - if (atomic_read(&journal->j_jlock)) { - queue_log_writer(sb); - } else { - lock_journal(sb); - if (journal->j_trans_id == trans_id) { - atomic_set(&journal->j_jlock, - 1); - } - unlock_journal(sb); - } - } - BUG_ON(journal->j_trans_id == trans_id); - - if (commit_now - && journal_list_still_alive(sb, trans_id) - && wait_on_commit) { - flush_commit_list(sb, jl, 1); - } - return 0; - } - unlock_journal(sb); - return 0; - } - - /* deal with old transactions where we are the last writers */ - now = ktime_get_seconds(); - if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { - commit_now = 1; - journal->j_next_async_flush = 1; - } - /* don't batch when someone is waiting on j_join_wait */ - /* don't batch when syncing the commit or flushing the whole trans */ - if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock)) - && !flush && !commit_now && (journal->j_len < journal->j_max_batch) - && journal->j_len_alloc < journal->j_max_batch - && journal->j_cnode_free > (journal->j_trans_max * 3)) { - journal->j_bcount++; - unlock_journal(sb); - return 0; - } - - if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) { - reiserfs_panic(sb, "journal-003", - "j_start (%ld) is too high", - journal->j_start); - } - return 1; -} - -/* - * Does all the work that makes deleting blocks safe. - * when deleting a block mark BH_JNew, just remove it from the current - * transaction, clean it's buffer_head and move on. - * - * otherwise: - * set a bit for the block in the journal bitmap. That will prevent it from - * being allocated for unformatted nodes before this transaction has finished. - * - * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. - * That will prevent any old transactions with this block from trying to flush - * to the real location. Since we aren't removing the cnode from the - * journal_list_hash, *the block can't be reallocated yet. - * - * Then remove it from the current transaction, decrementing any counters and - * filing it on the clean list. - */ -int journal_mark_freed(struct reiserfs_transaction_handle *th, - struct super_block *sb, b_blocknr_t blocknr) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_cnode *cn = NULL; - struct buffer_head *bh = NULL; - struct reiserfs_list_bitmap *jb = NULL; - int cleaned = 0; - BUG_ON(!th->t_trans_id); - - cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); - if (cn && cn->bh) { - bh = cn->bh; - get_bh(bh); - } - /* if it is journal new, we just remove it from this transaction */ - if (bh && buffer_journal_new(bh)) { - clear_buffer_journal_new(bh); - clear_prepared_bits(bh); - reiserfs_clean_and_file_buffer(bh); - cleaned = remove_from_transaction(sb, blocknr, cleaned); - } else { - /* - * set the bit for this block in the journal bitmap - * for this transaction - */ - jb = journal->j_current_jl->j_list_bitmap; - if (!jb) { - reiserfs_panic(sb, "journal-1702", - "journal_list_bitmap is NULL"); - } - set_bit_in_list_bitmap(sb, blocknr, jb); - - /* Note, the entire while loop is not allowed to schedule. 
*/ - - if (bh) { - clear_prepared_bits(bh); - reiserfs_clean_and_file_buffer(bh); - } - cleaned = remove_from_transaction(sb, blocknr, cleaned); - - /* - * find all older transactions with this block, - * make sure they don't try to write it out - */ - cn = get_journal_hash_dev(sb, journal->j_list_hash_table, - blocknr); - while (cn) { - if (sb == cn->sb && blocknr == cn->blocknr) { - set_bit(BLOCK_FREED, &cn->state); - if (cn->bh) { - /* - * remove_from_transaction will brelse - * the buffer if it was in the current - * trans - */ - if (!cleaned) { - clear_buffer_journal_dirty(cn-> - bh); - clear_buffer_dirty(cn->bh); - clear_buffer_journal_test(cn-> - bh); - cleaned = 1; - put_bh(cn->bh); - if (atomic_read - (&cn->bh->b_count) < 0) { - reiserfs_warning(sb, - "journal-2138", - "cn->bh->b_count < 0"); - } - } - /* - * since we are clearing the bh, - * we MUST dec nonzerolen - */ - if (cn->jlist) { - atomic_dec(&cn->jlist-> - j_nonzerolen); - } - cn->bh = NULL; - } - } - cn = cn->hnext; - } - } - - if (bh) - release_buffer_page(bh); /* get_hash grabs the buffer */ - return 0; -} - -void reiserfs_update_inode_transaction(struct inode *inode) -{ - struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); - REISERFS_I(inode)->i_jl = journal->j_current_jl; - REISERFS_I(inode)->i_trans_id = journal->j_trans_id; -} - -/* - * returns -1 on error, 0 if no commits/barriers were done and 1 - * if a transaction was actually committed and the barrier was done - */ -static int __commit_trans_jl(struct inode *inode, unsigned long id, - struct reiserfs_journal_list *jl) -{ - struct reiserfs_transaction_handle th; - struct super_block *sb = inode->i_sb; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - int ret = 0; - - /* - * is it from the current transaction, - * or from an unknown transaction? - */ - if (id == journal->j_trans_id) { - jl = journal->j_current_jl; - /* - * try to let other writers come in and - * grow this transaction - */ - let_transaction_grow(sb, id); - if (journal->j_trans_id != id) { - goto flush_commit_only; - } - - ret = journal_begin(&th, sb, 1); - if (ret) - return ret; - - /* someone might have ended this transaction while we joined */ - if (journal->j_trans_id != id) { - reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); - ret = journal_end(&th); - goto flush_commit_only; - } - - ret = journal_end_sync(&th); - if (!ret) - ret = 1; - - } else { - /* - * this gets tricky, we have to make sure the journal list in - * the inode still exists. We know the list is still around - * if we've got a larger transaction id than the oldest list - */ -flush_commit_only: - if (journal_list_still_alive(inode->i_sb, id)) { - /* - * we only set ret to 1 when we know for sure - * the barrier hasn't been started yet on the commit - * block. - */ - if (atomic_read(&jl->j_commit_left) > 1) - ret = 1; - flush_commit_list(sb, jl, 1); - if (journal->j_errno) - ret = journal->j_errno; - } - } - /* otherwise the list is gone, and long since committed */ - return ret; -} - -int reiserfs_commit_for_inode(struct inode *inode) -{ - unsigned int id = REISERFS_I(inode)->i_trans_id; - struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; - - /* - * for the whole inode, assume unset id means it was - * changed in the current transaction. 
More conservative - */ - if (!id || !jl) { - reiserfs_update_inode_transaction(inode); - id = REISERFS_I(inode)->i_trans_id; - /* jl will be updated in __commit_trans_jl */ - } - - return __commit_trans_jl(inode, id, jl); -} - -void reiserfs_restore_prepared_buffer(struct super_block *sb, - struct buffer_head *bh) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - PROC_INFO_INC(sb, journal.restore_prepared); - if (!bh) { - return; - } - if (test_clear_buffer_journal_restore_dirty(bh) && - buffer_journal_dirty(bh)) { - struct reiserfs_journal_cnode *cn; - reiserfs_write_lock(sb); - cn = get_journal_hash_dev(sb, - journal->j_list_hash_table, - bh->b_blocknr); - if (cn && can_dirty(cn)) { - set_buffer_journal_test(bh); - mark_buffer_dirty(bh); - } - reiserfs_write_unlock(sb); - } - clear_buffer_journal_prepared(bh); -} - -extern struct tree_balance *cur_tb; -/* - * before we can change a metadata block, we have to make sure it won't - * be written to disk while we are altering it. So, we must: - * clean it - * wait on it. - */ -int reiserfs_prepare_for_journal(struct super_block *sb, - struct buffer_head *bh, int wait) -{ - PROC_INFO_INC(sb, journal.prepare); - - if (!trylock_buffer(bh)) { - if (!wait) - return 0; - lock_buffer(bh); - } - set_buffer_journal_prepared(bh); - if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { - clear_buffer_journal_test(bh); - set_buffer_journal_restore_dirty(bh); - } - unlock_buffer(bh); - return 1; -} - -/* - * long and ugly. If flush, will not return until all commit - * blocks and all real buffers in the trans are on disk. - * If no_async, won't return until all commit blocks are on disk. - * - * keep reading, there are comments as you go along - * - * If the journal is aborted, we just clean up. Things like flushing - * journal lists, etc just won't happen. - */ -static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) -{ - struct super_block *sb = th->t_super; - struct reiserfs_journal *journal = SB_JOURNAL(sb); - struct reiserfs_journal_cnode *cn, *next, *jl_cn; - struct reiserfs_journal_cnode *last_cn = NULL; - struct reiserfs_journal_desc *desc; - struct reiserfs_journal_commit *commit; - struct buffer_head *c_bh; /* commit bh */ - struct buffer_head *d_bh; /* desc bh */ - int cur_write_start = 0; /* start index of current log write */ - int i; - int flush; - int wait_on_commit; - struct reiserfs_journal_list *jl, *temp_jl; - struct list_head *entry, *safe; - unsigned long jindex; - unsigned int commit_trans_id; - int trans_half; - int depth; - - BUG_ON(th->t_refcount > 1); - BUG_ON(!th->t_trans_id); - BUG_ON(!th->t_super); - - /* - * protect flush_older_commits from doing mistakes if the - * transaction ID counter gets overflowed. 
- */ - if (th->t_trans_id == ~0U) - flags |= FLUSH_ALL | COMMIT_NOW | WAIT; - flush = flags & FLUSH_ALL; - wait_on_commit = flags & WAIT; - - current->journal_info = th->t_handle_save; - reiserfs_check_lock_depth(sb, "journal end"); - if (journal->j_len == 0) { - reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), - 1); - journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); - } - - lock_journal(sb); - if (journal->j_next_full_flush) { - flags |= FLUSH_ALL; - flush = 1; - } - if (journal->j_next_async_flush) { - flags |= COMMIT_NOW | WAIT; - wait_on_commit = 1; - } - - /* - * check_journal_end locks the journal, and unlocks if it does - * not return 1 it tells us if we should continue with the - * journal_end, or just return - */ - if (!check_journal_end(th, flags)) { - reiserfs_schedule_old_flush(sb); - wake_queued_writers(sb); - reiserfs_async_progress_wait(sb); - goto out; - } - - /* check_journal_end might set these, check again */ - if (journal->j_next_full_flush) { - flush = 1; - } - - /* - * j must wait means we have to flush the log blocks, and the - * real blocks for this transaction - */ - if (journal->j_must_wait > 0) { - flush = 1; - } -#ifdef REISERFS_PREALLOCATE - /* - * quota ops might need to nest, setup the journal_info pointer - * for them and raise the refcount so that it is > 0. - */ - current->journal_info = th; - th->t_refcount++; - - /* it should not involve new blocks into the transaction */ - reiserfs_discard_all_prealloc(th); - - th->t_refcount--; - current->journal_info = th->t_handle_save; -#endif - - /* setup description block */ - d_bh = - journal_getblk(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - journal->j_start); - set_buffer_uptodate(d_bh); - desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; - memset(d_bh->b_data, 0, d_bh->b_size); - memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); - set_desc_trans_id(desc, journal->j_trans_id); - - /* - * setup commit block. Don't write (keep it clean too) this one - * until after everyone else is written - */ - c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - ((journal->j_start + journal->j_len + - 1) % SB_ONDISK_JOURNAL_SIZE(sb))); - commit = (struct reiserfs_journal_commit *)c_bh->b_data; - memset(c_bh->b_data, 0, c_bh->b_size); - set_commit_trans_id(commit, journal->j_trans_id); - set_buffer_uptodate(c_bh); - - /* init this journal list */ - jl = journal->j_current_jl; - - /* - * we lock the commit before doing anything because - * we want to make sure nobody tries to run flush_commit_list until - * the new transaction is fully setup, and we've already flushed the - * ordered bh list - */ - reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); - - /* save the transaction id in case we need to commit it later */ - commit_trans_id = jl->j_trans_id; - - atomic_set(&jl->j_older_commits_done, 0); - jl->j_trans_id = journal->j_trans_id; - jl->j_timestamp = journal->j_trans_start_time; - jl->j_commit_bh = c_bh; - jl->j_start = journal->j_start; - jl->j_len = journal->j_len; - atomic_set(&jl->j_nonzerolen, journal->j_len); - atomic_set(&jl->j_commit_left, journal->j_len + 2); - jl->j_realblock = NULL; - - /* - * The ENTIRE FOR LOOP MUST not cause schedule to occur. 
- * for each real block, add it to the journal list hash, - * copy into real block index array in the commit or desc block - */ - trans_half = journal_trans_half(sb->s_blocksize); - for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { - if (buffer_journaled(cn->bh)) { - jl_cn = get_cnode(sb); - if (!jl_cn) { - reiserfs_panic(sb, "journal-1676", - "get_cnode returned NULL"); - } - if (i == 0) { - jl->j_realblock = jl_cn; - } - jl_cn->prev = last_cn; - jl_cn->next = NULL; - if (last_cn) { - last_cn->next = jl_cn; - } - last_cn = jl_cn; - /* - * make sure the block we are trying to log - * is not a block of journal or reserved area - */ - if (is_block_in_log_or_reserved_area - (sb, cn->bh->b_blocknr)) { - reiserfs_panic(sb, "journal-2332", - "Trying to log block %lu, " - "which is a log block", - cn->bh->b_blocknr); - } - jl_cn->blocknr = cn->bh->b_blocknr; - jl_cn->state = 0; - jl_cn->sb = sb; - jl_cn->bh = cn->bh; - jl_cn->jlist = jl; - insert_journal_hash(journal->j_list_hash_table, jl_cn); - if (i < trans_half) { - desc->j_realblock[i] = - cpu_to_le32(cn->bh->b_blocknr); - } else { - commit->j_realblock[i - trans_half] = - cpu_to_le32(cn->bh->b_blocknr); - } - } else { - i--; - } - } - set_desc_trans_len(desc, journal->j_len); - set_desc_mount_id(desc, journal->j_mount_id); - set_desc_trans_id(desc, journal->j_trans_id); - set_commit_trans_len(commit, journal->j_len); - - /* - * special check in case all buffers in the journal - * were marked for not logging - */ - BUG_ON(journal->j_len == 0); - - /* - * we're about to dirty all the log blocks, mark the description block - * dirty now too. Don't mark the commit block dirty until all the - * others are on disk - */ - mark_buffer_dirty(d_bh); - - /* - * first data block is j_start + 1, so add one to - * cur_write_start wherever you use it - */ - cur_write_start = journal->j_start; - cn = journal->j_first; - jindex = 1; /* start at one so we don't get the desc again */ - while (cn) { - clear_buffer_journal_new(cn->bh); - /* copy all the real blocks into log area. dirty log blocks */ - if (buffer_journaled(cn->bh)) { - struct buffer_head *tmp_bh; - char *addr; - struct page *page; - tmp_bh = - journal_getblk(sb, - SB_ONDISK_JOURNAL_1st_BLOCK(sb) + - ((cur_write_start + - jindex) % - SB_ONDISK_JOURNAL_SIZE(sb))); - set_buffer_uptodate(tmp_bh); - page = cn->bh->b_page; - addr = kmap(page); - memcpy(tmp_bh->b_data, - addr + offset_in_page(cn->bh->b_data), - cn->bh->b_size); - kunmap(page); - mark_buffer_dirty(tmp_bh); - jindex++; - set_buffer_journal_dirty(cn->bh); - clear_buffer_journaled(cn->bh); - } else { - /* - * JDirty cleared sometime during transaction. - * don't log this one - */ - reiserfs_warning(sb, "journal-2048", - "BAD, buffer in journal hash, " - "but not JDirty!"); - brelse(cn->bh); - } - next = cn->next; - free_cnode(sb, cn); - cn = next; - reiserfs_cond_resched(sb); - } - - /* - * we are done with both the c_bh and d_bh, but - * c_bh must be written after all other commit blocks, - * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
- */ - - journal->j_current_jl = alloc_journal_list(sb); - - /* now it is safe to insert this transaction on the main list */ - list_add_tail(&jl->j_list, &journal->j_journal_list); - list_add_tail(&jl->j_working_list, &journal->j_working_list); - journal->j_num_work_lists++; - - /* reset journal values for the next transaction */ - journal->j_start = - (journal->j_start + journal->j_len + - 2) % SB_ONDISK_JOURNAL_SIZE(sb); - atomic_set(&journal->j_wcount, 0); - journal->j_bcount = 0; - journal->j_last = NULL; - journal->j_first = NULL; - journal->j_len = 0; - journal->j_trans_start_time = 0; - /* check for trans_id overflow */ - if (++journal->j_trans_id == 0) - journal->j_trans_id = 10; - journal->j_current_jl->j_trans_id = journal->j_trans_id; - journal->j_must_wait = 0; - journal->j_len_alloc = 0; - journal->j_next_full_flush = 0; - journal->j_next_async_flush = 0; - init_journal_hash(sb); - - /* - * make sure reiserfs_add_jh sees the new current_jl before we - * write out the tails - */ - smp_mb(); - - /* - * tail conversion targets have to hit the disk before we end the - * transaction. Otherwise a later transaction might repack the tail - * before this transaction commits, leaving the data block unflushed - * and clean, if we crash before the later transaction commits, the - * data block is lost. - */ - if (!list_empty(&jl->j_tail_bh_list)) { - depth = reiserfs_write_unlock_nested(sb); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_tail_bh_list); - reiserfs_write_lock_nested(sb, depth); - } - BUG_ON(!list_empty(&jl->j_tail_bh_list)); - mutex_unlock(&jl->j_commit_mutex); - - /* - * honor the flush wishes from the caller, simple commits can - * be done outside the journal lock, they are done below - * - * if we don't flush the commit list right now, we put it into - * the work queue so the people waiting on the async progress work - * queue don't wait for this proc to flush journal lists and such. - */ - if (flush) { - flush_commit_list(sb, jl, 1); - flush_journal_list(sb, jl, 1); - } else if (!(jl->j_state & LIST_COMMIT_PENDING)) { - /* - * Avoid queueing work when sb is being shut down. Transaction - * will be flushed on journal shutdown. - */ - if (sb->s_flags & SB_ACTIVE) - queue_delayed_work(REISERFS_SB(sb)->commit_wq, - &journal->j_work, HZ / 10); - } - - /* - * if the next transaction has any chance of wrapping, flush - * transactions that might get overwritten. If any journal lists - * are very old flush them as well. - */ -first_jl: - list_for_each_safe(entry, safe, &journal->j_journal_list) { - temp_jl = JOURNAL_LIST_ENTRY(entry); - if (journal->j_start <= temp_jl->j_start) { - if ((journal->j_start + journal->j_trans_max + 1) >= - temp_jl->j_start) { - flush_used_journal_lists(sb, temp_jl); - goto first_jl; - } else if ((journal->j_start + - journal->j_trans_max + 1) < - SB_ONDISK_JOURNAL_SIZE(sb)) { - /* - * if we don't cross into the next - * transaction and we don't wrap, there is - * no way we can overlap any later transactions - * break now - */ - break; - } - } else if ((journal->j_start + - journal->j_trans_max + 1) > - SB_ONDISK_JOURNAL_SIZE(sb)) { - if (((journal->j_start + journal->j_trans_max + 1) % - SB_ONDISK_JOURNAL_SIZE(sb)) >= - temp_jl->j_start) { - flush_used_journal_lists(sb, temp_jl); - goto first_jl; - } else { - /* - * we don't overlap anything from out start - * to the end of the log, and our wrapped - * portion doesn't overlap anything at - * the start of the log. 
We can break - */ - break; - } - } - } - - journal->j_current_jl->j_list_bitmap = - get_list_bitmap(sb, journal->j_current_jl); - - if (!(journal->j_current_jl->j_list_bitmap)) { - reiserfs_panic(sb, "journal-1996", - "could not get a list bitmap"); - } - - atomic_set(&journal->j_jlock, 0); - unlock_journal(sb); - /* wake up any body waiting to join. */ - clear_bit(J_WRITERS_QUEUED, &journal->j_state); - wake_up(&journal->j_join_wait); - - if (!flush && wait_on_commit && - journal_list_still_alive(sb, commit_trans_id)) { - flush_commit_list(sb, jl, 1); - } -out: - reiserfs_check_lock_depth(sb, "journal end2"); - - memset(th, 0, sizeof(*th)); - /* - * Re-set th->t_super, so we can properly keep track of how many - * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later - */ - th->t_super = sb; - - return journal->j_errno; -} - -/* Send the file system read only and refuse new transactions */ -void reiserfs_abort_journal(struct super_block *sb, int errno) -{ - struct reiserfs_journal *journal = SB_JOURNAL(sb); - if (test_bit(J_ABORTED, &journal->j_state)) - return; - - if (!journal->j_errno) - journal->j_errno = errno; - - sb->s_flags |= SB_RDONLY; - set_bit(J_ABORTED, &journal->j_state); - -#ifdef CONFIG_REISERFS_CHECK - dump_stack(); -#endif -} diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c deleted file mode 100644 index 7f868569d4d0..000000000000 --- a/fs/reiserfs/lbalance.c +++ /dev/null @@ -1,1426 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/uaccess.h> -#include <linux/string.h> -#include <linux/time.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> - -/* - * copy copy_count entries from source directory item to dest buffer - * (creating new item if needed) - */ -static void leaf_copy_dir_entries(struct buffer_info *dest_bi, - struct buffer_head *source, int last_first, - int item_num, int from, int copy_count) -{ - struct buffer_head *dest = dest_bi->bi_bh; - /* - * either the number of target item, or if we must create a - * new item, the number of the item we will create it next to - */ - int item_num_in_dest; - - struct item_head *ih; - struct reiserfs_de_head *deh; - int copy_records_len; /* length of all records in item to be copied */ - char *records; - - ih = item_head(source, item_num); - - RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); - - /* - * length of all record to be copied and first byte of - * the last of them - */ - deh = B_I_DEH(source, ih); - if (copy_count) { - copy_records_len = (from ? deh_location(&deh[from - 1]) : - ih_item_len(ih)) - - deh_location(&deh[from + copy_count - 1]); - records = - source->b_data + ih_location(ih) + - deh_location(&deh[from + copy_count - 1]); - } else { - copy_records_len = 0; - records = NULL; - } - - /* when copy last to first, dest buffer can contain 0 items */ - item_num_in_dest = - (last_first == - LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 
0 : -1) : (B_NR_ITEMS(dest) - - 1); - - /* - * if there are no items in dest or the first/last item in - * dest is not item of the same directory - */ - if ((item_num_in_dest == -1) || - (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || - (last_first == LAST_TO_FIRST - && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, - leaf_key(dest, - item_num_in_dest)))) - { - /* create new item in dest */ - struct item_head new_ih; - - /* form item header */ - memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); - put_ih_version(&new_ih, KEY_FORMAT_3_5); - /* calculate item len */ - put_ih_item_len(&new_ih, - DEH_SIZE * copy_count + copy_records_len); - put_ih_entry_count(&new_ih, 0); - - if (last_first == LAST_TO_FIRST) { - /* form key by the following way */ - if (from < ih_entry_count(ih)) { - set_le_ih_k_offset(&new_ih, - deh_offset(&deh[from])); - } else { - /* - * no entries will be copied to this - * item in this function - */ - set_le_ih_k_offset(&new_ih, U32_MAX); - /* - * this item is not yet valid, but we - * want I_IS_DIRECTORY_ITEM to return 1 - * for it, so we -1 - */ - } - set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key, - TYPE_DIRENTRY); - } - - /* insert item into dest buffer */ - leaf_insert_into_buf(dest_bi, - (last_first == - LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), - &new_ih, NULL, 0); - } else { - /* prepare space for entries */ - leaf_paste_in_buffer(dest_bi, - (last_first == - FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - - 1) : 0, MAX_US_INT, - DEH_SIZE * copy_count + copy_records_len, - records, 0); - } - - item_num_in_dest = - (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; - - leaf_paste_entries(dest_bi, item_num_in_dest, - (last_first == - FIRST_TO_LAST) ? ih_entry_count(item_head(dest, - item_num_in_dest)) - : 0, copy_count, deh + from, records, - DEH_SIZE * copy_count + copy_records_len); -} - -/* - * Copy the first (if last_first == FIRST_TO_LAST) or last - * (last_first == LAST_TO_FIRST) item or part of it or nothing - * (see the return 0 below) from SOURCE to the end (if last_first) - * or beginning (!last_first) of the DEST - */ -/* returns 1 if anything was copied, else 0 */ -static int leaf_copy_boundary_item(struct buffer_info *dest_bi, - struct buffer_head *src, int last_first, - int bytes_or_entries) -{ - struct buffer_head *dest = dest_bi->bi_bh; - /* number of items in the source and destination buffers */ - int dest_nr_item, src_nr_item; - struct item_head *ih; - struct item_head *dih; - - dest_nr_item = B_NR_ITEMS(dest); - - /* - * if ( DEST is empty or first item of SOURCE and last item of - * DEST are the items of different objects or of different types ) - * then there is no need to treat this item differently from the - * other items that we copy, so we return - */ - if (last_first == FIRST_TO_LAST) { - ih = item_head(src, 0); - dih = item_head(dest, dest_nr_item - 1); - - /* there is nothing to merge */ - if (!dest_nr_item - || (!op_is_left_mergeable(&ih->ih_key, src->b_size))) - return 0; - - RFALSE(!ih_item_len(ih), - "vs-10010: item can not have empty length"); - - if (is_direntry_le_ih(ih)) { - if (bytes_or_entries == -1) - /* copy all entries to dest */ - bytes_or_entries = ih_entry_count(ih); - leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, - bytes_or_entries); - return 1; - } - - /* - * copy part of the body of the first item of SOURCE - * to the end of the body of the last item of the DEST - * part defined by 'bytes_or_entries'; if bytes_or_entries - * == -1 copy whole body; don't create new item header - */ - if 
(bytes_or_entries == -1) - bytes_or_entries = ih_item_len(ih); - -#ifdef CONFIG_REISERFS_CHECK - else { - if (bytes_or_entries == ih_item_len(ih) - && is_indirect_le_ih(ih)) - if (get_ih_free_space(ih)) - reiserfs_panic(sb_from_bi(dest_bi), - "vs-10020", - "last unformatted node " - "must be filled " - "entirely (%h)", ih); - } -#endif - - /* - * merge first item (or its part) of src buffer with the last - * item of dest buffer. Both are of the same file - */ - leaf_paste_in_buffer(dest_bi, - dest_nr_item - 1, ih_item_len(dih), - bytes_or_entries, ih_item_body(src, ih), 0); - - if (is_indirect_le_ih(dih)) { - RFALSE(get_ih_free_space(dih), - "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", - ih); - if (bytes_or_entries == ih_item_len(ih)) - set_ih_free_space(dih, get_ih_free_space(ih)); - } - - return 1; - } - - /* copy boundary item to right (last_first == LAST_TO_FIRST) */ - - /* - * (DEST is empty or last item of SOURCE and first item of DEST - * are the items of different object or of different types) - */ - src_nr_item = B_NR_ITEMS(src); - ih = item_head(src, src_nr_item - 1); - dih = item_head(dest, 0); - - if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size)) - return 0; - - if (is_direntry_le_ih(ih)) { - /* - * bytes_or_entries = entries number in last - * item body of SOURCE - */ - if (bytes_or_entries == -1) - bytes_or_entries = ih_entry_count(ih); - - leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, - src_nr_item - 1, - ih_entry_count(ih) - bytes_or_entries, - bytes_or_entries); - return 1; - } - - /* - * copy part of the body of the last item of SOURCE to the - * begin of the body of the first item of the DEST; part defined - * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; - * change first item key of the DEST; don't create new item header - */ - - RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), - "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", - ih); - - if (bytes_or_entries == -1) { - /* bytes_or_entries = length of last item body of SOURCE */ - bytes_or_entries = ih_item_len(ih); - - RFALSE(le_ih_k_offset(dih) != - le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), - "vs-10050: items %h and %h do not match", ih, dih); - - /* change first item key of the DEST */ - set_le_ih_k_offset(dih, le_ih_k_offset(ih)); - - /* item becomes non-mergeable */ - /* or mergeable if left item was */ - set_le_ih_k_type(dih, le_ih_k_type(ih)); - } else { - /* merge to right only part of item */ - RFALSE(ih_item_len(ih) <= bytes_or_entries, - "vs-10060: no so much bytes %lu (needed %lu)", - (unsigned long)ih_item_len(ih), - (unsigned long)bytes_or_entries); - - /* change first item key of the DEST */ - if (is_direct_le_ih(dih)) { - RFALSE(le_ih_k_offset(dih) <= - (unsigned long)bytes_or_entries, - "vs-10070: dih %h, bytes_or_entries(%d)", dih, - bytes_or_entries); - set_le_ih_k_offset(dih, - le_ih_k_offset(dih) - - bytes_or_entries); - } else { - RFALSE(le_ih_k_offset(dih) <= - (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, - "vs-10080: dih %h, bytes_or_entries(%d)", - dih, - (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); - set_le_ih_k_offset(dih, - le_ih_k_offset(dih) - - ((bytes_or_entries / UNFM_P_SIZE) * - dest->b_size)); - } - } - - leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, - ih_item_body(src, - ih) + ih_item_len(ih) - bytes_or_entries, - 0); - return 1; -} - -/* - * copy cpy_mun items from buffer src to buffer 
dest - * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning - * from first-th item in src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning - * from first-th item in src to head of dest - */ -static void leaf_copy_items_entirely(struct buffer_info *dest_bi, - struct buffer_head *src, int last_first, - int first, int cpy_num) -{ - struct buffer_head *dest; - int nr, free_space; - int dest_before; - int last_loc, last_inserted_loc, location; - int i, j; - struct block_head *blkh; - struct item_head *ih; - - RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, - "vs-10090: bad last_first parameter %d", last_first); - RFALSE(B_NR_ITEMS(src) - first < cpy_num, - "vs-10100: too few items in source %d, required %d from %d", - B_NR_ITEMS(src), cpy_num, first); - RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); - RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); - - dest = dest_bi->bi_bh; - - RFALSE(!dest, "vs-10130: can not copy negative amount of items"); - - if (cpy_num == 0) - return; - - blkh = B_BLK_HEAD(dest); - nr = blkh_nr_item(blkh); - free_space = blkh_free_space(blkh); - - /* - * we will insert items before 0-th or nr-th item in dest buffer. - * It depends of last_first parameter - */ - dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; - - /* location of head of first new item */ - ih = item_head(dest, dest_before); - - RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, - "vs-10140: not enough free space for headers %d (needed %d)", - B_FREE_SPACE(dest), cpy_num * IH_SIZE); - - /* prepare space for headers */ - memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); - - /* copy item headers */ - memcpy(ih, item_head(src, first), cpy_num * IH_SIZE); - - free_space -= (IH_SIZE * cpy_num); - set_blkh_free_space(blkh, free_space); - - /* location of unmovable item */ - j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1); - for (i = dest_before; i < nr + cpy_num; i++) { - location -= ih_item_len(ih + i - dest_before); - put_ih_location(ih + i - dest_before, location); - } - - /* prepare space for items */ - last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]); - last_inserted_loc = ih_location(&ih[cpy_num - 1]); - - /* check free space */ - RFALSE(free_space < j - last_inserted_loc, - "vs-10150: not enough free space for items %d (needed %d)", - free_space, j - last_inserted_loc); - - memmove(dest->b_data + last_loc, - dest->b_data + last_loc + j - last_inserted_loc, - last_inserted_loc - last_loc); - - /* copy items */ - memcpy(dest->b_data + last_inserted_loc, - item_body(src, (first + cpy_num - 1)), - j - last_inserted_loc); - - /* sizes, item number */ - set_blkh_nr_item(blkh, nr + cpy_num); - set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); - - do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); - RFALSE(dc_block_number(t_dc) != dest->b_blocknr, - "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", - (long unsigned)dest->b_blocknr, - (long unsigned)dc_block_number(t_dc)); - put_dc_size(t_dc, - dc_size(t_dc) + (j - last_inserted_loc + - IH_SIZE * cpy_num)); - - do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, - 0); - } -} - -/* - * This function splits the (liquid) item into two items (useful when - * shifting part of an item into another node.) 
- */ -static void leaf_item_bottle(struct buffer_info *dest_bi, - struct buffer_head *src, int last_first, - int item_num, int cpy_bytes) -{ - struct buffer_head *dest = dest_bi->bi_bh; - struct item_head *ih; - - RFALSE(cpy_bytes == -1, - "vs-10170: bytes == - 1 means: do not split item"); - - if (last_first == FIRST_TO_LAST) { - /* - * if ( if item in position item_num in buffer SOURCE - * is directory item ) - */ - ih = item_head(src, item_num); - if (is_direntry_le_ih(ih)) - leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, - item_num, 0, cpy_bytes); - else { - struct item_head n_ih; - - /* - * copy part of the body of the item number 'item_num' - * of SOURCE to the end of the DEST part defined by - * 'cpy_bytes'; create new item header; change old - * item_header (????); n_ih = new item_header; - */ - memcpy(&n_ih, ih, IH_SIZE); - put_ih_item_len(&n_ih, cpy_bytes); - if (is_indirect_le_ih(ih)) { - RFALSE(cpy_bytes == ih_item_len(ih) - && get_ih_free_space(ih), - "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", - (long unsigned)get_ih_free_space(ih)); - set_ih_free_space(&n_ih, 0); - } - - RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), - "vs-10190: bad mergeability of item %h", ih); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ - leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, - item_body(src, item_num), 0); - } - } else { - /* - * if ( if item in position item_num in buffer - * SOURCE is directory item ) - */ - ih = item_head(src, item_num); - if (is_direntry_le_ih(ih)) - leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, - item_num, - ih_entry_count(ih) - cpy_bytes, - cpy_bytes); - else { - struct item_head n_ih; - - /* - * copy part of the body of the item number 'item_num' - * of SOURCE to the begin of the DEST part defined by - * 'cpy_bytes'; create new item header; - * n_ih = new item_header; - */ - memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE); - - /* Endian safe, both le */ - n_ih.ih_version = ih->ih_version; - - if (is_direct_le_ih(ih)) { - set_le_ih_k_offset(&n_ih, - le_ih_k_offset(ih) + - ih_item_len(ih) - cpy_bytes); - set_le_ih_k_type(&n_ih, TYPE_DIRECT); - set_ih_free_space(&n_ih, MAX_US_INT); - } else { - /* indirect item */ - RFALSE(!cpy_bytes && get_ih_free_space(ih), - "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); - set_le_ih_k_offset(&n_ih, - le_ih_k_offset(ih) + - (ih_item_len(ih) - - cpy_bytes) / UNFM_P_SIZE * - dest->b_size); - set_le_ih_k_type(&n_ih, TYPE_INDIRECT); - set_ih_free_space(&n_ih, get_ih_free_space(ih)); - } - - /* set item length */ - put_ih_item_len(&n_ih, cpy_bytes); - - /* Endian safe, both le */ - n_ih.ih_version = ih->ih_version; - - leaf_insert_into_buf(dest_bi, 0, &n_ih, - item_body(src, item_num) + - ih_item_len(ih) - cpy_bytes, 0); - } - } -} - -/* - * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE - * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole - * items from SOURCE to DEST. From last item copy cpy_num bytes for regular - * item and cpy_num directory entries for directory item. 
- */ -static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, - int last_first, int cpy_num, int cpy_bytes) -{ - struct buffer_head *dest; - int pos, i, src_nr_item, bytes; - - dest = dest_bi->bi_bh; - RFALSE(!dest || !src, "vs-10210: !dest || !src"); - RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, - "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); - RFALSE(B_NR_ITEMS(src) < cpy_num, - "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), - cpy_num); - RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); - - if (cpy_num == 0) - return 0; - - if (last_first == FIRST_TO_LAST) { - /* copy items to left */ - pos = 0; - if (cpy_num == 1) - bytes = cpy_bytes; - else - bytes = -1; - - /* - * copy the first item or it part or nothing to the end of - * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) - */ - i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); - cpy_num -= i; - if (cpy_num == 0) - return i; - pos += i; - if (cpy_bytes == -1) - /* - * copy first cpy_num items starting from position - * 'pos' of SOURCE to end of DEST - */ - leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, - pos, cpy_num); - else { - /* - * copy first cpy_num-1 items starting from position - * 'pos-1' of the SOURCE to the end of the DEST - */ - leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, - pos, cpy_num - 1); - - /* - * copy part of the item which number is - * cpy_num+pos-1 to the end of the DEST - */ - leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, - cpy_num + pos - 1, cpy_bytes); - } - } else { - /* copy items to right */ - src_nr_item = B_NR_ITEMS(src); - if (cpy_num == 1) - bytes = cpy_bytes; - else - bytes = -1; - - /* - * copy the last item or it part or nothing to the - * begin of the DEST - * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); - */ - i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); - - cpy_num -= i; - if (cpy_num == 0) - return i; - - pos = src_nr_item - cpy_num - i; - if (cpy_bytes == -1) { - /* - * starting from position 'pos' copy last cpy_num - * items of SOURCE to begin of DEST - */ - leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, - pos, cpy_num); - } else { - /* - * copy last cpy_num-1 items starting from position - * 'pos+1' of the SOURCE to the begin of the DEST; - */ - leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, - pos + 1, cpy_num - 1); - - /* - * copy part of the item which number is pos to - * the begin of the DEST - */ - leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, - cpy_bytes); - } - } - return i; -} - -/* - * there are types of coping: from S[0] to L[0], from S[0] to R[0], - * from R[0] to L[0]. 
for each of these we have to define parent and - * positions of destination and source buffers - */ -static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, - struct buffer_info *dest_bi, - struct buffer_info *src_bi, - int *first_last, - struct buffer_head *Snew) -{ - memset(dest_bi, 0, sizeof(struct buffer_info)); - memset(src_bi, 0, sizeof(struct buffer_info)); - - /* define dest, src, dest parent, dest position */ - switch (shift_mode) { - case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - - /* src->b_item_order */ - src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[0]; - dest_bi->bi_parent = tb->FL[0]; - dest_bi->bi_position = get_left_neighbor_position(tb, 0); - *first_last = FIRST_TO_LAST; - break; - - case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[0]; - dest_bi->bi_parent = tb->FR[0]; - dest_bi->bi_position = get_right_neighbor_position(tb, 0); - *first_last = LAST_TO_FIRST; - break; - - case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ - src_bi->tb = tb; - src_bi->bi_bh = tb->R[0]; - src_bi->bi_parent = tb->FR[0]; - src_bi->bi_position = get_right_neighbor_position(tb, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[0]; - dest_bi->bi_parent = tb->FL[0]; - dest_bi->bi_position = get_left_neighbor_position(tb, 0); - *first_last = FIRST_TO_LAST; - break; - - case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ - src_bi->tb = tb; - src_bi->bi_bh = tb->L[0]; - src_bi->bi_parent = tb->FL[0]; - src_bi->bi_position = get_left_neighbor_position(tb, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[0]; - dest_bi->bi_parent = tb->FR[0]; - dest_bi->bi_position = get_right_neighbor_position(tb, 0); - *first_last = LAST_TO_FIRST; - break; - - case LEAF_FROM_S_TO_SNEW: - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = Snew; - dest_bi->bi_parent = NULL; - dest_bi->bi_position = 0; - *first_last = LAST_TO_FIRST; - break; - - default: - reiserfs_panic(sb_from_bi(src_bi), "vs-10250", - "shift type is unknown (%d)", shift_mode); - } - RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, - "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", - shift_mode, src_bi->bi_bh, dest_bi->bi_bh); -} - -/* - * copy mov_num items and mov_bytes of the (mov_num-1)th item to - * neighbor. Delete them from source - */ -int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, - int mov_bytes, struct buffer_head *Snew) -{ - int ret_value; - struct buffer_info dest_bi, src_bi; - int first_last; - - leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, - &first_last, Snew); - - ret_value = - leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, - mov_bytes); - - leaf_delete_items(&src_bi, first_last, - (first_last == - FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - - mov_num), mov_num, mov_bytes); - - return ret_value; -} - -/* - * Shift shift_num items (and shift_bytes of last shifted item if - * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key - */ -int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) -{ - struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); - int i; - - /* - * move shift_num (and shift_bytes bytes) items from S[0] - * to left neighbor L[0] - */ - i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); - - if (shift_num) { - /* number of items in S[0] == 0 */ - if (B_NR_ITEMS(S0) == 0) { - - RFALSE(shift_bytes != -1, - "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", - shift_bytes); -#ifdef CONFIG_REISERFS_CHECK - if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { - print_cur_tb("vs-10275"); - reiserfs_panic(tb->tb_sb, "vs-10275", - "balance condition corrupted " - "(%c)", tb->tb_mode); - } -#endif - - if (PATH_H_POSITION(tb->tb_path, 1) == 0) - replace_key(tb, tb->CFL[0], tb->lkey[0], - PATH_H_PPARENT(tb->tb_path, 0), 0); - - } else { - /* replace lkey in CFL[0] by 0-th key from S[0]; */ - replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); - - RFALSE((shift_bytes != -1 && - !(is_direntry_le_ih(item_head(S0, 0)) - && !ih_entry_count(item_head(S0, 0)))) && - (!op_is_left_mergeable - (leaf_key(S0, 0), S0->b_size)), - "vs-10280: item must be mergeable"); - } - } - - return i; -} - -/* CLEANING STOPPED HERE */ - -/* - * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, - * and replace the delimiting key - */ -int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) -{ - int ret_value; - - /* - * move shift_num (and shift_bytes) items from S[0] to - * right neighbor R[0] - */ - ret_value = - leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); - - /* replace rkey in CFR[0] by the 0-th key from R[0] */ - if (shift_num) { - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - - } - - return ret_value; -} - -static void leaf_delete_items_entirely(struct buffer_info *bi, - int first, int del_num); -/* - * If del_bytes == -1, starting from position 'first' delete del_num - * items in whole in buffer CUR. - * If not. - * If last_first == 0. Starting from position 'first' delete del_num-1 - * items in whole. Delete part of body of the first item. Part defined by - * del_bytes. Don't delete first item header - * If last_first == 1. Starting from position 'first+1' delete del_num-1 - * items in whole. Delete part of body of the last item . Part defined by - * del_bytes. Don't delete last item header. -*/ -void leaf_delete_items(struct buffer_info *cur_bi, int last_first, - int first, int del_num, int del_bytes) -{ - struct buffer_head *bh; - int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); - - RFALSE(!bh, "10155: bh is not defined"); - RFALSE(del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", - del_num); - RFALSE(first < 0 - || first + del_num > item_amount, - "10165: invalid number of first item to be deleted (%d) or " - "no so much items (%d) to delete (only %d)", first, - first + del_num, item_amount); - - if (del_num == 0) - return; - - if (first == 0 && del_num == item_amount && del_bytes == -1) { - make_empty_node(cur_bi); - do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); - return; - } - - if (del_bytes == -1) - /* delete del_num items beginning from item in position first */ - leaf_delete_items_entirely(cur_bi, first, del_num); - else { - if (last_first == FIRST_TO_LAST) { - /* - * delete del_num-1 items beginning from - * item in position first - */ - leaf_delete_items_entirely(cur_bi, first, del_num - 1); - - /* - * delete the part of the first item of the bh - * do not delete item header - */ - leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); - } else { - struct item_head *ih; - int len; - - /* - * delete del_num-1 items beginning from - * item in position first+1 - */ - leaf_delete_items_entirely(cur_bi, first + 1, - del_num - 1); - - ih = item_head(bh, B_NR_ITEMS(bh) - 1); - if (is_direntry_le_ih(ih)) - /* the last item is directory */ - /* - * len = numbers of directory entries - * in this item - */ - len = ih_entry_count(ih); - else - /* len = body len of item */ - len = ih_item_len(ih); - - /* - * delete the part of the last item of the bh - * do not delete item header - */ - leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, - len - del_bytes, del_bytes); - } - } -} - -/* insert item into the leaf node in position before */ -void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head * const inserted_item_ih, - const char * const inserted_item_body, - int zeros_number) -{ - struct buffer_head *bh = bi->bi_bh; - int nr, free_space; - struct block_head *blkh; - struct item_head *ih; - int i; - int last_loc, unmoved_loc; - char *to; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - free_space = blkh_free_space(blkh); - - /* check free space */ - RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, - "vs-10170: not enough free space in block %z, new item %h", - bh, inserted_item_ih); - RFALSE(zeros_number > ih_item_len(inserted_item_ih), - "vs-10172: zero number == %d, item length == %d", - zeros_number, ih_item_len(inserted_item_ih)); - - /* get item new item must be inserted before */ - ih = item_head(bh, before); - - /* prepare space for the body of new item */ - last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; - unmoved_loc = before ? 
ih_location(ih - 1) : bh->b_size; - - memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), - bh->b_data + last_loc, unmoved_loc - last_loc); - - to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); - memset(to, 0, zeros_number); - to += zeros_number; - - /* copy body to prepared space */ - if (inserted_item_body) - memmove(to, inserted_item_body, - ih_item_len(inserted_item_ih) - zeros_number); - else - memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); - - /* insert item header */ - memmove(ih + 1, ih, IH_SIZE * (nr - before)); - memmove(ih, inserted_item_ih, IH_SIZE); - - /* change locations */ - for (i = before; i < nr + 1; i++) { - unmoved_loc -= ih_item_len(&ih[i - before]); - put_ih_location(&ih[i - before], unmoved_loc); - } - - /* sizes, free space, item number */ - set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); - set_blkh_free_space(blkh, - free_space - (IH_SIZE + - ih_item_len(inserted_item_ih))); - do_balance_mark_leaf_dirty(bi->tb, bh, 1); - - if (bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); - put_dc_size(t_dc, - dc_size(t_dc) + (IH_SIZE + - ih_item_len(inserted_item_ih))); - do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); - } -} - -/* - * paste paste_size bytes to affected_item_num-th item. - * When item is a directory, this only prepare space for new entries - */ -void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, - int pos_in_item, int paste_size, - const char *body, int zeros_number) -{ - struct buffer_head *bh = bi->bi_bh; - int nr, free_space; - struct block_head *blkh; - struct item_head *ih; - int i; - int last_loc, unmoved_loc; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - free_space = blkh_free_space(blkh); - - /* check free space */ - RFALSE(free_space < paste_size, - "vs-10175: not enough free space: needed %d, available %d", - paste_size, free_space); - -#ifdef CONFIG_REISERFS_CHECK - if (zeros_number > paste_size) { - struct super_block *sb = NULL; - if (bi && bi->tb) - sb = bi->tb->tb_sb; - print_cur_tb("10177"); - reiserfs_panic(sb, "vs-10177", - "zeros_number == %d, paste_size == %d", - zeros_number, paste_size); - } -#endif /* CONFIG_REISERFS_CHECK */ - - /* item to be appended */ - ih = item_head(bh, affected_item_num); - - last_loc = ih_location(&ih[nr - affected_item_num - 1]); - unmoved_loc = affected_item_num ? 
ih_location(ih - 1) : bh->b_size; - - /* prepare space */ - memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, - unmoved_loc - last_loc); - - /* change locations */ - for (i = affected_item_num; i < nr; i++) - put_ih_location(&ih[i - affected_item_num], - ih_location(&ih[i - affected_item_num]) - - paste_size); - - if (body) { - if (!is_direntry_le_ih(ih)) { - if (!pos_in_item) { - /* shift data to right */ - memmove(bh->b_data + ih_location(ih) + - paste_size, - bh->b_data + ih_location(ih), - ih_item_len(ih)); - /* paste data in the head of item */ - memset(bh->b_data + ih_location(ih), 0, - zeros_number); - memcpy(bh->b_data + ih_location(ih) + - zeros_number, body, - paste_size - zeros_number); - } else { - memset(bh->b_data + unmoved_loc - paste_size, 0, - zeros_number); - memcpy(bh->b_data + unmoved_loc - paste_size + - zeros_number, body, - paste_size - zeros_number); - } - } - } else - memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); - - put_ih_item_len(ih, ih_item_len(ih) + paste_size); - - /* change free space */ - set_blkh_free_space(blkh, free_space - paste_size); - - do_balance_mark_leaf_dirty(bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc = - B_N_CHILD(bi->bi_parent, bi->bi_position); - put_dc_size(t_dc, dc_size(t_dc) + paste_size); - do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); - } -} - -/* - * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item - * does not have free space, so it moves DEHs and remaining records as - * necessary. Return value is size of removed part of directory item - * in bytes. - */ -static int leaf_cut_entries(struct buffer_head *bh, - struct item_head *ih, int from, int del_count) -{ - char *item; - struct reiserfs_de_head *deh; - int prev_record_offset; /* offset of record, that is (from-1)th */ - char *prev_record; /* */ - int cut_records_len; /* length of all removed records */ - int i; - - /* - * make sure that item is directory and there are enough entries to - * remove - */ - RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); - RFALSE(ih_entry_count(ih) < from + del_count, - "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", - ih_entry_count(ih), from, del_count); - - if (del_count == 0) - return 0; - - /* first byte of item */ - item = bh->b_data + ih_location(ih); - - /* entry head array */ - deh = B_I_DEH(bh, ih); - - /* - * first byte of remaining entries, those are BEFORE cut entries - * (prev_record) and length of all removed records (cut_records_len) - */ - prev_record_offset = - (from ? 
deh_location(&deh[from - 1]) : ih_item_len(ih)); - cut_records_len = prev_record_offset /*from_record */ - - deh_location(&deh[from + del_count - 1]); - prev_record = item + prev_record_offset; - - /* adjust locations of remaining entries */ - for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) - put_deh_location(&deh[i], - deh_location(&deh[i]) - - (DEH_SIZE * del_count)); - - for (i = 0; i < from; i++) - put_deh_location(&deh[i], - deh_location(&deh[i]) - (DEH_SIZE * del_count + - cut_records_len)); - - put_ih_entry_count(ih, ih_entry_count(ih) - del_count); - - /* shift entry head array and entries those are AFTER removed entries */ - memmove((char *)(deh + from), - deh + from + del_count, - prev_record - cut_records_len - (char *)(deh + from + - del_count)); - - /* shift records, those are BEFORE removed entries */ - memmove(prev_record - cut_records_len - DEH_SIZE * del_count, - prev_record, item + ih_item_len(ih) - prev_record); - - return DEH_SIZE * del_count + cut_records_len; -} - -/* - * when cut item is part of regular file - * pos_in_item - first byte that must be cut - * cut_size - number of bytes to be cut beginning from pos_in_item - * - * when cut item is part of directory - * pos_in_item - number of first deleted entry - * cut_size - count of deleted entries - */ -void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, - int pos_in_item, int cut_size) -{ - int nr; - struct buffer_head *bh = bi->bi_bh; - struct block_head *blkh; - struct item_head *ih; - int last_loc, unmoved_loc; - int i; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - - /* item head of truncated item */ - ih = item_head(bh, cut_item_num); - - if (is_direntry_le_ih(ih)) { - /* first cut entry () */ - cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); - if (pos_in_item == 0) { - /* change key */ - RFALSE(cut_item_num, - "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", - cut_item_num); - /* change item key by key of first entry in the item */ - set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); - } - } else { - /* item is direct or indirect */ - RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); - RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), - "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", - (long unsigned)pos_in_item, (long unsigned)cut_size, - (long unsigned)ih_item_len(ih)); - - /* shift item body to left if cut is from the head of item */ - if (pos_in_item == 0) { - memmove(bh->b_data + ih_location(ih), - bh->b_data + ih_location(ih) + cut_size, - ih_item_len(ih) - cut_size); - - /* change key of item */ - if (is_direct_le_ih(ih)) - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - cut_size); - else { - set_le_ih_k_offset(ih, - le_ih_k_offset(ih) + - (cut_size / UNFM_P_SIZE) * - bh->b_size); - RFALSE(ih_item_len(ih) == cut_size - && get_ih_free_space(ih), - "10205: invalid ih_free_space (%h)", ih); - } - } - } - - /* location of the last item */ - last_loc = ih_location(&ih[nr - cut_item_num - 1]); - - /* location of the item, which is remaining at the same place */ - unmoved_loc = cut_item_num ? 
ih_location(ih - 1) : bh->b_size; - - /* shift */ - memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, - unmoved_loc - last_loc - cut_size); - - /* change item length */ - put_ih_item_len(ih, ih_item_len(ih) - cut_size); - - if (is_indirect_le_ih(ih)) { - if (pos_in_item) - set_ih_free_space(ih, 0); - } - - /* change locations */ - for (i = cut_item_num; i < nr; i++) - put_ih_location(&ih[i - cut_item_num], - ih_location(&ih[i - cut_item_num]) + cut_size); - - /* size, free space */ - set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); - - do_balance_mark_leaf_dirty(bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); - put_dc_size(t_dc, dc_size(t_dc) - cut_size); - do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); - } -} - -/* delete del_num items from buffer starting from the first'th item */ -static void leaf_delete_items_entirely(struct buffer_info *bi, - int first, int del_num) -{ - struct buffer_head *bh = bi->bi_bh; - int nr; - int i, j; - int last_loc, last_removed_loc; - struct block_head *blkh; - struct item_head *ih; - - RFALSE(bh == NULL, "10210: buffer is 0"); - RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); - - if (del_num == 0) - return; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - - RFALSE(first < 0 || first + del_num > nr, - "10220: first=%d, number=%d, there is %d items", first, del_num, - nr); - - if (first == 0 && del_num == nr) { - /* this does not work */ - make_empty_node(bi); - - do_balance_mark_leaf_dirty(bi->tb, bh, 0); - return; - } - - ih = item_head(bh, first); - - /* location of unmovable item */ - j = (first == 0) ? bh->b_size : ih_location(ih - 1); - - /* delete items */ - last_loc = ih_location(&ih[nr - 1 - first]); - last_removed_loc = ih_location(&ih[del_num - 1]); - - memmove(bh->b_data + last_loc + j - last_removed_loc, - bh->b_data + last_loc, last_removed_loc - last_loc); - - /* delete item headers */ - memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); - - /* change item location */ - for (i = first; i < nr - del_num; i++) - put_ih_location(&ih[i - first], - ih_location(&ih[i - first]) + (j - - last_removed_loc)); - - /* sizes, item number */ - set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); - set_blkh_free_space(blkh, - blkh_free_space(blkh) + (j - last_removed_loc + - IH_SIZE * del_num)); - - do_balance_mark_leaf_dirty(bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc = - B_N_CHILD(bi->bi_parent, bi->bi_position); - put_dc_size(t_dc, - dc_size(t_dc) - (j - last_removed_loc + - IH_SIZE * del_num)); - do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); - } -} - -/* - * paste new_entry_count entries (new_dehs, records) into position - * before to item_num-th item - */ -void leaf_paste_entries(struct buffer_info *bi, - int item_num, - int before, - int new_entry_count, - struct reiserfs_de_head *new_dehs, - const char *records, int paste_size) -{ - struct item_head *ih; - char *item; - struct reiserfs_de_head *deh; - char *insert_point; - int i; - struct buffer_head *bh = bi->bi_bh; - - if (new_entry_count == 0) - return; - - ih = item_head(bh, item_num); - - /* - * make sure, that item is directory, and there are enough - * records in it - */ - RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); - RFALSE(ih_entry_count(ih) < before, - "10230: there are no entry we paste entries before. 
entry_count = %d, before = %d", - ih_entry_count(ih), before); - - /* first byte of dest item */ - item = bh->b_data + ih_location(ih); - - /* entry head array */ - deh = B_I_DEH(bh, ih); - - /* new records will be pasted at this point */ - insert_point = - item + - (before ? deh_location(&deh[before - 1]) - : (ih_item_len(ih) - paste_size)); - - /* adjust locations of records that will be AFTER new records */ - for (i = ih_entry_count(ih) - 1; i >= before; i--) - put_deh_location(&deh[i], - deh_location(&deh[i]) + - (DEH_SIZE * new_entry_count)); - - /* adjust locations of records that will be BEFORE new records */ - for (i = 0; i < before; i++) - put_deh_location(&deh[i], - deh_location(&deh[i]) + paste_size); - - put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); - - /* prepare space for pasted records */ - memmove(insert_point + paste_size, insert_point, - item + (ih_item_len(ih) - paste_size) - insert_point); - - /* copy new records */ - memcpy(insert_point + DEH_SIZE * new_entry_count, records, - paste_size - DEH_SIZE * new_entry_count); - - /* prepare space for new entry heads */ - deh += before; - memmove((char *)(deh + new_entry_count), deh, - insert_point - (char *)deh); - - /* copy new entry heads */ - deh = (struct reiserfs_de_head *)((char *)deh); - memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); - - /* set locations of new records */ - for (i = 0; i < new_entry_count; i++) { - put_deh_location(&deh[i], - deh_location(&deh[i]) + - (-deh_location - (&new_dehs[new_entry_count - 1]) + - insert_point + DEH_SIZE * new_entry_count - - item)); - } - - /* change item key if necessary (when we paste before 0-th entry */ - if (!before) { - set_le_ih_k_offset(ih, deh_offset(new_dehs)); - } -#ifdef CONFIG_REISERFS_CHECK - { - int prev, next; - /* check record locations */ - deh = B_I_DEH(bh, ih); - for (i = 0; i < ih_entry_count(ih); i++) { - next = - (i < - ih_entry_count(ih) - - 1) ? deh_location(&deh[i + 1]) : 0; - prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; - - if (prev && prev <= deh_location(&deh[i])) - reiserfs_error(sb_from_bi(bi), "vs-10240", - "directory item (%h) " - "corrupted (prev %a, " - "cur(%d) %a)", - ih, deh + i - 1, i, deh + i); - if (next && next >= deh_location(&deh[i])) - reiserfs_error(sb_from_bi(bi), "vs-10250", - "directory item (%h) " - "corrupted (cur(%d) %a, " - "next %a)", - ih, i, deh + i, deh + i + 1); - } - } -#endif - -} diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c deleted file mode 100644 index 46bd7bd63a71..000000000000 --- a/fs/reiserfs/lock.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "reiserfs.h" -#include <linux/mutex.h> - -/* - * The previous reiserfs locking scheme was heavily based on - * the tricky properties of the Bkl: - * - * - it was acquired recursively by a same task - * - the performances relied on the release-while-schedule() property - * - * Now that we replace it by a mutex, we still want to keep the same - * recursive property to avoid big changes in the code structure. - * We use our own lock_owner here because the owner field on a mutex - * is only available in SMP or mutex debugging, also we only need this field - * for this mutex, no need for a system wide mutex facility. - * - * Also this lock is often released before a call that could block because - * reiserfs performances were partially based on the release while schedule() - * property of the Bkl. 
- */ -void reiserfs_write_lock(struct super_block *s) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(s); - - if (sb_i->lock_owner != current) { - mutex_lock(&sb_i->lock); - sb_i->lock_owner = current; - } - - /* No need to protect it, only the current task touches it */ - sb_i->lock_depth++; -} - -void reiserfs_write_unlock(struct super_block *s) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(s); - - /* - * Are we unlocking without even holding the lock? - * Such a situation must raise a BUG() if we don't want - * to corrupt the data. - */ - BUG_ON(sb_i->lock_owner != current); - - if (--sb_i->lock_depth == -1) { - sb_i->lock_owner = NULL; - mutex_unlock(&sb_i->lock); - } -} - -int __must_check reiserfs_write_unlock_nested(struct super_block *s) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(s); - int depth; - - /* this can happen when the lock isn't always held */ - if (sb_i->lock_owner != current) - return -1; - - depth = sb_i->lock_depth; - - sb_i->lock_depth = -1; - sb_i->lock_owner = NULL; - mutex_unlock(&sb_i->lock); - - return depth; -} - -void reiserfs_write_lock_nested(struct super_block *s, int depth) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(s); - - /* this can happen when the lock isn't always held */ - if (depth == -1) - return; - - mutex_lock(&sb_i->lock); - sb_i->lock_owner = current; - sb_i->lock_depth = depth; -} - -/* - * Utility function to force a BUG if it is called without the superblock - * write lock held. caller is the string printed just before calling BUG() - */ -void reiserfs_check_lock_depth(struct super_block *sb, char *caller) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); - - WARN_ON(sb_i->lock_depth < 0); -} - -#ifdef CONFIG_REISERFS_CHECK -void reiserfs_lock_check_recursive(struct super_block *sb) -{ - struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); - - WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); -} -#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c deleted file mode 100644 index 7e7b531fcc49..000000000000 --- a/fs/reiserfs/namei.c +++ /dev/null @@ -1,1725 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - * - * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility - * - * Trivial Changes: - * Rights granted to Hans Reiser to redistribute under other terms providing - * he accepts all liability including but not limited to patent, fitness - * for purpose, and direct or indirect claims arising from failure to perform. - * - * NO WARRANTY - */ - -#include <linux/time.h> -#include <linux/bitops.h> -#include <linux/slab.h> -#include "reiserfs.h" -#include "acl.h" -#include "xattr.h" -#include <linux/quotaops.h> - -#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } -#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); - -/* - * directory item contains array of entry headers. 
This performs - * binary search through that array - */ -static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) -{ - struct item_head *ih = de->de_ih; - struct reiserfs_de_head *deh = de->de_deh; - int rbound, lbound, j; - - lbound = 0; - rbound = ih_entry_count(ih) - 1; - - for (j = (rbound + lbound) / 2; lbound <= rbound; - j = (rbound + lbound) / 2) { - if (off < deh_offset(deh + j)) { - rbound = j - 1; - continue; - } - if (off > deh_offset(deh + j)) { - lbound = j + 1; - continue; - } - /* this is not name found, but matched third key component */ - de->de_entry_num = j; - return NAME_FOUND; - } - - de->de_entry_num = lbound; - return NAME_NOT_FOUND; -} - -/* - * comment? maybe something like set de to point to what the path points to? - */ -static inline void set_de_item_location(struct reiserfs_dir_entry *de, - struct treepath *path) -{ - de->de_bh = get_last_bh(path); - de->de_ih = tp_item_head(path); - de->de_deh = B_I_DEH(de->de_bh, de->de_ih); - de->de_item_num = PATH_LAST_POSITION(path); -} - -/* - * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set - */ -inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) -{ - struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; - - BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); - - de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); - de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); - de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh); - if (de->de_name[de->de_namelen - 1] == 0) - de->de_namelen = strlen(de->de_name); -} - -/* what entry points to */ -static inline void set_de_object_key(struct reiserfs_dir_entry *de) -{ - BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); - de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]); - de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]); -} - -static inline void store_de_entry_key(struct reiserfs_dir_entry *de) -{ - struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; - - BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); - - /* store key of the found entry */ - de->de_entry_key.version = KEY_FORMAT_3_5; - de->de_entry_key.on_disk_key.k_dir_id = - le32_to_cpu(de->de_ih->ih_key.k_dir_id); - de->de_entry_key.on_disk_key.k_objectid = - le32_to_cpu(de->de_ih->ih_key.k_objectid); - set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh)); - set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY); -} - -/* - * We assign a key to each directory item, and place multiple entries in a - * single directory item. A directory item has a key equal to the key of - * the first directory entry in it. - - * This function first calls search_by_key, then, if item whose first entry - * matches is not found it looks for the entry inside directory item found - * by search_by_key. Fills the path to the entry, and to the entry position - * in the item - */ -/* The function is NOT SCHEDULE-SAFE! 
*/ -int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, - struct treepath *path, struct reiserfs_dir_entry *de) -{ - int retval; - - retval = search_item(sb, key, path); - switch (retval) { - case ITEM_NOT_FOUND: - if (!PATH_LAST_POSITION(path)) { - reiserfs_error(sb, "vs-7000", "search_by_key " - "returned item position == 0"); - pathrelse(path); - return IO_ERROR; - } - PATH_LAST_POSITION(path)--; - break; - - case ITEM_FOUND: - break; - - case IO_ERROR: - return retval; - - default: - pathrelse(path); - reiserfs_error(sb, "vs-7002", "no path to here"); - return IO_ERROR; - } - - set_de_item_location(de, path); - -#ifdef CONFIG_REISERFS_CHECK - if (!is_direntry_le_ih(de->de_ih) || - COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) { - print_block(de->de_bh, 0, -1, -1); - reiserfs_panic(sb, "vs-7005", "found item %h is not directory " - "item or does not belong to the same directory " - "as key %K", de->de_ih, key); - } -#endif /* CONFIG_REISERFS_CHECK */ - - /* - * binary search in directory item by third component of the - * key. sets de->de_entry_num of de - */ - retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); - path->pos_in_item = de->de_entry_num; - if (retval != NAME_NOT_FOUND) { - /* - * ugly, but rename needs de_bh, de_deh, de_name, - * de_namelen, de_objectid set - */ - set_de_name_and_namelen(de); - set_de_object_key(de); - } - return retval; -} - -/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ - -/* - * The third component is hashed, and you can choose from more than - * one hash function. Per directory hashes are not yet implemented - * but are thought about. This function should be moved to hashes.c - * Jedi, please do so. -Hans - */ -static __u32 get_third_component(struct super_block *s, - const char *name, int len) -{ - __u32 res; - - if (!len || (len == 1 && name[0] == '.')) - return DOT_OFFSET; - if (len == 2 && name[0] == '.' && name[1] == '.') - return DOT_DOT_OFFSET; - - res = REISERFS_SB(s)->s_hash_function(name, len); - - /* take bits from 7-th to 30-th including both bounds */ - res = GET_HASH_VALUE(res); - if (res == 0) - /* - * needed to have no names before "." and ".." those have hash - * value == 0 and generation conters 1 and 2 accordingly - */ - res = 128; - return res + MAX_GENERATION_NUMBER; -} - -static int reiserfs_match(struct reiserfs_dir_entry *de, - const char *name, int namelen) -{ - int retval = NAME_NOT_FOUND; - - if ((namelen == de->de_namelen) && - !memcmp(de->de_name, name, de->de_namelen)) - retval = - (de_visible(de->de_deh + de->de_entry_num) ? 
NAME_FOUND : - NAME_FOUND_INVISIBLE); - - return retval; -} - -/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ - -/* used when hash collisions exist */ - -static int linear_search_in_dir_item(struct cpu_key *key, - struct reiserfs_dir_entry *de, - const char *name, int namelen) -{ - struct reiserfs_de_head *deh = de->de_deh; - int retval; - int i; - - i = de->de_entry_num; - - if (i == ih_entry_count(de->de_ih) || - GET_HASH_VALUE(deh_offset(deh + i)) != - GET_HASH_VALUE(cpu_key_k_offset(key))) { - i--; - } - - RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), - "vs-7010: array of entry headers not found"); - - deh += i; - - for (; i >= 0; i--, deh--) { - /* hash value does not match, no need to check whole name */ - if (GET_HASH_VALUE(deh_offset(deh)) != - GET_HASH_VALUE(cpu_key_k_offset(key))) { - return NAME_NOT_FOUND; - } - - /* mark that this generation number is used */ - if (de->de_gen_number_bit_string) - set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), - de->de_gen_number_bit_string); - - /* calculate pointer to name and namelen */ - de->de_entry_num = i; - set_de_name_and_namelen(de); - - /* - * de's de_name, de_namelen, de_recordlen are set. - * Fill the rest. - */ - if ((retval = - reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { - - /* key of pointed object */ - set_de_object_key(de); - - store_de_entry_key(de); - - /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ - return retval; - } - } - - if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) - /* - * we have reached left most entry in the node. In common we - * have to go to the left neighbor, but if generation counter - * is 0 already, we know for sure, that there is no name with - * the same hash value - */ - /* - * FIXME: this work correctly only because hash value can not - * be 0. Btw, in case of Yura's hash it is probably possible, - * so, this is a bug - */ - return NAME_NOT_FOUND; - - RFALSE(de->de_item_num, - "vs-7015: two diritems of the same directory in one node?"); - - return GOTO_PREVIOUS_ITEM; -} - -/* - * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND - * FIXME: should add something like IOERROR - */ -static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, - struct treepath *path_to_entry, - struct reiserfs_dir_entry *de) -{ - struct cpu_key key_to_search; - int retval; - - if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) - return NAME_NOT_FOUND; - - /* we will search for this key in the tree */ - make_cpu_key(&key_to_search, dir, - get_third_component(dir->i_sb, name, namelen), - TYPE_DIRENTRY, 3); - - while (1) { - retval = - search_by_entry_key(dir->i_sb, &key_to_search, - path_to_entry, de); - if (retval == IO_ERROR) { - reiserfs_error(dir->i_sb, "zam-7001", "io error"); - return IO_ERROR; - } - - /* compare names for all entries having given hash value */ - retval = - linear_search_in_dir_item(&key_to_search, de, name, - namelen); - /* - * there is no need to scan directory anymore. 
- * Given entry found or does not exist - */ - if (retval != GOTO_PREVIOUS_ITEM) { - path_to_entry->pos_in_item = de->de_entry_num; - return retval; - } - - /* - * there is left neighboring item of this directory - * and given entry can be there - */ - set_cpu_key_k_offset(&key_to_search, - le_ih_k_offset(de->de_ih) - 1); - pathrelse(path_to_entry); - - } /* while (1) */ -} - -static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - int retval; - struct inode *inode = NULL; - struct reiserfs_dir_entry de; - INITIALIZE_PATH(path_to_entry); - - if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) - return ERR_PTR(-ENAMETOOLONG); - - reiserfs_write_lock(dir->i_sb); - - de.de_gen_number_bit_string = NULL; - retval = - reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, - &path_to_entry, &de); - pathrelse(&path_to_entry); - if (retval == NAME_FOUND) { - inode = reiserfs_iget(dir->i_sb, - (struct cpu_key *)&de.de_dir_id); - if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-EACCES); - } - - /* - * Propagate the private flag so we know we're - * in the priv tree. Also clear xattr support - * since we don't have xattrs on xattr files. - */ - if (IS_PRIVATE(dir)) - reiserfs_init_priv_inode(inode); - } - reiserfs_write_unlock(dir->i_sb); - if (retval == IO_ERROR) { - return ERR_PTR(-EIO); - } - - return d_splice_alias(inode, dentry); -} - -/* - * looks up the dentry of the parent directory for child. - * taken from ext2_get_parent - */ -struct dentry *reiserfs_get_parent(struct dentry *child) -{ - int retval; - struct inode *inode = NULL; - struct reiserfs_dir_entry de; - INITIALIZE_PATH(path_to_entry); - struct inode *dir = d_inode(child); - - if (dir->i_nlink == 0) { - return ERR_PTR(-ENOENT); - } - de.de_gen_number_bit_string = NULL; - - reiserfs_write_lock(dir->i_sb); - retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); - pathrelse(&path_to_entry); - if (retval != NAME_FOUND) { - reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-ENOENT); - } - inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id); - reiserfs_write_unlock(dir->i_sb); - - return d_obtain_alias(inode); -} - -/* add entry to the directory (entry can be hidden). - -insert definition of when hidden directories are used here -Hans - - Does not mark dir inode dirty, do it after successesfull call to it */ - -static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, - struct inode *dir, const char *name, int namelen, - struct inode *inode, int visible) -{ - struct cpu_key entry_key; - struct reiserfs_de_head *deh; - INITIALIZE_PATH(path); - struct reiserfs_dir_entry de; - DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); - int gen_number; - - /* - * 48 bytes now and we avoid kmalloc if we - * create file with short name - */ - char small_buf[32 + DEH_SIZE]; - - char *buffer; - int buflen, paste_size; - int retval; - - BUG_ON(!th->t_trans_id); - - /* each entry has unique key. compose it */ - make_cpu_key(&entry_key, dir, - get_third_component(dir->i_sb, name, namelen), - TYPE_DIRENTRY, 3); - - /* get memory for composing the entry */ - buflen = DEH_SIZE + ROUND_UP(namelen); - if (buflen > sizeof(small_buf)) { - buffer = kmalloc(buflen, GFP_NOFS); - if (!buffer) - return -ENOMEM; - } else - buffer = small_buf; - - paste_size = - (get_inode_sd_version(dir) == - STAT_DATA_V1) ? 
(DEH_SIZE + namelen) : buflen; - - /* - * fill buffer : directory entry head, name[, dir objectid | , - * stat data | ,stat data, dir objectid ] - */ - deh = (struct reiserfs_de_head *)buffer; - deh->deh_location = 0; /* JDM Endian safe if 0 */ - put_deh_offset(deh, cpu_key_k_offset(&entry_key)); - deh->deh_state = 0; /* JDM Endian safe if 0 */ - /* put key (ino analog) to de */ - - /* safe: k_dir_id is le */ - deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; - /* safe: k_objectid is le */ - deh->deh_objectid = INODE_PKEY(inode)->k_objectid; - - /* copy name */ - memcpy((char *)(deh + 1), name, namelen); - /* padd by 0s to the 4 byte boundary */ - padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); - - /* - * entry is ready to be pasted into tree, set 'visibility' - * and 'stat data in entry' attributes - */ - mark_de_without_sd(deh); - visible ? mark_de_visible(deh) : mark_de_hidden(deh); - - /* find the proper place for the new entry */ - memset(bit_string, 0, sizeof(bit_string)); - de.de_gen_number_bit_string = bit_string; - retval = reiserfs_find_entry(dir, name, namelen, &path, &de); - if (retval != NAME_NOT_FOUND) { - if (buffer != small_buf) - kfree(buffer); - pathrelse(&path); - - if (retval == IO_ERROR) { - return -EIO; - } - - if (retval != NAME_FOUND) { - reiserfs_error(dir->i_sb, "zam-7002", - "reiserfs_find_entry() returned " - "unexpected value (%d)", retval); - } - - return -EEXIST; - } - - gen_number = - find_first_zero_bit(bit_string, - MAX_GENERATION_NUMBER + 1); - if (gen_number > MAX_GENERATION_NUMBER) { - /* there is no free generation number */ - reiserfs_warning(dir->i_sb, "reiserfs-7010", - "Congratulations! we have got hash function " - "screwed up"); - if (buffer != small_buf) - kfree(buffer); - pathrelse(&path); - return -EBUSY; - } - /* adjust offset of directory enrty */ - put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); - set_cpu_key_k_offset(&entry_key, deh_offset(deh)); - - /* update max-hash-collisions counter in reiserfs_sb_info */ - PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); - - /* we need to re-search for the insertion point */ - if (gen_number != 0) { - if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != - NAME_NOT_FOUND) { - reiserfs_warning(dir->i_sb, "vs-7032", - "entry with this key (%K) already " - "exists", &entry_key); - - if (buffer != small_buf) - kfree(buffer); - pathrelse(&path); - return -EBUSY; - } - } - - /* perform the insertion of the entry that we have prepared */ - retval = - reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, - paste_size); - if (buffer != small_buf) - kfree(buffer); - if (retval) { - reiserfs_check_path(&path); - return retval; - } - - dir->i_size += paste_size; - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - if (!S_ISDIR(inode->i_mode) && visible) - /* reiserfs_mkdir or reiserfs_rename will do that by itself */ - reiserfs_update_sd(th, dir); - - reiserfs_check_path(&path); - return 0; -} - -/* - * quota utility function, call if you've had to abort after calling - * new_inode_init, and have not called reiserfs_new_inode yet. - * This should only be called on inodes that do not have stat data - * inserted into the tree yet. - */ -static int drop_new_inode(struct inode *inode) -{ - dquot_drop(inode); - make_bad_inode(inode); - inode->i_flags |= S_NOQUOTA; - iput(inode); - return 0; -} - -/* - * utility function that does setup for reiserfs_new_inode. 
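A sketch of the collision handling performed by reiserfs_add_entry() above: reiserfs_find_entry() sets a bit in bit_string for every generation number already used with this hash, and the new entry takes the first clear one. The helper name is hypothetical; the bitmap call is the kernel API used in the code above:

    /* illustrative only; bit_string is the DECLARE_BITMAP() filled during the lookup */
    static int example_pick_generation(const unsigned long *bit_string)
    {
            int gen = find_first_zero_bit(bit_string, MAX_GENERATION_NUMBER + 1);

            if (gen > MAX_GENERATION_NUMBER)
                    return -EBUSY;  /* every generation for this hash is already taken */
            return gen;             /* folded into the offset via SET_GENERATION_NUMBER() */
    }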
- * dquot_initialize needs lots of credits so it's better to have it - * outside of a transaction, so we had to pull some bits of - * reiserfs_new_inode out into this func. - */ -static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) -{ - /* - * Make inode invalid - just in case we are going to drop it before - * the initialization happens - */ - INODE_PKEY(inode)->k_objectid = 0; - - /* - * the quota init calls have to know who to charge the quota to, so - * we have to set uid and gid here - */ - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - return dquot_initialize(inode); -} - -static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) -{ - int retval; - struct inode *inode; - /* - * We need blocks for transaction + (user+group)*(quotas - * for new inode + update of quota for directory owner) - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 2 + - 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + - REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - struct reiserfs_transaction_handle th; - struct reiserfs_security_handle security; - - retval = dquot_initialize(dir); - if (retval) - return retval; - - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM; - } - retval = new_inode_init(inode, dir, mode); - if (retval) { - drop_new_inode(inode); - return retval; - } - - jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); - if (retval < 0) { - drop_new_inode(inode); - return retval; - } - jbegin_count += retval; - reiserfs_write_lock(dir->i_sb); - - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) { - drop_new_inode(inode); - goto out_failed; - } - - retval = - reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, - inode, &security); - if (retval) - goto out_failed; - - inode->i_op = &reiserfs_file_inode_operations; - inode->i_fop = &reiserfs_file_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - - retval = - reiserfs_add_entry(&th, dir, dentry->d_name.name, - dentry->d_name.len, inode, 1 /*visible */ ); - if (retval) { - int err; - drop_nlink(inode); - reiserfs_update_sd(&th, inode); - err = journal_end(&th); - if (err) - retval = err; - unlock_new_inode(inode); - iput(inode); - goto out_failed; - } - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - d_instantiate_new(dentry, inode); - retval = journal_end(&th); - -out_failed: - reiserfs_write_unlock(dir->i_sb); - reiserfs_security_free(&security); - return retval; -} - -static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) -{ - int retval; - struct inode *inode; - struct reiserfs_transaction_handle th; - struct reiserfs_security_handle security; - /* - * We need blocks for transaction + (user+group)*(quotas - * for new inode + update of quota for directory owner) - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + - 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + - REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - - retval = dquot_initialize(dir); - if (retval) - return retval; - - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM; - } - retval = new_inode_init(inode, dir, mode); - if (retval) { - drop_new_inode(inode); - return retval; - } - - jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); - if (retval < 0) { - drop_new_inode(inode); - return 
retval; - } - jbegin_count += retval; - reiserfs_write_lock(dir->i_sb); - - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) { - drop_new_inode(inode); - goto out_failed; - } - - retval = - reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, - inode, &security); - if (retval) { - goto out_failed; - } - - inode->i_op = &reiserfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - - /* FIXME: needed for block and char devices only */ - reiserfs_update_sd(&th, inode); - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - retval = - reiserfs_add_entry(&th, dir, dentry->d_name.name, - dentry->d_name.len, inode, 1 /*visible */ ); - if (retval) { - int err; - drop_nlink(inode); - reiserfs_update_sd(&th, inode); - err = journal_end(&th); - if (err) - retval = err; - unlock_new_inode(inode); - iput(inode); - goto out_failed; - } - - d_instantiate_new(dentry, inode); - retval = journal_end(&th); - -out_failed: - reiserfs_write_unlock(dir->i_sb); - reiserfs_security_free(&security); - return retval; -} - -static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - int retval; - struct inode *inode; - struct reiserfs_transaction_handle th; - struct reiserfs_security_handle security; - /* - * We need blocks for transaction + (user+group)*(quotas - * for new inode + update of quota for directory owner) - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + - 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + - REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); - - retval = dquot_initialize(dir); - if (retval) - return retval; - -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* - * set flag that new packing locality created and new blocks - * for the content of that directory are not displaced yet - */ - REISERFS_I(dir)->new_packing_locality = 1; -#endif - mode = S_IFDIR | mode; - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM; - } - retval = new_inode_init(inode, dir, mode); - if (retval) { - drop_new_inode(inode); - return retval; - } - - jbegin_count += reiserfs_cache_default_acl(dir); - retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security); - if (retval < 0) { - drop_new_inode(inode); - return retval; - } - jbegin_count += retval; - reiserfs_write_lock(dir->i_sb); - - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) { - drop_new_inode(inode); - goto out_failed; - } - - /* - * inc the link count now, so another writer doesn't overflow - * it while we sleep later on. - */ - INC_DIR_INODE_NLINK(dir) - - retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */, - old_format_only(dir->i_sb) ? 
- EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode, &security); - if (retval) { - DEC_DIR_INODE_NLINK(dir) - goto out_failed; - } - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - inode->i_op = &reiserfs_dir_inode_operations; - inode->i_fop = &reiserfs_dir_operations; - - /* note, _this_ add_entry will not update dir's stat data */ - retval = - reiserfs_add_entry(&th, dir, dentry->d_name.name, - dentry->d_name.len, inode, 1 /*visible */ ); - if (retval) { - int err; - clear_nlink(inode); - DEC_DIR_INODE_NLINK(dir); - reiserfs_update_sd(&th, inode); - err = journal_end(&th); - if (err) - retval = err; - unlock_new_inode(inode); - iput(inode); - goto out_failed; - } - /* the above add_entry did not update dir's stat data */ - reiserfs_update_sd(&th, dir); - - d_instantiate_new(dentry, inode); - retval = journal_end(&th); -out_failed: - reiserfs_write_unlock(dir->i_sb); - reiserfs_security_free(&security); - return retval; -} - -static inline int reiserfs_empty_dir(struct inode *inode) -{ - /* - * we can cheat because an old format dir cannot have - * EMPTY_DIR_SIZE, and a new format dir cannot have - * EMPTY_DIR_SIZE_V1. So, if the inode is either size, - * regardless of disk format version, the directory is empty. - */ - if (inode->i_size != EMPTY_DIR_SIZE && - inode->i_size != EMPTY_DIR_SIZE_V1) { - return 0; - } - return 1; -} - -static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int retval, err; - struct inode *inode; - struct reiserfs_transaction_handle th; - int jbegin_count; - INITIALIZE_PATH(path); - struct reiserfs_dir_entry de; - - /* - * we will be doing 2 balancings and update 2 stat data, we - * change quotas of the owner of the directory and of the owner - * of the parent directory. 
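A brief worked example of the directory link counting behind INC_DIR_INODE_NLINK and DEC_DIR_INODE_NLINK above, following ordinary Unix nlink semantics rather than anything specific to this patch: a directory's link count is 2 plus its number of subdirectories, which is why reiserfs_mkdir() bumps the parent before inserting the child entry and reiserfs_rmdir() drops it again.

    /* after: mkdir a; mkdir a/b; mkdir a/c
     *   a->i_nlink == 4   (entry for "a" in its parent, "a/.", "a/b/..", "a/c/..")
     *   b->i_nlink == 2   (entry for "b" in a, plus "b/.")
     */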
The quota structure is possibly - * deleted only on last iput => outside of this transaction - */ - jbegin_count = - JOURNAL_PER_BALANCE_CNT * 2 + 2 + - 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - - retval = dquot_initialize(dir); - if (retval) - return retval; - - reiserfs_write_lock(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) - goto out_rmdir; - - de.de_gen_number_bit_string = NULL; - if ((retval = - reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, - &path, &de)) == NAME_NOT_FOUND) { - retval = -ENOENT; - goto end_rmdir; - } else if (retval == IO_ERROR) { - retval = -EIO; - goto end_rmdir; - } - - inode = d_inode(dentry); - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - if (de.de_objectid != inode->i_ino) { - /* - * FIXME: compare key of an object and a key found in the entry - */ - retval = -EIO; - goto end_rmdir; - } - if (!reiserfs_empty_dir(inode)) { - retval = -ENOTEMPTY; - goto end_rmdir; - } - - /* cut entry from dir directory */ - retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key, - dir, NULL, /* page */ - 0 /*new file size - not used here */ ); - if (retval < 0) - goto end_rmdir; - - if (inode->i_nlink != 2 && inode->i_nlink != 1) - reiserfs_error(inode->i_sb, "reiserfs-7040", - "empty directory has nlink != 2 (%d)", - inode->i_nlink); - - clear_nlink(inode); - inode_set_mtime_to_ts(dir, - inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); - reiserfs_update_sd(&th, inode); - - DEC_DIR_INODE_NLINK(dir) - dir->i_size -= (DEH_SIZE + de.de_entrylen); - reiserfs_update_sd(&th, dir); - - /* prevent empty directory from getting lost */ - add_save_link(&th, inode, 0 /* not truncate */ ); - - retval = journal_end(&th); - reiserfs_check_path(&path); -out_rmdir: - reiserfs_write_unlock(dir->i_sb); - return retval; - -end_rmdir: - /* - * we must release path, because we did not call - * reiserfs_cut_from_item, or reiserfs_cut_from_item does not - * release path if operation was not complete - */ - pathrelse(&path); - err = journal_end(&th); - reiserfs_write_unlock(dir->i_sb); - return err ? err : retval; -} - -static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) -{ - int retval, err; - struct inode *inode; - struct reiserfs_dir_entry de; - INITIALIZE_PATH(path); - struct reiserfs_transaction_handle th; - int jbegin_count; - unsigned long savelink; - - retval = dquot_initialize(dir); - if (retval) - return retval; - - inode = d_inode(dentry); - - /* - * in this transaction we can be doing at max two balancings and - * update two stat datas, we change quotas of the owner of the - * directory and of the owner of the parent directory. 
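For reference, a compact restatement of how the journal credit estimate used by rmdir and unlink above is assembled; the expression is copied from the surrounding code, and the factor of four quota blocks is taken as-is rather than re-derived:

    /* illustrative only: two balancings, two stat-data updates, quota blocks */
    static int example_unlink_credits(struct super_block *sb)
    {
            return JOURNAL_PER_BALANCE_CNT * 2 + 2 +
                   4 * REISERFS_QUOTA_TRANS_BLOCKS(sb);
    }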
The quota - * structure is possibly deleted only on iput => outside of - * this transaction - */ - jbegin_count = - JOURNAL_PER_BALANCE_CNT * 2 + 2 + - 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - - reiserfs_write_lock(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) - goto out_unlink; - - de.de_gen_number_bit_string = NULL; - if ((retval = - reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, - &path, &de)) == NAME_NOT_FOUND) { - retval = -ENOENT; - goto end_unlink; - } else if (retval == IO_ERROR) { - retval = -EIO; - goto end_unlink; - } - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - if (de.de_objectid != inode->i_ino) { - /* - * FIXME: compare key of an object and a key found in the entry - */ - retval = -EIO; - goto end_unlink; - } - - if (!inode->i_nlink) { - reiserfs_warning(inode->i_sb, "reiserfs-7042", - "deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); - set_nlink(inode, 1); - } - - drop_nlink(inode); - - /* - * we schedule before doing the add_save_link call, save the link - * count so we don't race - */ - savelink = inode->i_nlink; - - retval = - reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL, - 0); - if (retval < 0) { - inc_nlink(inode); - goto end_unlink; - } - inode_set_ctime_current(inode); - reiserfs_update_sd(&th, inode); - - dir->i_size -= (de.de_entrylen + DEH_SIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - reiserfs_update_sd(&th, dir); - - if (!savelink) - /* prevent file from getting lost */ - add_save_link(&th, inode, 0 /* not truncate */ ); - - retval = journal_end(&th); - reiserfs_check_path(&path); - reiserfs_write_unlock(dir->i_sb); - return retval; - -end_unlink: - pathrelse(&path); - err = journal_end(&th); - reiserfs_check_path(&path); - if (err) - retval = err; -out_unlink: - reiserfs_write_unlock(dir->i_sb); - return retval; -} - -static int reiserfs_symlink(struct mnt_idmap *idmap, - struct inode *parent_dir, struct dentry *dentry, - const char *symname) -{ - int retval; - struct inode *inode; - char *name; - int item_len; - struct reiserfs_transaction_handle th; - struct reiserfs_security_handle security; - int mode = S_IFLNK | S_IRWXUGO; - /* - * We need blocks for transaction + (user+group)*(quotas for - * new inode + update of quota for directory owner) - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + - 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + - REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); - - retval = dquot_initialize(parent_dir); - if (retval) - return retval; - - if (!(inode = new_inode(parent_dir->i_sb))) { - return -ENOMEM; - } - retval = new_inode_init(inode, parent_dir, mode); - if (retval) { - drop_new_inode(inode); - return retval; - } - - retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name, - &security); - if (retval < 0) { - drop_new_inode(inode); - return retval; - } - jbegin_count += retval; - - reiserfs_write_lock(parent_dir->i_sb); - item_len = ROUND_UP(strlen(symname)); - if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { - retval = -ENAMETOOLONG; - drop_new_inode(inode); - goto out_failed; - } - - name = kmalloc(item_len, GFP_NOFS); - if (!name) { - drop_new_inode(inode); - retval = -ENOMEM; - goto out_failed; - } - memcpy(name, symname, strlen(symname)); - padd_item(name, item_len, strlen(symname)); - - retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); - if (retval) { - drop_new_inode(inode); - kfree(name); - goto 
out_failed; - } - - retval = - reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), - dentry, inode, &security); - kfree(name); - if (retval) { /* reiserfs_new_inode iputs for us */ - goto out_failed; - } - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(parent_dir); - - inode->i_op = &reiserfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - - retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, - dentry->d_name.len, inode, 1 /*visible */ ); - if (retval) { - int err; - drop_nlink(inode); - reiserfs_update_sd(&th, inode); - err = journal_end(&th); - if (err) - retval = err; - unlock_new_inode(inode); - iput(inode); - goto out_failed; - } - - d_instantiate_new(dentry, inode); - retval = journal_end(&th); -out_failed: - reiserfs_write_unlock(parent_dir->i_sb); - reiserfs_security_free(&security); - return retval; -} - -static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - int retval; - struct inode *inode = d_inode(old_dentry); - struct reiserfs_transaction_handle th; - /* - * We need blocks for transaction + update of quotas for - * the owners of the directory - */ - int jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + - 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - - retval = dquot_initialize(dir); - if (retval) - return retval; - - reiserfs_write_lock(dir->i_sb); - if (inode->i_nlink >= REISERFS_LINK_MAX) { - /* FIXME: sd_nlink is 32 bit for new files */ - reiserfs_write_unlock(dir->i_sb); - return -EMLINK; - } - - /* inc before scheduling so reiserfs_unlink knows we are here */ - inc_nlink(inode); - - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) { - drop_nlink(inode); - reiserfs_write_unlock(dir->i_sb); - return retval; - } - - /* create new entry */ - retval = - reiserfs_add_entry(&th, dir, dentry->d_name.name, - dentry->d_name.len, inode, 1 /*visible */ ); - - reiserfs_update_inode_transaction(inode); - reiserfs_update_inode_transaction(dir); - - if (retval) { - int err; - drop_nlink(inode); - err = journal_end(&th); - reiserfs_write_unlock(dir->i_sb); - return err ? err : retval; - } - - inode_set_ctime_current(inode); - reiserfs_update_sd(&th, inode); - - ihold(inode); - d_instantiate(dentry, inode); - retval = journal_end(&th); - reiserfs_write_unlock(dir->i_sb); - return retval; -} - -/* de contains information pointing to an entry which */ -static int de_still_valid(const char *name, int len, - struct reiserfs_dir_entry *de) -{ - struct reiserfs_dir_entry tmp = *de; - - /* recalculate pointer to name and name length */ - set_de_name_and_namelen(&tmp); - /* FIXME: could check more */ - if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) - return 0; - return 1; -} - -static int entry_points_to_object(const char *name, int len, - struct reiserfs_dir_entry *de, - struct inode *inode) -{ - if (!de_still_valid(name, len, de)) - return 0; - - if (inode) { - if (!de_visible(de->de_deh + de->de_entry_num)) - reiserfs_panic(inode->i_sb, "vs-7042", - "entry must be visible"); - return (de->de_objectid == inode->i_ino) ? 
1 : 0; - } - - /* this must be added hidden entry */ - if (de_visible(de->de_deh + de->de_entry_num)) - reiserfs_panic(NULL, "vs-7043", "entry must be visible"); - - return 1; -} - -/* sets key of objectid the entry has to point to */ -static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, - struct reiserfs_key *key) -{ - /* JDM These operations are endian safe - both are le */ - de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; - de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; -} - -/* - * process, that is going to call fix_nodes/do_balance must hold only - * one path. If it holds 2 or more, it can get into endless waiting in - * get_empty_nodes or its clones - */ -static int reiserfs_rename(struct mnt_idmap *idmap, - struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - int retval; - INITIALIZE_PATH(old_entry_path); - INITIALIZE_PATH(new_entry_path); - INITIALIZE_PATH(dot_dot_entry_path); - struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; - struct reiserfs_dir_entry old_de, new_de, dot_dot_de; - struct inode *old_inode, *new_dentry_inode; - struct reiserfs_transaction_handle th; - int jbegin_count; - unsigned long savelink = 1; - bool update_dir_parent = false; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - /* - * three balancings: (1) old name removal, (2) new name insertion - * and (3) maybe "save" link insertion - * stat data updates: (1) old directory, - * (2) new directory and (3) maybe old object stat data (when it is - * directory) and (4) maybe stat data of object to which new entry - * pointed initially and (5) maybe block containing ".." of - * renamed directory - * quota updates: two parent directories - */ - jbegin_count = - JOURNAL_PER_BALANCE_CNT * 3 + 5 + - 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); - - retval = dquot_initialize(old_dir); - if (retval) - return retval; - retval = dquot_initialize(new_dir); - if (retval) - return retval; - - old_inode = d_inode(old_dentry); - new_dentry_inode = d_inode(new_dentry); - - /* - * make sure that oldname still exists and points to an object we - * are going to rename - */ - old_de.de_gen_number_bit_string = NULL; - reiserfs_write_lock(old_dir->i_sb); - retval = - reiserfs_find_entry(old_dir, old_dentry->d_name.name, - old_dentry->d_name.len, &old_entry_path, - &old_de); - pathrelse(&old_entry_path); - if (retval == IO_ERROR) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - - if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -ENOENT; - } - - if (S_ISDIR(old_inode->i_mode)) { - /* - * make sure that directory being renamed has correct ".." - * and that its new parent directory has not too many links - * already - */ - if (new_dentry_inode) { - if (!reiserfs_empty_dir(new_dentry_inode)) { - reiserfs_write_unlock(old_dir->i_sb); - return -ENOTEMPTY; - } - } - - if (old_dir != new_dir) { - /* - * directory is renamed, its parent directory will be - * changed, so find ".." entry - */ - dot_dot_de.de_gen_number_bit_string = NULL; - retval = - reiserfs_find_entry(old_inode, "..", 2, - &dot_dot_entry_path, - &dot_dot_de); - pathrelse(&dot_dot_entry_path); - if (retval != NAME_FOUND) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - - /* inode number of .. 
must equal old_dir->i_ino */ - if (dot_dot_de.de_objectid != old_dir->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - update_dir_parent = true; - } - } - - retval = journal_begin(&th, old_dir->i_sb, jbegin_count); - if (retval) { - reiserfs_write_unlock(old_dir->i_sb); - return retval; - } - - /* add new entry (or find the existing one) */ - retval = - reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, - new_dentry->d_name.len, old_inode, 0); - if (retval == -EEXIST) { - if (!new_dentry_inode) { - reiserfs_panic(old_dir->i_sb, "vs-7050", - "new entry is found, new inode == 0"); - } - } else if (retval) { - int err = journal_end(&th); - reiserfs_write_unlock(old_dir->i_sb); - return err ? err : retval; - } - - reiserfs_update_inode_transaction(old_dir); - reiserfs_update_inode_transaction(new_dir); - - /* - * this makes it so an fsync on an open fd for the old name will - * commit the rename operation - */ - reiserfs_update_inode_transaction(old_inode); - - if (new_dentry_inode) - reiserfs_update_inode_transaction(new_dentry_inode); - - while (1) { - /* - * look for old name using corresponding entry key - * (found by reiserfs_find_entry) - */ - if ((retval = - search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, - &old_entry_path, - &old_de)) != NAME_FOUND) { - pathrelse(&old_entry_path); - journal_end(&th); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - - copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path)); - - reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); - - /* look for new name by reiserfs_find_entry */ - new_de.de_gen_number_bit_string = NULL; - retval = - reiserfs_find_entry(new_dir, new_dentry->d_name.name, - new_dentry->d_name.len, &new_entry_path, - &new_de); - /* - * reiserfs_add_entry should not return IO_ERROR, - * because it is called with essentially same parameters from - * reiserfs_add_entry above, and we'll catch any i/o errors - * before we get here. - */ - if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { - pathrelse(&new_entry_path); - pathrelse(&old_entry_path); - journal_end(&th); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - - copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path)); - - reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); - - if (update_dir_parent) { - if ((retval = - search_by_entry_key(new_dir->i_sb, - &dot_dot_de.de_entry_key, - &dot_dot_entry_path, - &dot_dot_de)) != NAME_FOUND) { - pathrelse(&dot_dot_entry_path); - pathrelse(&new_entry_path); - pathrelse(&old_entry_path); - journal_end(&th); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - copy_item_head(&dot_dot_ih, - tp_item_head(&dot_dot_entry_path)); - /* node containing ".." gets into transaction */ - reiserfs_prepare_for_journal(old_inode->i_sb, - dot_dot_de.de_bh, 1); - } - /* - * we should check seals here, not do - * this stuff, yes? Then, having - * gathered everything into RAM we - * should lock the buffers, yes? -Hans - */ - /* - * probably. our rename needs to hold more - * than one path at once. The seals would - * have to be written to deal with multi-path - * issues -chris - */ - /* - * sanity checking before doing the rename - avoid races many - * of the above checks could have scheduled. We have to be - * sure our items haven't been shifted by another process. 
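The while (1) loop here follows an optimistic pattern: re-find both entries, journal-prepare their buffers, then verify that no concurrent tree balance moved anything before the entries are actually modified; on any mismatch the prepared buffers are restored and the whole step is retried. A compressed sketch of that shape, with hypothetical helper names standing in for the search, prepare and validation steps in the code:

    static bool example_prepare_entries(void);   /* search_by_entry_key() + reiserfs_prepare_for_journal() */
    static bool example_entries_unmoved(void);   /* item_moved() and entry_points_to_object() checks */
    static void example_restore_buffers(void);   /* reiserfs_restore_prepared_buffer() on each bh */

    static int example_rename_retry(void)
    {
            for (;;) {
                    if (!example_prepare_entries())
                            return -EIO;            /* hard failure, caller unwinds */
                    if (example_entries_unmoved())
                            return 0;               /* safe to flip the entries in one go */
                    example_restore_buffers();      /* raced with a tree balance, try again */
            }
    }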
- */ - if (item_moved(&new_entry_ih, &new_entry_path) || - !entry_points_to_object(new_dentry->d_name.name, - new_dentry->d_name.len, - &new_de, new_dentry_inode) || - item_moved(&old_entry_ih, &old_entry_path) || - !entry_points_to_object(old_dentry->d_name.name, - old_dentry->d_name.len, - &old_de, old_inode)) { - reiserfs_restore_prepared_buffer(old_inode->i_sb, - new_de.de_bh); - reiserfs_restore_prepared_buffer(old_inode->i_sb, - old_de.de_bh); - if (update_dir_parent) - reiserfs_restore_prepared_buffer(old_inode-> - i_sb, - dot_dot_de. - de_bh); - continue; - } - if (update_dir_parent) { - if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || - !entry_points_to_object("..", 2, &dot_dot_de, - old_dir)) { - reiserfs_restore_prepared_buffer(old_inode-> - i_sb, - old_de.de_bh); - reiserfs_restore_prepared_buffer(old_inode-> - i_sb, - new_de.de_bh); - reiserfs_restore_prepared_buffer(old_inode-> - i_sb, - dot_dot_de. - de_bh); - continue; - } - } - - RFALSE(update_dir_parent && - !buffer_journal_prepared(dot_dot_de.de_bh), ""); - - break; - } - - /* - * ok, all the changes can be done in one fell swoop when we - * have claimed all the buffers needed. - */ - - mark_de_visible(new_de.de_deh + new_de.de_entry_num); - set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); - journal_mark_dirty(&th, new_de.de_bh); - - mark_de_hidden(old_de.de_deh + old_de.de_entry_num); - journal_mark_dirty(&th, old_de.de_bh); - /* - * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch - * which adds ctime update of renamed object - */ - simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); - - if (new_dentry_inode) { - /* adjust link number of the victim */ - if (S_ISDIR(new_dentry_inode->i_mode)) { - clear_nlink(new_dentry_inode); - } else { - drop_nlink(new_dentry_inode); - } - savelink = new_dentry_inode->i_nlink; - } - - if (update_dir_parent) { - /* adjust ".." of renamed directory */ - set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); - journal_mark_dirty(&th, dot_dot_de.de_bh); - } - if (S_ISDIR(old_inode->i_mode)) { - /* - * there (in new_dir) was no directory, so it got new link - * (".." of renamed directory) - */ - if (!new_dentry_inode) - INC_DIR_INODE_NLINK(new_dir); - - /* old directory lost one link - ".. " of renamed directory */ - DEC_DIR_INODE_NLINK(old_dir); - } - /* - * looks like in 2.3.99pre3 brelse is atomic. - * so we can use pathrelse - */ - pathrelse(&new_entry_path); - pathrelse(&dot_dot_entry_path); - - /* - * FIXME: this reiserfs_cut_from_item's return value may screw up - * anybody, but it will panic if will not be able to find the - * entry. This needs one more clean up - */ - if (reiserfs_cut_from_item - (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL, - 0) < 0) - reiserfs_error(old_dir->i_sb, "vs-7060", - "couldn't not cut old name. 
Fsck later?"); - - old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; - - reiserfs_update_sd(&th, old_dir); - reiserfs_update_sd(&th, new_dir); - reiserfs_update_sd(&th, old_inode); - - if (new_dentry_inode) { - if (savelink == 0) - add_save_link(&th, new_dentry_inode, - 0 /* not truncate */ ); - reiserfs_update_sd(&th, new_dentry_inode); - } - - retval = journal_end(&th); - reiserfs_write_unlock(old_dir->i_sb); - return retval; -} - -static const struct inode_operations reiserfs_priv_dir_inode_operations = { - .create = reiserfs_create, - .lookup = reiserfs_lookup, - .link = reiserfs_link, - .unlink = reiserfs_unlink, - .symlink = reiserfs_symlink, - .mkdir = reiserfs_mkdir, - .rmdir = reiserfs_rmdir, - .mknod = reiserfs_mknod, - .rename = reiserfs_rename, - .setattr = reiserfs_setattr, - .permission = reiserfs_permission, - .fileattr_get = reiserfs_fileattr_get, - .fileattr_set = reiserfs_fileattr_set, -}; - -static const struct inode_operations reiserfs_priv_symlink_inode_operations = { - .get_link = page_get_link, - .setattr = reiserfs_setattr, - .permission = reiserfs_permission, -}; - -static const struct inode_operations reiserfs_priv_special_inode_operations = { - .setattr = reiserfs_setattr, - .permission = reiserfs_permission, -}; - -void reiserfs_init_priv_inode(struct inode *inode) -{ - inode->i_flags |= S_PRIVATE; - inode->i_opflags &= ~IOP_XATTR; - - if (S_ISREG(inode->i_mode)) - inode->i_op = &reiserfs_priv_file_inode_operations; - else if (S_ISDIR(inode->i_mode)) - inode->i_op = &reiserfs_priv_dir_inode_operations; - else if (S_ISLNK(inode->i_mode)) - inode->i_op = &reiserfs_priv_symlink_inode_operations; - else - inode->i_op = &reiserfs_priv_special_inode_operations; -} - -/* directories can handle most operations... */ -const struct inode_operations reiserfs_dir_inode_operations = { - .create = reiserfs_create, - .lookup = reiserfs_lookup, - .link = reiserfs_link, - .unlink = reiserfs_unlink, - .symlink = reiserfs_symlink, - .mkdir = reiserfs_mkdir, - .rmdir = reiserfs_rmdir, - .mknod = reiserfs_mknod, - .rename = reiserfs_rename, - .setattr = reiserfs_setattr, - .listxattr = reiserfs_listxattr, - .permission = reiserfs_permission, - .get_inode_acl = reiserfs_get_acl, - .set_acl = reiserfs_set_acl, - .fileattr_get = reiserfs_fileattr_get, - .fileattr_set = reiserfs_fileattr_set, -}; - -/* - * symlink operations.. same as page_symlink_inode_operations, with xattr - * stuff added - */ -const struct inode_operations reiserfs_symlink_inode_operations = { - .get_link = page_get_link, - .setattr = reiserfs_setattr, - .listxattr = reiserfs_listxattr, - .permission = reiserfs_permission, -}; - -/* - * special file operations.. just xattr/acl stuff - */ -const struct inode_operations reiserfs_special_inode_operations = { - .setattr = reiserfs_setattr, - .listxattr = reiserfs_listxattr, - .permission = reiserfs_permission, - .get_inode_acl = reiserfs_get_acl, - .set_acl = reiserfs_set_acl, -}; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c deleted file mode 100644 index 34baf5c0f265..000000000000 --- a/fs/reiserfs/objectid.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/string.h> -#include <linux/time.h> -#include <linux/uuid.h> -#include "reiserfs.h" - -/* find where objectid map starts */ -#define objectid_map(s,rs) (old_format_only (s) ? 
\ - (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ - (__le32 *)((rs) + 1)) - -#ifdef CONFIG_REISERFS_CHECK - -static void check_objectid_map(struct super_block *s, __le32 * map) -{ - if (le32_to_cpu(map[0]) != 1) - reiserfs_panic(s, "vs-15010", "map corrupted: %lx", - (long unsigned int)le32_to_cpu(map[0])); - - /* FIXME: add something else here */ -} - -#else -static void check_objectid_map(struct super_block *s, __le32 * map) -{; -} -#endif - -/* - * When we allocate objectids we allocate the first unused objectid. - * Each sequence of objectids in use (the odd sequences) is followed - * by a sequence of objectids not in use (the even sequences). We - * only need to record the last objectid in each of these sequences - * (both the odd and even sequences) in order to fully define the - * boundaries of the sequences. A consequence of allocating the first - * objectid not in use is that under most conditions this scheme is - * extremely compact. The exception is immediately after a sequence - * of operations which deletes a large number of objects of - * non-sequential objectids, and even then it will become compact - * again as soon as more objects are created. Note that many - * interesting optimizations of layout could result from complicating - * objectid assignment, but we have deferred making them for now. - */ - -/* get unique object identifier */ -__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) -{ - struct super_block *s = th->t_super; - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); - __le32 *map = objectid_map(s, rs); - __u32 unused_objectid; - - BUG_ON(!th->t_trans_id); - - check_objectid_map(s, map); - - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - /* comment needed -Hans */ - unused_objectid = le32_to_cpu(map[1]); - if (unused_objectid == U32_MAX) { - reiserfs_warning(s, "reiserfs-15100", "no more object ids"); - reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); - return 0; - } - - /* - * This incrementation allocates the first unused objectid. That - * is to say, the first entry on the objectid map is the first - * unused objectid, and by incrementing it we use it. See below - * where we check to see if we eliminated a sequence of unused - * objectids.... - */ - map[1] = cpu_to_le32(unused_objectid + 1); - - /* - * Now we check to see if we eliminated the last remaining member of - * the first even sequence (and can eliminate the sequence by - * eliminating its last objectid from oids), and can collapse the - * first two odd sequences into one sequence. If so, then the net - * result is to eliminate a pair of objectids from oids. We do this - * by shifting the entire map to the left. 
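A worked example of the compaction just described, with made-up numbers: suppose cur_size is 4 and the map reads [1, 6, 9, B], meaning ids 1..5 are in use, 6..8 are free and 9 up to B-1 are in use again. Allocation returns 6 and bumps map[1] to 7; once 7 and 8 have also been handed out, map[1] == map[2] == 9, the memmove() below removes that pair and cur_size drops to 2, leaving [1, B].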
- */ - if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { - memmove(map + 1, map + 3, - (sb_oid_cursize(rs) - 3) * sizeof(__u32)); - set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); - } - - journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); - return unused_objectid; -} - -/* makes object identifier unused */ -void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, - __u32 objectid_to_release) -{ - struct super_block *s = th->t_super; - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); - __le32 *map = objectid_map(s, rs); - int i = 0; - - BUG_ON(!th->t_trans_id); - /*return; */ - check_objectid_map(s, map); - - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); - - /* - * start at the beginning of the objectid map (i = 0) and go to - * the end of it (i = disk_sb->s_oid_cursize). Linear search is - * what we use, though it is possible that binary search would be - * more efficient after performing lots of deletions (which is - * when oids is large.) We only check even i's. - */ - while (i < sb_oid_cursize(rs)) { - if (objectid_to_release == le32_to_cpu(map[i])) { - /* This incrementation unallocates the objectid. */ - le32_add_cpu(&map[i], 1); - - /* - * Did we unallocate the last member of an - * odd sequence, and can shrink oids? - */ - if (map[i] == map[i + 1]) { - /* shrink objectid map */ - memmove(map + i, map + i + 2, - (sb_oid_cursize(rs) - i - - 2) * sizeof(__u32)); - set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); - - RFALSE(sb_oid_cursize(rs) < 2 || - sb_oid_cursize(rs) > sb_oid_maxsize(rs), - "vs-15005: objectid map corrupted cur_size == %d (max == %d)", - sb_oid_cursize(rs), sb_oid_maxsize(rs)); - } - return; - } - - if (objectid_to_release > le32_to_cpu(map[i]) && - objectid_to_release < le32_to_cpu(map[i + 1])) { - /* size of objectid map is not changed */ - if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { - le32_add_cpu(&map[i + 1], -1); - return; - } - - /* - * JDM comparing two little-endian values for - * equality -- safe - */ - /* - * objectid map must be expanded, but - * there is no space - */ - if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { - PROC_INFO_INC(s, leaked_oid); - return; - } - - /* expand the objectid map */ - memmove(map + i + 3, map + i + 1, - (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); - map[i + 1] = cpu_to_le32(objectid_to_release); - map[i + 2] = cpu_to_le32(objectid_to_release + 1); - set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); - return; - } - i += 2; - } - - reiserfs_error(s, "vs-15011", "tried to free free object id (%lu)", - (long unsigned)objectid_to_release); -} - -int reiserfs_convert_objectid_map_v1(struct super_block *s) -{ - struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); - int cur_size = sb_oid_cursize(disk_sb); - int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; - int old_max = sb_oid_maxsize(disk_sb); - struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map; - int i; - - disk_sb_v1 = - (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); - objectid_map = (__le32 *) (disk_sb_v1 + 1); - - if (cur_size > new_size) { - /* - * mark everyone used that was listed as free at - * the end of the objectid map - */ - objectid_map[new_size - 1] = objectid_map[cur_size - 1]; - set_sb_oid_cursize(disk_sb, new_size); - } - /* move the smaller objectid map past the end of the new super */ - for (i = new_size - 1; i >= 0; i--) { - objectid_map[i + (old_max - new_size)] = objectid_map[i]; - } - - /* set the max size 
so we don't overflow later */ - set_sb_oid_maxsize(disk_sb, new_size); - - /* Zero out label and generate random UUID */ - memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); - generate_random_uuid(disk_sb->s_uuid); - - /* finally, zero out the unused chunk of the new super */ - memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); - return 0; -} diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c deleted file mode 100644 index 84a194b77f19..000000000000 --- a/fs/reiserfs/prints.c +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include "reiserfs.h" -#include <linux/string.h> -#include <linux/buffer_head.h> - -#include <linux/stdarg.h> - -static char error_buf[1024]; -static char fmt_buf[1024]; -static char off_buf[80]; - -static char *reiserfs_cpu_offset(struct cpu_key *key) -{ - if (cpu_key_k_type(key) == TYPE_DIRENTRY) - sprintf(off_buf, "%llu(%llu)", - (unsigned long long) - GET_HASH_VALUE(cpu_key_k_offset(key)), - (unsigned long long) - GET_GENERATION_NUMBER(cpu_key_k_offset(key))); - else - sprintf(off_buf, "0x%Lx", - (unsigned long long)cpu_key_k_offset(key)); - return off_buf; -} - -static char *le_offset(struct reiserfs_key *key) -{ - int version; - - version = le_key_version(key); - if (le_key_k_type(version, key) == TYPE_DIRENTRY) - sprintf(off_buf, "%llu(%llu)", - (unsigned long long) - GET_HASH_VALUE(le_key_k_offset(version, key)), - (unsigned long long) - GET_GENERATION_NUMBER(le_key_k_offset(version, key))); - else - sprintf(off_buf, "0x%Lx", - (unsigned long long)le_key_k_offset(version, key)); - return off_buf; -} - -static char *cpu_type(struct cpu_key *key) -{ - if (cpu_key_k_type(key) == TYPE_STAT_DATA) - return "SD"; - if (cpu_key_k_type(key) == TYPE_DIRENTRY) - return "DIR"; - if (cpu_key_k_type(key) == TYPE_DIRECT) - return "DIRECT"; - if (cpu_key_k_type(key) == TYPE_INDIRECT) - return "IND"; - return "UNKNOWN"; -} - -static char *le_type(struct reiserfs_key *key) -{ - int version; - - version = le_key_version(key); - - if (le_key_k_type(version, key) == TYPE_STAT_DATA) - return "SD"; - if (le_key_k_type(version, key) == TYPE_DIRENTRY) - return "DIR"; - if (le_key_k_type(version, key) == TYPE_DIRECT) - return "DIRECT"; - if (le_key_k_type(version, key) == TYPE_INDIRECT) - return "IND"; - return "UNKNOWN"; -} - -/* %k */ -static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key) -{ - if (key) - return scnprintf(buf, size, "[%d %d %s %s]", - le32_to_cpu(key->k_dir_id), - le32_to_cpu(key->k_objectid), le_offset(key), - le_type(key)); - else - return scnprintf(buf, size, "[NULL]"); -} - -/* %K */ -static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key) -{ - if (key) - return scnprintf(buf, size, "[%d %d %s %s]", - key->on_disk_key.k_dir_id, - key->on_disk_key.k_objectid, - reiserfs_cpu_offset(key), cpu_type(key)); - else - return scnprintf(buf, size, "[NULL]"); -} - -static int scnprintf_de_head(char *buf, size_t size, - struct reiserfs_de_head *deh) -{ - if (deh) - return scnprintf(buf, size, - "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", - deh_offset(deh), deh_dir_id(deh), - deh_objectid(deh), deh_location(deh), - deh_state(deh)); - else - return scnprintf(buf, size, "[NULL]"); - -} - -static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih) -{ - if (ih) { - char *p = buf; - char * const end = buf + size; - - p += scnprintf(p, end - p, "%s", - (ih_version(ih) == 
KEY_FORMAT_3_6) ? - "*3.6* " : "*3.5*"); - - p += scnprintf_le_key(p, end - p, &ih->ih_key); - - p += scnprintf(p, end - p, - ", item_len %d, item_location %d, free_space(entry_count) %d", - ih_item_len(ih), ih_location(ih), - ih_free_space(ih)); - return p - buf; - } else - return scnprintf(buf, size, "[NULL]"); -} - -static int scnprintf_direntry(char *buf, size_t size, - struct reiserfs_dir_entry *de) -{ - char name[20]; - - memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); - name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; - return scnprintf(buf, size, "\"%s\"==>[%d %d]", - name, de->de_dir_id, de->de_objectid); -} - -static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh) -{ - return scnprintf(buf, size, - "level=%d, nr_items=%d, free_space=%d rdkey ", - B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); -} - -static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh) -{ - return scnprintf(buf, size, - "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bh->b_bdev, bh->b_size, - (unsigned long long)bh->b_blocknr, - atomic_read(&(bh->b_count)), - bh->b_state, bh->b_page, - buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", - buffer_dirty(bh) ? "DIRTY" : "CLEAN", - buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); -} - -static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc) -{ - return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]", - dc_block_number(dc), dc_size(dc)); -} - -static char *is_there_reiserfs_struct(char *fmt, int *what) -{ - char *k = fmt; - - while ((k = strchr(k, '%')) != NULL) { - if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || - k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { - *what = k[1]; - break; - } - k++; - } - return k; -} - -/* - * debugging reiserfs we used to print out a lot of different - * variables, like keys, item headers, buffer heads etc. Values of - * most fields matter. So it took a long time just to write - * appropriative printk. With this reiserfs_warning you can use format - * specification for complex structures like you used to do with - * printfs for integers, doubles and pointers. 
For instance, to print - * out key structure you have to write just: - * reiserfs_warning ("bad key %k", key); - * instead of - * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, - * key->k_offset, key->k_uniqueness); - */ -static DEFINE_SPINLOCK(error_lock); -static void prepare_error_buf(const char *fmt, va_list args) -{ - char *fmt1 = fmt_buf; - char *k; - char *p = error_buf; - char * const end = &error_buf[sizeof(error_buf)]; - int what; - - spin_lock(&error_lock); - - if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) { - strscpy(error_buf, "format string too long", end - error_buf); - goto out_unlock; - } - - while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { - *k = 0; - - p += vscnprintf(p, end - p, fmt1, args); - - switch (what) { - case 'k': - p += scnprintf_le_key(p, end - p, - va_arg(args, struct reiserfs_key *)); - break; - case 'K': - p += scnprintf_cpu_key(p, end - p, - va_arg(args, struct cpu_key *)); - break; - case 'h': - p += scnprintf_item_head(p, end - p, - va_arg(args, struct item_head *)); - break; - case 't': - p += scnprintf_direntry(p, end - p, - va_arg(args, struct reiserfs_dir_entry *)); - break; - case 'y': - p += scnprintf_disk_child(p, end - p, - va_arg(args, struct disk_child *)); - break; - case 'z': - p += scnprintf_block_head(p, end - p, - va_arg(args, struct buffer_head *)); - break; - case 'b': - p += scnprintf_buffer_head(p, end - p, - va_arg(args, struct buffer_head *)); - break; - case 'a': - p += scnprintf_de_head(p, end - p, - va_arg(args, struct reiserfs_de_head *)); - break; - } - - fmt1 = k + 2; - } - p += vscnprintf(p, end - p, fmt1, args); -out_unlock: - spin_unlock(&error_lock); - -} - -/* - * in addition to usual conversion specifiers this accepts reiserfs - * specific conversion specifiers: - * %k to print little endian key, - * %K to print cpu key, - * %h to print item_head, - * %t to print directory entry - * %z to print block head (arg must be struct buffer_head * - * %b to print buffer_head - */ - -#define do_reiserfs_warning(fmt)\ -{\ - va_list args;\ - va_start( args, fmt );\ - prepare_error_buf( fmt, args );\ - va_end( args );\ -} - -void __reiserfs_warning(struct super_block *sb, const char *id, - const char *function, const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - if (sb) - printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " - "%s\n", sb->s_id, id ? id : "", id ? " " : "", - function, error_buf); - else - printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", - id ? id : "", id ? " " : "", function, error_buf); -} - -/* No newline.. reiserfs_info calls can be followed by printk's */ -void reiserfs_info(struct super_block *sb, const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - if (sb) - printk(KERN_NOTICE "REISERFS (device %s): %s", - sb->s_id, error_buf); - else - printk(KERN_NOTICE "REISERFS %s:", error_buf); -} - -/* No newline.. reiserfs_printk calls can be followed by printk's */ -static void reiserfs_printk(const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - printk(error_buf); -} - -void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) 
-{ -#ifdef CONFIG_REISERFS_CHECK - do_reiserfs_warning(fmt); - if (s) - printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", - s->s_id, error_buf); - else - printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); -#endif -} - -/* - * The format: - * - * maintainer-errorid: [function-name:] message - * - * where errorid is unique to the maintainer and function-name is - * optional, is recommended, so that anyone can easily find the bug - * with a simple grep for the short to type string - * maintainer-errorid. Don't bother with reusing errorids, there are - * lots of numbers out there. - * - * Example: - * - * reiserfs_panic( - * p_sb, "reiser-29: reiserfs_new_blocknrs: " - * "one of search_start or rn(%d) is equal to MAX_B_NUM," - * "which means that we are optimizing location based on the " - * "bogus location of a temp buffer (%p).", - * rn, bh - * ); - * - * Regular panic()s sometimes clear the screen before the message can - * be read, thus the need for the while loop. - * - * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely - * ignores this scheme, and considers it pointless complexity): - * - * panics in reiserfs_fs.h have numbers from 1000 to 1999 - * super.c 2000 to 2999 - * preserve.c (unused) 3000 to 3999 - * bitmap.c 4000 to 4999 - * stree.c 5000 to 5999 - * prints.c 6000 to 6999 - * namei.c 7000 to 7999 - * fix_nodes.c 8000 to 8999 - * dir.c 9000 to 9999 - * lbalance.c 10000 to 10999 - * ibalance.c 11000 to 11999 not ready - * do_balan.c 12000 to 12999 - * inode.c 13000 to 13999 - * file.c 14000 to 14999 - * objectid.c 15000 - 15999 - * buffer.c 16000 - 16999 - * symlink.c 17000 - 17999 - * - * . */ - -void __reiserfs_panic(struct super_block *sb, const char *id, - const char *function, const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - -#ifdef CONFIG_REISERFS_CHECK - dump_stack(); -#endif - if (sb) - printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", - sb->s_id, id ? id : "", id ? " " : "", - function, error_buf); - else - printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", - id ? id : "", id ? " " : "", function, error_buf); - BUG(); -} - -void __reiserfs_error(struct super_block *sb, const char *id, - const char *function, const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - - BUG_ON(sb == NULL); - - if (reiserfs_error_panic(sb)) - __reiserfs_panic(sb, id, function, error_buf); - - if (id && id[0]) - printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", - sb->s_id, id, function, error_buf); - else - printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", - sb->s_id, function, error_buf); - - if (sb_rdonly(sb)) - return; - - reiserfs_info(sb, "Remounting filesystem read-only\n"); - sb->s_flags |= SB_RDONLY; - reiserfs_abort_journal(sb, -EIO); -} - -void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) -{ - do_reiserfs_warning(fmt); - - if (reiserfs_error_panic(sb)) { - panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, - error_buf); - } - - if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) - return; - - printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, - error_buf); - - sb->s_flags |= SB_RDONLY; - reiserfs_abort_journal(sb, errno); -} - -/* - * this prints internal nodes (4 keys/items in line) (dc_number, - * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, - * dc_size)... 
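For reference, one line of the internal-node dump described here could look like the following, with made-up block numbers; the field layout comes from the %y and %k helpers earlier in this file:

    PTR 0: [dc_number=8211, dc_size=4048] KEY 0: [1 2 0x0 SD] PTR 1: [dc_number=8212, dc_size=4008]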
- */ -static int print_internal(struct buffer_head *bh, int first, int last) -{ - struct reiserfs_key *key; - struct disk_child *dc; - int i; - int from, to; - - if (!B_IS_KEYS_LEVEL(bh)) - return 1; - - check_internal(bh); - - if (first == -1) { - from = 0; - to = B_NR_ITEMS(bh); - } else { - from = first; - to = min_t(int, last, B_NR_ITEMS(bh)); - } - - reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); - - dc = B_N_CHILD(bh, from); - reiserfs_printk("PTR %d: %y ", from, dc); - - for (i = from, key = internal_key(bh, from), dc++; i < to; - i++, key++, dc++) { - reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); - if (i && i % 4 == 0) - printk("\n"); - } - printk("\n"); - return 0; -} - -static int print_leaf(struct buffer_head *bh, int print_mode, int first, - int last) -{ - struct block_head *blkh; - struct item_head *ih; - int i, nr; - int from, to; - - if (!B_IS_ITEMS_LEVEL(bh)) - return 1; - - check_leaf(bh); - - blkh = B_BLK_HEAD(bh); - ih = item_head(bh, 0); - nr = blkh_nr_item(blkh); - - printk - ("\n===================================================================\n"); - reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); - - if (!(print_mode & PRINT_LEAF_ITEMS)) { - reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", - &(ih->ih_key), &((ih + nr - 1)->ih_key)); - return 0; - } - - if (first < 0 || first > nr - 1) - from = 0; - else - from = first; - - if (last < 0 || last > nr) - to = nr; - else - to = last; - - ih += from; - printk - ("-------------------------------------------------------------------------------\n"); - printk - ("|##| type | key | ilen | free_space | version | loc |\n"); - for (i = from; i < to; i++, ih++) { - printk - ("-------------------------------------------------------------------------------\n"); - reiserfs_printk("|%2d| %h |\n", i, ih); - if (print_mode & PRINT_LEAF_ITEMS) - op_print_item(ih, ih_item_body(bh, ih)); - } - - printk - ("===================================================================\n"); - - return 0; -} - -char *reiserfs_hashname(int code) -{ - if (code == YURA_HASH) - return "rupasov"; - if (code == TEA_HASH) - return "tea"; - if (code == R5_HASH) - return "r5"; - - return "unknown"; -} - -/* return 1 if this is not super block */ -static int print_super_block(struct buffer_head *bh) -{ - struct reiserfs_super_block *rs = - (struct reiserfs_super_block *)(bh->b_data); - int skipped, data_blocks; - char *version; - - if (is_reiserfs_3_5(rs)) { - version = "3.5"; - } else if (is_reiserfs_3_6(rs)) { - version = "3.6"; - } else if (is_reiserfs_jr(rs)) { - version = ((sb_version(rs) == REISERFS_VERSION_2) ? - "3.6" : "3.5"); - } else { - return 1; - } - - printk("%pg\'s super block is in block %llu\n", bh->b_bdev, - (unsigned long long)bh->b_blocknr); - printk("Reiserfs version %s\n", version); - printk("Block count %u\n", sb_block_count(rs)); - printk("Blocksize %d\n", sb_blocksize(rs)); - printk("Free blocks %u\n", sb_free_blocks(rs)); - /* - * FIXME: this would be confusing if - * someone stores reiserfs super block in some data block ;) -// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); - */ - skipped = bh->b_blocknr; - data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - - (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + - 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); - printk - ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" - "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), - (!is_reiserfs_jr(rs) ? 
(sb_jp_journal_size(rs) + 1) : - sb_reserved_for_journal(rs)), data_blocks); - printk("Root block %u\n", sb_root_block(rs)); - printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); - printk("Journal dev %d\n", sb_jp_journal_dev(rs)); - printk("Journal orig size %d\n", sb_jp_journal_size(rs)); - printk("FS state %d\n", sb_fs_state(rs)); - printk("Hash function \"%s\"\n", - reiserfs_hashname(sb_hash_function_code(rs))); - - printk("Tree height %d\n", sb_tree_height(rs)); - return 0; -} - -static int print_desc_block(struct buffer_head *bh) -{ - struct reiserfs_journal_desc *desc; - - if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) - return 1; - - desc = (struct reiserfs_journal_desc *)(bh->b_data); - printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", - (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), - get_desc_mount_id(desc), get_desc_trans_len(desc)); - - return 0; -} -/* ..., int print_mode, int first, int last) */ -void print_block(struct buffer_head *bh, ...) -{ - va_list args; - int mode, first, last; - - if (!bh) { - printk("print_block: buffer is NULL\n"); - return; - } - - va_start(args, bh); - - mode = va_arg(args, int); - first = va_arg(args, int); - last = va_arg(args, int); - if (print_leaf(bh, mode, first, last)) - if (print_internal(bh, first, last)) - if (print_super_block(bh)) - if (print_desc_block(bh)) - printk - ("Block %llu contains unformatted data\n", - (unsigned long long)bh->b_blocknr); - - va_end(args); -} - -static char print_tb_buf[2048]; - -/* this stores initial state of tree balance in the print_tb_buf */ -void store_print_tb(struct tree_balance *tb) -{ - int h = 0; - int i; - struct buffer_head *tbSh, *tbFh; - - if (!tb) - return; - - sprintf(print_tb_buf, "\n" - "BALANCING %d\n" - "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" - "=====================================================================\n" - "* h * S * L * R * F * FL * FR * CFL * CFR *\n", - REISERFS_SB(tb->tb_sb)->s_do_balance, - tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), - tb->tb_path->pos_in_item); - - for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { - if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= - tb->tb_path->path_length - && PATH_H_PATH_OFFSET(tb->tb_path, - h) > ILLEGAL_PATH_ELEMENT_OFFSET) { - tbSh = PATH_H_PBUFFER(tb->tb_path, h); - tbFh = PATH_H_PPARENT(tb->tb_path, h); - } else { - tbSh = NULL; - tbFh = NULL; - } - sprintf(print_tb_buf + strlen(print_tb_buf), - "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", - h, - (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), - (tbSh) ? atomic_read(&tbSh->b_count) : -1, - (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), - (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, - (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), - (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, - (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), - (tb->FL[h]) ? (long long)(tb->FL[h]-> - b_blocknr) : (-1LL), - (tb->FR[h]) ? (long long)(tb->FR[h]-> - b_blocknr) : (-1LL), - (tb->CFL[h]) ? (long long)(tb->CFL[h]-> - b_blocknr) : (-1LL), - (tb->CFR[h]) ? 
(long long)(tb->CFR[h]-> - b_blocknr) : (-1LL)); - } - - sprintf(print_tb_buf + strlen(print_tb_buf), - "=====================================================================\n" - "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" - "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", - tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], - tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], - tb->sbytes[0], tb->snum[1], tb->sbytes[1], - tb->cur_blknum, tb->lkey[0], tb->rkey[0]); - - /* this prints balance parameters for non-leaf levels */ - h = 0; - do { - h++; - sprintf(print_tb_buf + strlen(print_tb_buf), - "* %d * %4d * %2d * * %2d * * %2d *\n", - h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], - tb->blknum[h]); - } while (tb->insert_size[h]); - - sprintf(print_tb_buf + strlen(print_tb_buf), - "=====================================================================\n" - "FEB list: "); - - /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ - h = 0; - for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) - sprintf(print_tb_buf + strlen(print_tb_buf), - "%p (%llu %d)%s", tb->FEB[i], - tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> - b_blocknr : 0ULL, - tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, - (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", "); - - sprintf(print_tb_buf + strlen(print_tb_buf), - "======================== the end ====================================\n"); -} - -void print_cur_tb(char *mes) -{ - printk("%s\n%s", mes, print_tb_buf); -} - -static void check_leaf_block_head(struct buffer_head *bh) -{ - struct block_head *blkh; - int nr; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic(NULL, "vs-6010", "invalid item number %z", - bh); - if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) - reiserfs_panic(NULL, "vs-6020", "invalid free space %z", - bh); - -} - -static void check_internal_block_head(struct buffer_head *bh) -{ - if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) - reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); - - if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); - - if (B_FREE_SPACE(bh) != - bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - - DC_SIZE * (B_NR_ITEMS(bh) + 1)) - reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); - -} - -void check_leaf(struct buffer_head *bh) -{ - int i; - struct item_head *ih; - - if (!bh) - return; - check_leaf_block_head(bh); - for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) - op_check_item(ih, ih_item_body(bh, ih)); -} - -void check_internal(struct buffer_head *bh) -{ - if (!bh) - return; - check_internal_block_head(bh); -} - -void print_statistics(struct super_block *s) -{ - - /* - printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ - bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", - REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, - REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, - REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); - */ - -} diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c deleted file mode 100644 index 5c68a4a52d78..000000000000 --- a/fs/reiserfs/procfs.c +++ /dev/null @@ -1,490 +0,0 @@ -/* -*- linux-c -*- */ - -/* fs/reiserfs/procfs.c */ - -/* - * Copyright 2000 by Hans 
Reiser, licensing governed by reiserfs/README - */ - -/* proc info support a la one created by Sizif@Botik.RU for PGC */ - -#include <linux/module.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include <linux/uaccess.h> -#include "reiserfs.h" -#include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/blkdev.h> - -/* - * LOCKING: - * - * These guys are evicted from procfs as the very first step in ->kill_sb(). - * - */ - -static int show_version(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - char *format; - - if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { - format = "3.6"; - } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { - format = "3.5"; - } else { - format = "unknown"; - } - - seq_printf(m, "%s format\twith checks %s\n", format, -#if defined( CONFIG_REISERFS_CHECK ) - "on" -#else - "off" -#endif - ); - return 0; -} - -#define SF( x ) ( r -> x ) -#define SFP( x ) SF( s_proc_info_data.x ) -#define SFPL( x ) SFP( x[ level ] ) -#define SFPF( x ) SFP( scan_bitmap.x ) -#define SFPJ( x ) SFP( journal.x ) - -#define D2C( x ) le16_to_cpu( x ) -#define D4C( x ) le32_to_cpu( x ) -#define DF( x ) D2C( rs -> s_v1.x ) -#define DFL( x ) D4C( rs -> s_v1.x ) - -#define objectid_map( s, rs ) (old_format_only (s) ? \ - (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ - (__le32 *)(rs + 1)) -#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) - -#define DJF( x ) le32_to_cpu( rs -> x ) -#define DJP( x ) le32_to_cpu( jp -> x ) -#define JF( x ) ( r -> s_journal -> x ) - -static int show_super(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *r = REISERFS_SB(sb); - - seq_printf(m, "state: \t%s\n" - "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" - "gen. counter: \t%i\n" - "s_disk_reads: \t%i\n" - "s_disk_writes: \t%i\n" - "s_fix_nodes: \t%i\n" - "s_do_balance: \t%i\n" - "s_unneeded_left_neighbor: \t%i\n" - "s_good_search_by_key_reada: \t%i\n" - "s_bmaps: \t%i\n" - "s_bmaps_without_search: \t%i\n" - "s_direct2indirect: \t%i\n" - "s_indirect2direct: \t%i\n" - "\n" - "max_hash_collisions: \t%i\n" - "breads: \t%lu\n" - "bread_misses: \t%lu\n" - "search_by_key: \t%lu\n" - "search_by_key_fs_changed: \t%lu\n" - "search_by_key_restarted: \t%lu\n" - "insert_item_restarted: \t%lu\n" - "paste_into_item_restarted: \t%lu\n" - "cut_from_item_restarted: \t%lu\n" - "delete_solid_item_restarted: \t%lu\n" - "delete_item_restarted: \t%lu\n" - "leaked_oid: \t%lu\n" - "leaves_removable: \t%lu\n", - SF(s_mount_state) == REISERFS_VALID_FS ? - "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", - reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", - reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", - reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", - reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", - reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", - reiserfs_no_unhashed_relocation(sb) ? - "NO_UNHASHED_RELOCATION " : "", - reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", - reiserfs_test4(sb) ? "TEST4 " : "", - have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? - "SMALL_TAILS " : "NO_TAILS ", - replay_only(sb) ? "REPLAY_ONLY " : "", - convert_reiserfs(sb) ? 
"CONV " : "", - atomic_read(&r->s_generation_counter), - SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), - SF(s_do_balance), SF(s_unneeded_left_neighbor), - SF(s_good_search_by_key_reada), SF(s_bmaps), - SF(s_bmaps_without_search), SF(s_direct2indirect), - SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), - SFP(bread_miss), SFP(search_by_key), - SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), - SFP(insert_item_restarted), SFP(paste_into_item_restarted), - SFP(cut_from_item_restarted), - SFP(delete_solid_item_restarted), SFP(delete_item_restarted), - SFP(leaked_oid), SFP(leaves_removable)); - - return 0; -} - -static int show_per_level(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *r = REISERFS_SB(sb); - int level; - - seq_printf(m, "level\t" - " balances" - " [sbk: reads" - " fs_changed" - " restarted]" - " free space" - " items" - " can_remove" - " lnum" - " rnum" - " lbytes" - " rbytes" - " get_neig" - " get_neig_res" " need_l_neig" " need_r_neig" "\n"); - - for (level = 0; level < MAX_HEIGHT; ++level) { - seq_printf(m, "%i\t" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12li" - " %12li" - " %12li" - " %12li" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - "\n", - level, - SFPL(balance_at), - SFPL(sbk_read_at), - SFPL(sbk_fs_changed), - SFPL(sbk_restarted), - SFPL(free_at), - SFPL(items_at), - SFPL(can_node_be_removed), - SFPL(lnum), - SFPL(rnum), - SFPL(lbytes), - SFPL(rbytes), - SFPL(get_neighbors), - SFPL(get_neighbors_restart), - SFPL(need_l_neighbor), SFPL(need_r_neighbor) - ); - } - return 0; -} - -static int show_bitmap(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *r = REISERFS_SB(sb); - - seq_printf(m, "free_block: %lu\n" - " scan_bitmap:" - " wait" - " bmap" - " retry" - " stolen" - " journal_hint" - "journal_nohint" - "\n" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - "\n", - SFP(free_block), - SFPF(call), - SFPF(wait), - SFPF(bmap), - SFPF(retry), - SFPF(stolen), - SFPF(in_journal_hint), SFPF(in_journal_nohint)); - - return 0; -} - -static int show_on_disk_super(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); - struct reiserfs_super_block *rs = sb_info->s_rs; - int hash_code = DFL(s_hash_function_code); - __u32 flags = DJF(s_flags); - - seq_printf(m, "block_count: \t%i\n" - "free_blocks: \t%i\n" - "root_block: \t%i\n" - "blocksize: \t%i\n" - "oid_maxsize: \t%i\n" - "oid_cursize: \t%i\n" - "umount_state: \t%i\n" - "magic: \t%10.10s\n" - "fs_state: \t%i\n" - "hash: \t%s\n" - "tree_height: \t%i\n" - "bmap_nr: \t%i\n" - "version: \t%i\n" - "flags: \t%x[%s]\n" - "reserved_for_journal: \t%i\n", - DFL(s_block_count), - DFL(s_free_blocks), - DFL(s_root_block), - DF(s_blocksize), - DF(s_oid_maxsize), - DF(s_oid_cursize), - DF(s_umount_state), - rs->s_v1.s_magic, - DF(s_fs_state), - hash_code == TEA_HASH ? "tea" : - (hash_code == YURA_HASH) ? "rupasov" : - (hash_code == R5_HASH) ? "r5" : - (hash_code == UNSET_HASH) ? "unset" : "unknown", - DF(s_tree_height), - DF(s_bmap_nr), - DF(s_version), flags, (flags & reiserfs_attrs_cleared) - ? 
"attrs_cleared" : "", DF(s_reserved_for_journal)); - - return 0; -} - -static int show_oidmap(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); - struct reiserfs_super_block *rs = sb_info->s_rs; - unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); - unsigned long total_used = 0; - int i; - - for (i = 0; i < mapsize; ++i) { - __u32 right; - - right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); - seq_printf(m, "%s: [ %x .. %x )\n", - (i & 1) ? "free" : "used", MAP(i), right); - if (!(i & 1)) { - total_used += right - MAP(i); - } - } -#if defined( REISERFS_USE_OIDMAPF ) - if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { - loff_t size = file_inode(sb_info->oidmap.mapf)->i_size; - total_used += size / sizeof(reiserfs_oidinterval_d_t); - } -#endif - seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", - mapsize, - mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); - return 0; -} - -static time64_t ktime_mono_to_real_seconds(time64_t mono) -{ - ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2); - - return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC); -} - -static int show_journal(struct seq_file *m, void *unused) -{ - struct super_block *sb = m->private; - struct reiserfs_sb_info *r = REISERFS_SB(sb); - struct reiserfs_super_block *rs = r->s_rs; - struct journal_params *jp = &rs->s_v1.s_journal; - - seq_printf(m, /* on-disk fields */ - "jp_journal_1st_block: \t%i\n" - "jp_journal_dev: \t%pg[%x]\n" - "jp_journal_size: \t%i\n" - "jp_journal_trans_max: \t%i\n" - "jp_journal_magic: \t%i\n" - "jp_journal_max_batch: \t%i\n" - "jp_journal_max_commit_age: \t%i\n" - "jp_journal_max_trans_age: \t%i\n" - /* incore fields */ - "j_1st_reserved_block: \t%i\n" - "j_state: \t%li\n" - "j_trans_id: \t%u\n" - "j_mount_id: \t%lu\n" - "j_start: \t%lu\n" - "j_len: \t%lu\n" - "j_len_alloc: \t%lu\n" - "j_wcount: \t%i\n" - "j_bcount: \t%lu\n" - "j_first_unflushed_offset: \t%lu\n" - "j_last_flush_trans_id: \t%u\n" - "j_trans_start_time: \t%lli\n" - "j_list_bitmap_index: \t%i\n" - "j_must_wait: \t%i\n" - "j_next_full_flush: \t%i\n" - "j_next_async_flush: \t%i\n" - "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" - /* reiserfs_proc_info_data_t.journal fields */ - "in_journal: \t%12lu\n" - "in_journal_bitmap: \t%12lu\n" - "in_journal_reusable: \t%12lu\n" - "lock_journal: \t%12lu\n" - "lock_journal_wait: \t%12lu\n" - "journal_begin: \t%12lu\n" - "journal_relock_writers: \t%12lu\n" - "journal_relock_wcount: \t%12lu\n" - "mark_dirty: \t%12lu\n" - "mark_dirty_already: \t%12lu\n" - "mark_dirty_notjournal: \t%12lu\n" - "restore_prepared: \t%12lu\n" - "prepare: \t%12lu\n" - "prepare_retry: \t%12lu\n", - DJP(jp_journal_1st_block), - file_bdev(SB_JOURNAL(sb)->j_bdev_file), - DJP(jp_journal_dev), - DJP(jp_journal_size), - DJP(jp_journal_trans_max), - DJP(jp_journal_magic), - DJP(jp_journal_max_batch), - SB_JOURNAL(sb)->j_max_commit_age, - DJP(jp_journal_max_trans_age), - JF(j_1st_reserved_block), - JF(j_state), - JF(j_trans_id), - JF(j_mount_id), - JF(j_start), - JF(j_len), - JF(j_len_alloc), - atomic_read(&r->s_journal->j_wcount), - JF(j_bcount), - JF(j_first_unflushed_offset), - JF(j_last_flush_trans_id), - ktime_mono_to_real_seconds(JF(j_trans_start_time)), - JF(j_list_bitmap_index), - JF(j_must_wait), - JF(j_next_full_flush), - JF(j_next_async_flush), - JF(j_cnode_used), - JF(j_cnode_free), - SFPJ(in_journal), - SFPJ(in_journal_bitmap), - SFPJ(in_journal_reusable), - SFPJ(lock_journal), - 
SFPJ(lock_journal_wait), - SFPJ(journal_being), - SFPJ(journal_relock_writers), - SFPJ(journal_relock_wcount), - SFPJ(mark_dirty), - SFPJ(mark_dirty_already), - SFPJ(mark_dirty_notjournal), - SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry) - ); - return 0; -} - -static struct proc_dir_entry *proc_info_root = NULL; -static const char proc_info_root_name[] = "fs/reiserfs"; - -static void add_file(struct super_block *sb, char *name, - int (*func) (struct seq_file *, void *)) -{ - proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); -} - -int reiserfs_proc_info_init(struct super_block *sb) -{ - char b[BDEVNAME_SIZE]; - char *s; - - /* Some block devices use /'s */ - strscpy(b, sb->s_id, BDEVNAME_SIZE); - s = strchr(b, '/'); - if (s) - *s = '!'; - - spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); - if (REISERFS_SB(sb)->procdir) { - add_file(sb, "version", show_version); - add_file(sb, "super", show_super); - add_file(sb, "per-level", show_per_level); - add_file(sb, "bitmap", show_bitmap); - add_file(sb, "on-disk-super", show_on_disk_super); - add_file(sb, "oidmap", show_oidmap); - add_file(sb, "journal", show_journal); - return 0; - } - reiserfs_warning(sb, "cannot create /proc/%s/%s", - proc_info_root_name, b); - return 1; -} - -int reiserfs_proc_info_done(struct super_block *sb) -{ - struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; - if (de) { - char b[BDEVNAME_SIZE]; - char *s; - - /* Some block devices use /'s */ - strscpy(b, sb->s_id, BDEVNAME_SIZE); - s = strchr(b, '/'); - if (s) - *s = '!'; - - remove_proc_subtree(b, proc_info_root); - REISERFS_SB(sb)->procdir = NULL; - } - return 0; -} - -int reiserfs_proc_info_global_init(void) -{ - if (proc_info_root == NULL) { - proc_info_root = proc_mkdir(proc_info_root_name, NULL); - if (!proc_info_root) { - reiserfs_warning(NULL, "cannot create /proc/%s", - proc_info_root_name); - return 1; - } - } - return 0; -} - -int reiserfs_proc_info_global_done(void) -{ - if (proc_info_root != NULL) { - proc_info_root = NULL; - remove_proc_entry(proc_info_root_name, NULL); - } - return 0; -} -/* - * Revision 1.1.8.2 2001/07/15 17:08:42 god - * . use get_super() in procfs.c - * . 
remove remove_save_link() from reiserfs_do_truncate() - * - * I accept terms and conditions stated in the Legal Agreement - * (available at http://www.namesys.com/legalese.html) - * - * Revision 1.1.8.1 2001/07/11 16:48:50 god - * proc info support - * - * I accept terms and conditions stated in the Legal Agreement - * (available at http://www.namesys.com/legalese.html) - * - */ diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h deleted file mode 100644 index 12fc20af8e17..000000000000 --- a/fs/reiserfs/reiserfs.h +++ /dev/null @@ -1,3419 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for - * licensing and copyright details - */ - -#include <linux/reiserfs_fs.h> - -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <linux/bug.h> -#include <linux/workqueue.h> -#include <linux/unaligned.h> -#include <linux/bitops.h> -#include <linux/proc_fs.h> -#include <linux/buffer_head.h> - -/* the 32 bit compat definitions with int argument */ -#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) -#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION -#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION - -struct reiserfs_journal_list; - -/* bitmasks for i_flags field in reiserfs-specific part of inode */ -typedef enum { - /* - * this says what format of key do all items (but stat data) of - * an object have. If this is set, that format is 3.6 otherwise - 3.5 - */ - i_item_key_version_mask = 0x0001, - - /* - * If this is unset, object has 3.5 stat data, otherwise, - * it has 3.6 stat data with 64bit size, 32bit nlink etc. - */ - i_stat_data_version_mask = 0x0002, - - /* file might need tail packing on close */ - i_pack_on_close_mask = 0x0004, - - /* don't pack tail of file */ - i_nopack_mask = 0x0008, - - /* - * If either of these are set, "safe link" was created for this - * file during truncate or unlink. Safe link is used to avoid - * leakage of disk space on crash with some files open, but unlinked. - */ - i_link_saved_unlink_mask = 0x0010, - i_link_saved_truncate_mask = 0x0020, - - i_has_xattr_dir = 0x0040, - i_data_log = 0x0080, -} reiserfs_inode_flags; - -struct reiserfs_inode_info { - __u32 i_key[4]; /* key is still 4 32 bit integers */ - - /* - * transient inode flags that are never stored on disk. Bitmasks - * for this field are defined above. - */ - __u32 i_flags; - - /* offset of first byte stored in direct item. */ - __u32 i_first_direct_byte; - - /* copy of persistent inode flags read from sd_attrs. 
*/ - __u32 i_attrs; - - /* first unused block of a sequence of unused blocks */ - int i_prealloc_block; - int i_prealloc_count; /* length of that sequence */ - - /* per-transaction list of inodes which have preallocated blocks */ - struct list_head i_prealloc_list; - - /* - * new_packing_locality is created; new blocks for the contents - * of this directory should be displaced - */ - unsigned new_packing_locality:1; - - /* - * we use these for fsync or O_SYNC to decide which transaction - * needs to be committed in order for this inode to be properly - * flushed - */ - unsigned int i_trans_id; - - struct reiserfs_journal_list *i_jl; - atomic_t openers; - struct mutex tailpack; -#ifdef CONFIG_REISERFS_FS_XATTR - struct rw_semaphore i_xattr_sem; -#endif -#ifdef CONFIG_QUOTA - struct dquot __rcu *i_dquot[MAXQUOTAS]; -#endif - - struct inode vfs_inode; -}; - -typedef enum { - reiserfs_attrs_cleared = 0x00000001, -} reiserfs_super_block_flags; - -/* - * struct reiserfs_super_block accessors/mutators since this is a disk - * structure, it will always be in little endian format. - */ -#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) -#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) -#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks)) -#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v)) -#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block)) -#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v)) - -#define sb_jp_journal_1st_block(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block)) -#define set_sb_jp_journal_1st_block(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v)) -#define sb_jp_journal_dev(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev)) -#define set_sb_jp_journal_dev(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v)) -#define sb_jp_journal_size(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size)) -#define set_sb_jp_journal_size(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v)) -#define sb_jp_journal_trans_max(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max)) -#define set_sb_jp_journal_trans_max(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v)) -#define sb_jp_journal_magic(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic)) -#define set_sb_jp_journal_magic(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v)) -#define sb_jp_journal_max_batch(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch)) -#define set_sb_jp_journal_max_batch(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v)) -#define sb_jp_jourmal_max_commit_age(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age)) -#define set_sb_jp_journal_max_commit_age(sbp,v) \ - ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v)) - -#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize)) -#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v)) -#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize)) -#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v)) -#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize)) -#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v)) -#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state)) -#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = 
cpu_to_le16(v)) -#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state)) -#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v)) -#define sb_hash_function_code(sbp) \ - (le32_to_cpu((sbp)->s_v1.s_hash_function_code)) -#define set_sb_hash_function_code(sbp,v) \ - ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v)) -#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height)) -#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v)) -#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr)) -#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v)) -#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version)) -#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v)) - -#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count)) -#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v)) - -#define sb_reserved_for_journal(sbp) \ - (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal)) -#define set_sb_reserved_for_journal(sbp,v) \ - ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v)) - -/* LOGGING -- */ - -/* - * These all interelate for performance. - * - * If the journal block count is smaller than n transactions, you lose speed. - * I don't know what n is yet, I'm guessing 8-16. - * - * typical transaction size depends on the application, how often fsync is - * called, and how many metadata blocks you dirty in a 30 second period. - * The more small files (<16k) you use, the larger your transactions will - * be. - * - * If your journal fills faster than dirty buffers get flushed to disk, it - * must flush them before allowing the journal to wrap, which slows things - * down. If you need high speed meta data updates, the journal should be - * big enough to prevent wrapping before dirty meta blocks get to disk. - * - * If the batch max is smaller than the transaction max, you'll waste space - * at the end of the journal because journal_end sets the next transaction - * to start at 0 if the next transaction has any chance of wrapping. - * - * The large the batch max age, the better the speed, and the more meta - * data changes you'll lose after a crash. - */ - -/* don't mess with these for a while */ -/* we have a node size define somewhere in reiserfs_fs.h. -Hans */ -#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */ -#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ -#define JOURNAL_HASH_SIZE 8192 - -/* number of copies of the bitmaps to have floating. Must be >= 2 */ -#define JOURNAL_NUM_BITMAPS 5 - -/* - * One of these for every block in every transaction - * Each one is in two hash tables. First, a hash of the current transaction, - * and after journal_end, a hash of all the in memory transactions. - * next and prev are used by the current transaction (journal_hash). - * hnext and hprev are used by journal_list_hash. If a block is in more - * than one transaction, the journal_list_hash links it in multiple times. - * This allows flush_journal_list to remove just the cnode belonging to a - * given transaction. 
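[Editor's note: the double-hashing of journal cnodes described above can be illustrated with a small, self-contained sketch. This is not the kernel's journal.c; the single hash table, the helper names and the modulo hash are assumptions made purely for illustration, while the real code keeps two tables (per-transaction and per-journal-list) linked through next/prev and hnext/hprev.]

#include <stdio.h>

#define CNODE_HASH_SIZE 8192	/* mirrors JOURNAL_HASH_SIZE above */

struct example_cnode {
	unsigned long blocknr;		/* block number of the real buffer head */
	struct example_cnode *hnext;	/* next cnode in this hash chain */
};

static struct example_cnode *cnode_hash[CNODE_HASH_SIZE];

static unsigned int cnode_hash_index(unsigned long blocknr)
{
	return blocknr % CNODE_HASH_SIZE;
}

/* link a cnode at the head of its chain, as a transaction would on commit */
static void cnode_insert(struct example_cnode *cn)
{
	unsigned int h = cnode_hash_index(cn->blocknr);

	cn->hnext = cnode_hash[h];
	cnode_hash[h] = cn;
}

/* look a block up; a hit means the block still lives in some transaction */
static struct example_cnode *cnode_find(unsigned long blocknr)
{
	struct example_cnode *cn;

	for (cn = cnode_hash[cnode_hash_index(blocknr)]; cn; cn = cn->hnext)
		if (cn->blocknr == blocknr)
			return cn;
	return NULL;
}

int main(void)
{
	struct example_cnode a = { .blocknr = 42 };
	struct example_cnode b = { .blocknr = 42 + CNODE_HASH_SIZE };

	cnode_insert(&a);
	cnode_insert(&b);	/* collides with a, chained via hnext */
	printf("block %lu is %sjournalled\n", a.blocknr,
	       cnode_find(42) ? "" : "not ");
	return 0;
}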
- */ -struct reiserfs_journal_cnode { - struct buffer_head *bh; /* real buffer head */ - struct super_block *sb; /* dev of real buffer head */ - - /* block number of real buffer head, == 0 when buffer on disk */ - __u32 blocknr; - - unsigned long state; - - /* journal list this cnode lives in */ - struct reiserfs_journal_list *jlist; - - struct reiserfs_journal_cnode *next; /* next in transaction list */ - struct reiserfs_journal_cnode *prev; /* prev in transaction list */ - struct reiserfs_journal_cnode *hprev; /* prev in hash list */ - struct reiserfs_journal_cnode *hnext; /* next in hash list */ -}; - -struct reiserfs_bitmap_node { - int id; - char *data; - struct list_head list; -}; - -struct reiserfs_list_bitmap { - struct reiserfs_journal_list *journal_list; - struct reiserfs_bitmap_node **bitmaps; -}; - -/* - * one of these for each transaction. The most important part here is the - * j_realblock. this list of cnodes is used to hash all the blocks in all - * the commits, to mark all the real buffer heads dirty once all the commits - * hit the disk, and to make sure every real block in a transaction is on - * disk before allowing the log area to be overwritten - */ -struct reiserfs_journal_list { - unsigned long j_start; - unsigned long j_state; - unsigned long j_len; - atomic_t j_nonzerolen; - atomic_t j_commit_left; - - /* all commits older than this on disk */ - atomic_t j_older_commits_done; - - struct mutex j_commit_mutex; - unsigned int j_trans_id; - time64_t j_timestamp; /* write-only but useful for crash dump analysis */ - struct reiserfs_list_bitmap *j_list_bitmap; - struct buffer_head *j_commit_bh; /* commit buffer head */ - struct reiserfs_journal_cnode *j_realblock; - struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */ - /* time ordered list of all active transactions */ - struct list_head j_list; - - /* - * time ordered list of all transactions we haven't tried - * to flush yet - */ - struct list_head j_working_list; - - /* list of tail conversion targets in need of flush before commit */ - struct list_head j_tail_bh_list; - - /* list of data=ordered buffers in need of flush before commit */ - struct list_head j_bh_list; - int j_refcount; -}; - -struct reiserfs_journal { - struct buffer_head **j_ap_blocks; /* journal blocks on disk */ - /* newest journal block */ - struct reiserfs_journal_cnode *j_last; - - /* oldest journal block. start here for traverse */ - struct reiserfs_journal_cnode *j_first; - - struct file *j_bdev_file; - - /* first block on s_dev of reserved area journal */ - int j_1st_reserved_block; - - unsigned long j_state; - unsigned int j_trans_id; - unsigned long j_mount_id; - - /* start of current waiting commit (index into j_ap_blocks) */ - unsigned long j_start; - unsigned long j_len; /* length of current waiting commit */ - - /* number of buffers requested by journal_begin() */ - unsigned long j_len_alloc; - - atomic_t j_wcount; /* count of writers for current commit */ - - /* batch count. 
allows turning X transactions into 1 */ - unsigned long j_bcount; - - /* first unflushed transactions offset */ - unsigned long j_first_unflushed_offset; - - /* last fully flushed journal timestamp */ - unsigned j_last_flush_trans_id; - - struct buffer_head *j_header_bh; - - time64_t j_trans_start_time; /* time this transaction started */ - struct mutex j_mutex; - struct mutex j_flush_mutex; - - /* wait for current transaction to finish before starting new one */ - wait_queue_head_t j_join_wait; - - atomic_t j_jlock; /* lock for j_join_wait */ - int j_list_bitmap_index; /* number of next list bitmap to use */ - - /* no more journal begins allowed. MUST sleep on j_join_wait */ - int j_must_wait; - - /* next journal_end will flush all journal list */ - int j_next_full_flush; - - /* next journal_end will flush all async commits */ - int j_next_async_flush; - - int j_cnode_used; /* number of cnodes on the used list */ - int j_cnode_free; /* number of cnodes on the free list */ - - /* max number of blocks in a transaction. */ - unsigned int j_trans_max; - - /* max number of blocks to batch into a trans */ - unsigned int j_max_batch; - - /* in seconds, how old can an async commit be */ - unsigned int j_max_commit_age; - - /* in seconds, how old can a transaction be */ - unsigned int j_max_trans_age; - - /* the default for the max commit age */ - unsigned int j_default_max_commit_age; - - struct reiserfs_journal_cnode *j_cnode_free_list; - - /* orig pointer returned from vmalloc */ - struct reiserfs_journal_cnode *j_cnode_free_orig; - - struct reiserfs_journal_list *j_current_jl; - int j_free_bitmap_nodes; - int j_used_bitmap_nodes; - - int j_num_lists; /* total number of active transactions */ - int j_num_work_lists; /* number that need attention from kreiserfsd */ - - /* debugging to make sure things are flushed in order */ - unsigned int j_last_flush_id; - - /* debugging to make sure things are committed in order */ - unsigned int j_last_commit_id; - - struct list_head j_bitmap_nodes; - struct list_head j_dirty_buffers; - spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */ - - /* list of all active transactions */ - struct list_head j_journal_list; - - /* lists that haven't been touched by writeback attempts */ - struct list_head j_working_list; - - /* hash table for real buffer heads in current trans */ - struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; - - /* hash table for all the real buffer heads in all the transactions */ - struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; - - /* array of bitmaps to record the deleted blocks */ - struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; - - /* list of inodes which have preallocated blocks */ - struct list_head j_prealloc_list; - int j_persistent_trans; - unsigned long j_max_trans_size; - unsigned long j_max_batch_size; - - int j_errno; - - /* when flushing ordered buffers, throttle new ordered writers */ - struct delayed_work j_work; - struct super_block *j_work_sb; - atomic_t j_async_throttle; -}; - -enum journal_state_bits { - J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */ - J_WRITERS_QUEUED, /* set when log is full due to too many writers */ - J_ABORTED, /* set when log is aborted */ -}; - -/* ick. 
magic string to find desc blocks in the journal */ -#define JOURNAL_DESC_MAGIC "ReIsErLB" - -typedef __u32(*hashf_t) (const signed char *, int); - -struct reiserfs_bitmap_info { - __u32 free_count; -}; - -struct proc_dir_entry; - -#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO ) -typedef unsigned long int stat_cnt_t; -typedef struct reiserfs_proc_info_data { - spinlock_t lock; - int exiting; - int max_hash_collisions; - - stat_cnt_t breads; - stat_cnt_t bread_miss; - stat_cnt_t search_by_key; - stat_cnt_t search_by_key_fs_changed; - stat_cnt_t search_by_key_restarted; - - stat_cnt_t insert_item_restarted; - stat_cnt_t paste_into_item_restarted; - stat_cnt_t cut_from_item_restarted; - stat_cnt_t delete_solid_item_restarted; - stat_cnt_t delete_item_restarted; - - stat_cnt_t leaked_oid; - stat_cnt_t leaves_removable; - - /* - * balances per level. - * Use explicit 5 as MAX_HEIGHT is not visible yet. - */ - stat_cnt_t balance_at[5]; /* XXX */ - /* sbk == search_by_key */ - stat_cnt_t sbk_read_at[5]; /* XXX */ - stat_cnt_t sbk_fs_changed[5]; - stat_cnt_t sbk_restarted[5]; - stat_cnt_t items_at[5]; /* XXX */ - stat_cnt_t free_at[5]; /* XXX */ - stat_cnt_t can_node_be_removed[5]; /* XXX */ - long int lnum[5]; /* XXX */ - long int rnum[5]; /* XXX */ - long int lbytes[5]; /* XXX */ - long int rbytes[5]; /* XXX */ - stat_cnt_t get_neighbors[5]; - stat_cnt_t get_neighbors_restart[5]; - stat_cnt_t need_l_neighbor[5]; - stat_cnt_t need_r_neighbor[5]; - - stat_cnt_t free_block; - struct __scan_bitmap_stats { - stat_cnt_t call; - stat_cnt_t wait; - stat_cnt_t bmap; - stat_cnt_t retry; - stat_cnt_t in_journal_hint; - stat_cnt_t in_journal_nohint; - stat_cnt_t stolen; - } scan_bitmap; - struct __journal_stats { - stat_cnt_t in_journal; - stat_cnt_t in_journal_bitmap; - stat_cnt_t in_journal_reusable; - stat_cnt_t lock_journal; - stat_cnt_t lock_journal_wait; - stat_cnt_t journal_being; - stat_cnt_t journal_relock_writers; - stat_cnt_t journal_relock_wcount; - stat_cnt_t mark_dirty; - stat_cnt_t mark_dirty_already; - stat_cnt_t mark_dirty_notjournal; - stat_cnt_t restore_prepared; - stat_cnt_t prepare; - stat_cnt_t prepare_retry; - } journal; -} reiserfs_proc_info_data_t; -#else -typedef struct reiserfs_proc_info_data { -} reiserfs_proc_info_data_t; -#endif - -/* Number of quota types we support */ -#define REISERFS_MAXQUOTAS 2 - -/* reiserfs union of in-core super block data */ -struct reiserfs_sb_info { - /* Buffer containing the super block */ - struct buffer_head *s_sbh; - - /* Pointer to the on-disk super block in the buffer */ - struct reiserfs_super_block *s_rs; - struct reiserfs_bitmap_info *s_ap_bitmap; - - /* pointer to journal information */ - struct reiserfs_journal *s_journal; - - unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ - - /* Serialize writers access, replace the old bkl */ - struct mutex lock; - - /* Owner of the lock (can be recursive) */ - struct task_struct *lock_owner; - - /* Depth of the lock, start from -1 like the bkl */ - int lock_depth; - - struct workqueue_struct *commit_wq; - - /* Comment? -Hans */ - void (*end_io_handler) (struct buffer_head *, int); - - /* - * pointer to function which is used to sort names in directory. 
- * Set on mount - */ - hashf_t s_hash_function; - - /* reiserfs's mount options are set here */ - unsigned long s_mount_opt; - - /* This is a structure that describes block allocator options */ - struct { - /* Bitfield for enable/disable kind of options */ - unsigned long bits; - - /* - * size started from which we consider file - * to be a large one (in blocks) - */ - unsigned long large_file_size; - - int border; /* percentage of disk, border takes */ - - /* - * Minimal file size (in blocks) starting - * from which we do preallocations - */ - int preallocmin; - - /* - * Number of blocks we try to prealloc when file - * reaches preallocmin size (in blocks) or prealloc_list - is empty. - */ - int preallocsize; - } s_alloc_options; - - /* Comment? -Hans */ - wait_queue_head_t s_wait; - /* increased by one every time the tree gets re-balanced */ - atomic_t s_generation_counter; - - /* File system properties. Currently holds on-disk FS format */ - unsigned long s_properties; - - /* session statistics */ - int s_disk_reads; - int s_disk_writes; - int s_fix_nodes; - int s_do_balance; - int s_unneeded_left_neighbor; - int s_good_search_by_key_reada; - int s_bmaps; - int s_bmaps_without_search; - int s_direct2indirect; - int s_indirect2direct; - - /* - * set up when it's ok for reiserfs_read_inode2() to read from - * disk inode with nlink==0. Currently this is only used during - * finish_unfinished() processing at mount time - */ - int s_is_unlinked_ok; - - reiserfs_proc_info_data_t s_proc_info_data; - struct proc_dir_entry *procdir; - - /* amount of blocks reserved for further allocations */ - int reserved_blocks; - - - /* this lock on now only used to protect reserved_blocks variable */ - spinlock_t bitmap_lock; - struct dentry *priv_root; /* root of /.reiserfs_priv */ - struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ - int j_errno; - - int work_queued; /* non-zero delayed work is queued */ - struct delayed_work old_work; /* old transactions flush delayed work */ - spinlock_t old_work_lock; /* protects old_work and work_queued */ - -#ifdef CONFIG_QUOTA - char *s_qf_names[REISERFS_MAXQUOTAS]; - int s_jquota_fmt; -#endif - char *s_jdev; /* Stored jdev for mount option showing */ -#ifdef CONFIG_REISERFS_CHECK - - /* - * Detects whether more than one copy of tb exists per superblock - * as a means of checking whether do_balance is executing - * concurrently against another tree reader/writer on a same - * mount point. - */ - struct tree_balance *cur_tb; -#endif -}; - -/* Definitions of reiserfs on-disk properties: */ -#define REISERFS_3_5 0 -#define REISERFS_3_6 1 -#define REISERFS_OLD_FORMAT 2 - -/* Mount options */ -enum reiserfs_mount_options { - /* large tails will be created in a session */ - REISERFS_LARGETAIL, - /* - * small (for files less than block size) tails will - * be created in a session - */ - REISERFS_SMALLTAIL, - - /* replay journal and return 0. Use by fsck */ - REPLAYONLY, - - /* - * -o conv: causes conversion of old format super block to the - * new format. If not specified - old partition will be dealt - * with in a manner of 3.5.x - */ - REISERFS_CONVERT, - - /* - * -o hash={tea, rupasov, r5, detect} is meant for properly mounting - * reiserfs disks from 3.5.19 or earlier. 99% of the time, this - * option is not required. If the normal autodection code can't - * determine which hash to use (because both hashes had the same - * value for a file) use this option to force a specific hash. 
- * It won't allow you to override the existing hash on the FS, so - * if you have a tea hash disk, and mount with -o hash=rupasov, - * the mount will fail. - */ - FORCE_TEA_HASH, /* try to force tea hash on mount */ - FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */ - FORCE_R5_HASH, /* try to force rupasov hash on mount */ - FORCE_HASH_DETECT, /* try to detect hash function on mount */ - - REISERFS_DATA_LOG, - REISERFS_DATA_ORDERED, - REISERFS_DATA_WRITEBACK, - - /* - * used for testing experimental features, makes benchmarking new - * features with and without more convenient, should never be used by - * users in any code shipped to users (ideally) - */ - - REISERFS_NO_BORDER, - REISERFS_NO_UNHASHED_RELOCATION, - REISERFS_HASHED_RELOCATION, - REISERFS_ATTRS, - REISERFS_XATTRS_USER, - REISERFS_POSIXACL, - REISERFS_EXPOSE_PRIVROOT, - REISERFS_BARRIER_NONE, - REISERFS_BARRIER_FLUSH, - - /* Actions on error */ - REISERFS_ERROR_PANIC, - REISERFS_ERROR_RO, - REISERFS_ERROR_CONTINUE, - - REISERFS_USRQUOTA, /* User quota option specified */ - REISERFS_GRPQUOTA, /* Group quota option specified */ - - REISERFS_TEST1, - REISERFS_TEST2, - REISERFS_TEST3, - REISERFS_TEST4, - REISERFS_UNSUPPORTED_OPT, -}; - -#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH)) -#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH)) -#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH)) -#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT)) -#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER)) -#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION)) -#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION)) -#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4)) - -#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL)) -#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL)) -#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY)) -#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS)) -#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5)) -#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT)) -#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) -#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) -#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) -#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) -#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) -#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) -#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) -#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) -#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) - -#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC)) -#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO)) - -void reiserfs_file_buffer(struct buffer_head *bh, int list); 
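[Editor's note: the mount-option macros above all follow one shape: a single unsigned long bit field (s_mount_opt) tested with (1 << option). A minimal stand-alone sketch of that pattern follows; the option names and helper macros are made up for illustration and are not the real REISERFS_* enumerators.]

#include <stdio.h>

/* hypothetical option bits, standing in for enum reiserfs_mount_options */
enum example_mount_options {
	EX_LARGETAIL,
	EX_DATA_ORDERED,
	EX_POSIXACL,
};

struct example_sb_info {
	unsigned long s_mount_opt;	/* one bit per option, as in reiserfs_sb_info */
};

#define ex_opt_set(sbi, opt)	((sbi)->s_mount_opt |= (1UL << (opt)))
#define ex_opt_clear(sbi, opt)	((sbi)->s_mount_opt &= ~(1UL << (opt)))
#define ex_opt_test(sbi, opt)	((sbi)->s_mount_opt & (1UL << (opt)))

int main(void)
{
	struct example_sb_info sbi = { 0 };

	ex_opt_set(&sbi, EX_DATA_ORDERED);
	printf("ordered: %s, acl: %s\n",
	       ex_opt_test(&sbi, EX_DATA_ORDERED) ? "on" : "off",
	       ex_opt_test(&sbi, EX_POSIXACL) ? "on" : "off");
	return 0;
}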
-extern struct file_system_type reiserfs_fs_type; -int reiserfs_resize(struct super_block *, unsigned long); - -#define CARRY_ON 0 -#define SCHEDULE_OCCURRED 1 - -#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh) -#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal) -#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block) -#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) -#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap) - -#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) - -#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) -static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal - *journal) -{ - return test_bit(J_ABORTED, &journal->j_state); -} - -/* - * Locking primitives. The write lock is a per superblock - * special mutex that has properties close to the Big Kernel Lock - * which was used in the previous locking scheme. - */ -void reiserfs_write_lock(struct super_block *s); -void reiserfs_write_unlock(struct super_block *s); -int __must_check reiserfs_write_unlock_nested(struct super_block *s); -void reiserfs_write_lock_nested(struct super_block *s, int depth); - -#ifdef CONFIG_REISERFS_CHECK -void reiserfs_lock_check_recursive(struct super_block *s); -#else -static inline void reiserfs_lock_check_recursive(struct super_block *s) { } -#endif - -/* - * Several mutexes depend on the write lock. - * However sometimes we want to relax the write lock while we hold - * these mutexes, according to the release/reacquire on schedule() - * properties of the Bkl that were used. - * Reiserfs performances and locking were based on this scheme. - * Now that the write lock is a mutex and not the bkl anymore, doing so - * may result in a deadlock: - * - * A acquire write_lock - * A acquire j_commit_mutex - * A release write_lock and wait for something - * B acquire write_lock - * B can't acquire j_commit_mutex and sleep - * A can't acquire write lock anymore - * deadlock - * - * What we do here is avoiding such deadlock by playing the same game - * than the Bkl: if we can't acquire a mutex that depends on the write lock, - * we release the write lock, wait a bit and then retry. - * - * The mutexes concerned by this hack are: - * - The commit mutex of a journal list - * - The flush mutex - * - The journal lock - * - The inode mutex - */ -static inline void reiserfs_mutex_lock_safe(struct mutex *m, - struct super_block *s) -{ - int depth; - - depth = reiserfs_write_unlock_nested(s); - mutex_lock(m); - reiserfs_write_lock_nested(s, depth); -} - -static inline void -reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, - struct super_block *s) -{ - int depth; - - depth = reiserfs_write_unlock_nested(s); - mutex_lock_nested(m, subclass); - reiserfs_write_lock_nested(s, depth); -} - -static inline void -reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) -{ - int depth; - depth = reiserfs_write_unlock_nested(s); - down_read(sem); - reiserfs_write_lock_nested(s, depth); -} - -/* - * When we schedule, we usually want to also release the write lock, - * according to the previous bkl based locking scheme of reiserfs. 
- */ -static inline void reiserfs_cond_resched(struct super_block *s) -{ - if (need_resched()) { - int depth; - - depth = reiserfs_write_unlock_nested(s); - schedule(); - reiserfs_write_lock_nested(s, depth); - } -} - -struct fid; - -/* - * in reading the #defines, it may help to understand that they employ - * the following abbreviations: - * - * B = Buffer - * I = Item header - * H = Height within the tree (should be changed to LEV) - * N = Number of the item in the node - * STAT = stat data - * DEH = Directory Entry Header - * EC = Entry Count - * E = Entry number - * UL = Unsigned Long - * BLKH = BLocK Header - * UNFM = UNForMatted node - * DC = Disk Child - * P = Path - * - * These #defines are named by concatenating these abbreviations, - * where first comes the arguments, and last comes the return value, - * of the macro. - */ - -#define USE_INODE_GENERATION_COUNTER - -#define REISERFS_PREALLOCATE -#define DISPLACE_NEW_PACKING_LOCALITIES -#define PREALLOCATION_SIZE 9 - -/* n must be power of 2 */ -#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) - -/* - * to be ok for alpha and others we have to align structures to 8 byte - * boundary. - * FIXME: do not change 4 by anything else: there is code which relies on that - */ -#define ROUND_UP(x) _ROUND_UP(x,8LL) - -/* - * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug - * messages. - */ -#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ - -void __reiserfs_warning(struct super_block *s, const char *id, - const char *func, const char *fmt, ...); -#define reiserfs_warning(s, id, fmt, args...) \ - __reiserfs_warning(s, id, __func__, fmt, ##args) -/* assertions handling */ - -/* always check a condition and panic if it's false. */ -#define __RASSERT(cond, scond, format, args...) \ -do { \ - if (!(cond)) \ - reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ - __FILE__ ":%i:%s: " format "\n", \ - __LINE__, __func__ , ##args); \ -} while (0) - -#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) - -#if defined( CONFIG_REISERFS_CHECK ) -#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args) -#else -#define RFALSE( cond, format, args... ) do {;} while( 0 ) -#endif - -#define CONSTF __attribute_const__ -/* - * Disk Data Structures - */ - -/*************************************************************************** - * SUPER BLOCK * - ***************************************************************************/ - -/* - * Structure of super block on disk, a version of which in RAM is often - * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger - * structure containing fields never written to disk. - */ -#define UNSET_HASH 0 /* Detect hash on disk */ -#define TEA_HASH 1 -#define YURA_HASH 2 -#define R5_HASH 3 -#define DEFAULT_HASH R5_HASH - -struct journal_params { - /* where does journal start from on its * device */ - __le32 jp_journal_1st_block; - - /* journal device st_rdev */ - __le32 jp_journal_dev; - - /* size of the journal */ - __le32 jp_journal_size; - - /* max number of blocks in a transaction. 
*/ - __le32 jp_journal_trans_max; - - /* - * random value made on fs creation - * (this was sb_journal_block_count) - */ - __le32 jp_journal_magic; - - /* max number of blocks to batch into a trans */ - __le32 jp_journal_max_batch; - - /* in seconds, how old can an async commit be */ - __le32 jp_journal_max_commit_age; - - /* in seconds, how old can a transaction be */ - __le32 jp_journal_max_trans_age; -}; - -/* this is the super from 3.5.X, where X >= 10 */ -struct reiserfs_super_block_v1 { - __le32 s_block_count; /* blocks count */ - __le32 s_free_blocks; /* free blocks count */ - __le32 s_root_block; /* root block number */ - struct journal_params s_journal; - __le16 s_blocksize; /* block size */ - - /* max size of object id array, see get_objectid() commentary */ - __le16 s_oid_maxsize; - __le16 s_oid_cursize; /* current size of object id array */ - - /* this is set to 1 when filesystem was umounted, to 2 - when not */ - __le16 s_umount_state; - - /* - * reiserfs magic string indicates that file system is reiserfs: - * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" - */ - char s_magic[10]; - - /* - * it is set to used by fsck to mark which - * phase of rebuilding is done - */ - __le16 s_fs_state; - /* - * indicate, what hash function is being use - * to sort names in a directory - */ - __le32 s_hash_function_code; - __le16 s_tree_height; /* height of disk tree */ - - /* - * amount of bitmap blocks needed to address - * each block of file system - */ - __le16 s_bmap_nr; - - /* - * this field is only reliable on filesystem with non-standard journal - */ - __le16 s_version; - - /* - * size in blocks of journal area on main device, we need to - * keep after making fs with non-standard journal - */ - __le16 s_reserved_for_journal; -} __attribute__ ((__packed__)); - -#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) - -/* this is the on disk super block */ -struct reiserfs_super_block { - struct reiserfs_super_block_v1 s_v1; - __le32 s_inode_generation; - - /* Right now used only by inode-attributes, if enabled */ - __le32 s_flags; - - unsigned char s_uuid[16]; /* filesystem unique identifier */ - unsigned char s_label[16]; /* filesystem volume label */ - __le16 s_mnt_count; /* Count of mounts since last fsck */ - __le16 s_max_mnt_count; /* Maximum mounts before check */ - __le32 s_lastcheck; /* Timestamp of last fsck */ - __le32 s_check_interval; /* Interval between checks */ - - /* - * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1() - * so any additions must be updated there as well. 
*/ - char s_unused[76]; -} __attribute__ ((__packed__)); - -#define SB_SIZE (sizeof(struct reiserfs_super_block)) - -#define REISERFS_VERSION_1 0 -#define REISERFS_VERSION_2 2 - -/* on-disk super block fields converted to cpu form */ -#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) -#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) -#define SB_BLOCKSIZE(s) \ - le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize)) -#define SB_BLOCK_COUNT(s) \ - le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count)) -#define SB_FREE_BLOCKS(s) \ - le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks)) -#define SB_REISERFS_MAGIC(s) \ - (SB_V1_DISK_SUPER_BLOCK(s)->s_magic) -#define SB_ROOT_BLOCK(s) \ - le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block)) -#define SB_TREE_HEIGHT(s) \ - le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height)) -#define SB_REISERFS_STATE(s) \ - le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state)) -#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version)) -#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr)) - -#define PUT_SB_BLOCK_COUNT(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0) -#define PUT_SB_FREE_BLOCKS(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0) -#define PUT_SB_ROOT_BLOCK(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0) -#define PUT_SB_TREE_HEIGHT(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0) -#define PUT_SB_REISERFS_STATE(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0) -#define PUT_SB_VERSION(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0) -#define PUT_SB_BMAP_NR(s, val) \ - do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0) - -#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal) -#define SB_ONDISK_JOURNAL_SIZE(s) \ - le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size)) -#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \ - le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block)) -#define SB_ONDISK_JOURNAL_DEVICE(s) \ - le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev)) -#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \ - le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal)) - -#define is_block_in_log_or_reserved_area(s, block) \ - block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \ - && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \ - ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \ - SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))) - -int is_reiserfs_3_5(struct reiserfs_super_block *rs); -int is_reiserfs_3_6(struct reiserfs_super_block *rs); -int is_reiserfs_jr(struct reiserfs_super_block *rs); - -/* - * ReiserFS leaves the first 64k unused, so that partition labels have - * enough space. If someone wants to write a fancy bootloader that - * needs more than 64k, let us know, and this will be increased in size. - * This number must be larger than the largest block size on any - * platform, or code will break. 
-Hans - */ -#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) -#define REISERFS_FIRST_BLOCK unused_define -#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES - -/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */ -#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024) - -/* reiserfs internal error code (used by search_by_key and fix_nodes)) */ -#define CARRY_ON 0 -#define REPEAT_SEARCH -1 -#define IO_ERROR -2 -#define NO_DISK_SPACE -3 -#define NO_BALANCING_NEEDED (-4) -#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5) -#define QUOTA_EXCEEDED -6 - -typedef __u32 b_blocknr_t; -typedef __le32 unp_t; - -struct unfm_nodeinfo { - unp_t unfm_nodenum; - unsigned short unfm_freespace; -}; - -/* there are two formats of keys: 3.5 and 3.6 */ -#define KEY_FORMAT_3_5 0 -#define KEY_FORMAT_3_6 1 - -/* there are two stat datas */ -#define STAT_DATA_V1 0 -#define STAT_DATA_V2 1 - -static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode) -{ - return container_of(inode, struct reiserfs_inode_info, vfs_inode); -} - -static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* - * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 - * which overflows on large file systems. - */ -static inline __u32 reiserfs_bmap_count(struct super_block *sb) -{ - return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; -} - -static inline int bmap_would_wrap(unsigned bmap_nr) -{ - return bmap_nr > ((1LL << 16) - 1); -} - -extern const struct xattr_handler * const reiserfs_xattr_handlers[]; - -/* - * this says about version of key of all items (but stat data) the - * object consists of - */ -#define get_inode_item_key_version( inode ) \ - ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5) - -#define set_inode_item_key_version( inode, version ) \ - ({ if((version)==KEY_FORMAT_3_6) \ - REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \ - else \ - REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; }) - -#define get_inode_sd_version(inode) \ - ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1) - -#define set_inode_sd_version(inode, version) \ - ({ if((version)==STAT_DATA_V2) \ - REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \ - else \ - REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) - -/* - * This is an aggressive tail suppression policy, I am hoping it - * improves our benchmarks. The principle behind it is that percentage - * space saving is what matters, not absolute space saving. This is - * non-intuitive, but it helps to understand it if you consider that the - * cost to access 4 blocks is not much more than the cost to access 1 - * block, if you have to do a seek and rotate. A tail risks a - * non-linear disk access that is significant as a percentage of total - * time cost for a 4 block file and saves an amount of space that is - * less significant as a percentage of space, or so goes the hypothesis. 
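[Editor's note: to make the tail-suppression policy above concrete, here is a small user-space sketch that evaluates the same thresholds as the STORE_TAIL_IN_UNFM_S1 macro defined just below. MAX_DIRECT_ITEM_LEN lives elsewhere in the reiserfs headers, so the overhead subtracted here is only an assumption to keep the sketch self-contained.]

#include <stdio.h>

/*
 * Stand-in for the real MAX_DIRECT_ITEM_LEN(); the 120-byte overhead is an
 * assumed placeholder, not the kernel's definition.
 */
#define MAX_DIRECT_ITEM_LEN(size)	((size) - 120)

/* the same thresholds as STORE_TAIL_IN_UNFM_S1 below */
static int store_tail_in_unfm(long file_size, long tail_size, long block_size)
{
	long max_direct = MAX_DIRECT_ITEM_LEN(block_size);

	return !tail_size ||
	       tail_size > max_direct ||
	       file_size >= block_size * 4 ||
	       (file_size >= block_size * 3 && tail_size >= max_direct / 4) ||
	       (file_size >= block_size * 2 && tail_size >= max_direct / 2) ||
	       (file_size >= block_size && tail_size >= (max_direct * 3) / 4);
}

int main(void)
{
	/* a 5000-byte file with a 904-byte tail on 4 KiB blocks keeps its tail packed... */
	printf("5000-byte file:  %s\n",
	       store_tail_in_unfm(5000, 5000 % 4096, 4096) ? "unformatted" : "direct item");
	/* ...while a 20000-byte file is already "large" and always goes unformatted */
	printf("20000-byte file: %s\n",
	       store_tail_in_unfm(20000, 20000 % 4096, 4096) ? "unformatted" : "direct item");
	return 0;
}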
- * -Hans - */ -#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ -(\ - (!(n_tail_size)) || \ - (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ - ( (n_file_size) >= (n_block_size) * 4 ) || \ - ( ( (n_file_size) >= (n_block_size) * 3 ) && \ - ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \ - ( ( (n_file_size) >= (n_block_size) * 2 ) && \ - ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \ - ( ( (n_file_size) >= (n_block_size) ) && \ - ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ -) - -/* - * Another strategy for tails, this one means only create a tail if all the - * file would fit into one DIRECT item. - * Primary intention for this one is to increase performance by decreasing - * seeking. -*/ -#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ -(\ - (!(n_tail_size)) || \ - (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \ -) - -/* - * values for s_umount_state field - */ -#define REISERFS_VALID_FS 1 -#define REISERFS_ERROR_FS 2 - -/* - * there are 5 item types currently - */ -#define TYPE_STAT_DATA 0 -#define TYPE_INDIRECT 1 -#define TYPE_DIRECT 2 -#define TYPE_DIRENTRY 3 -#define TYPE_MAXTYPE 3 -#define TYPE_ANY 15 /* FIXME: comment is required */ - -/*************************************************************************** - * KEY & ITEM HEAD * - ***************************************************************************/ - -/* * directories use this key as well as old files */ -struct offset_v1 { - __le32 k_offset; - __le32 k_uniqueness; -} __attribute__ ((__packed__)); - -struct offset_v2 { - __le64 v; -} __attribute__ ((__packed__)); - -static inline __u16 offset_v2_k_type(const struct offset_v2 *v2) -{ - __u8 type = le64_to_cpu(v2->v) >> 60; - return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY; -} - -static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type) -{ - v2->v = - (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60); -} - -static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2) -{ - return le64_to_cpu(v2->v) & (~0ULL >> 4); -} - -static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) -{ - offset &= (~0ULL >> 4); - v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); -} - -/* - * Key of an item determines its location in the S+tree, and - * is composed of 4 components - */ -struct reiserfs_key { - /* packing locality: by default parent directory object id */ - __le32 k_dir_id; - - __le32 k_objectid; /* object identifier */ - union { - struct offset_v1 k_offset_v1; - struct offset_v2 k_offset_v2; - } __attribute__ ((__packed__)) u; -} __attribute__ ((__packed__)); - -struct in_core_key { - /* packing locality: by default parent directory object id */ - __u32 k_dir_id; - __u32 k_objectid; /* object identifier */ - __u64 k_offset; - __u8 k_type; -}; - -struct cpu_key { - struct in_core_key on_disk_key; - int version; - /* 3 in all cases but direct2indirect and indirect2direct conversion */ - int key_length; -}; - -/* - * Our function for comparing keys can compare keys of different - * lengths. It takes as a parameter the length of the keys it is to - * compare. These defines are used in determining what is to be passed - * to it as that parameter. 
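The offset_v2 helpers above keep the item type in the top 4 bits and the offset in the low 60 bits of a single 64-bit word (stored little-endian on disk; the kernel code byte-swaps with le64_to_cpu()/cpu_to_le64()). A userspace sketch of just the packing, with invented names and a plain uint64_t standing in for __le64:

/* 4-bit type, 60-bit offset packed into one 64-bit word. */
#include <assert.h>
#include <stdint.h>

#define SK_TYPE_DIRECT 2	/* mirrors TYPE_DIRECT above */

static unsigned sk_v2_type(uint64_t v)   { return (unsigned)(v >> 60); }
static uint64_t sk_v2_offset(uint64_t v) { return v & (~0ULL >> 4); }

static uint64_t sk_v2_pack(unsigned type, uint64_t offset)
{
	return ((uint64_t)type << 60) | (offset & (~0ULL >> 4));
}

int main(void)
{
	uint64_t v = sk_v2_pack(SK_TYPE_DIRECT, 4097);

	assert(sk_v2_type(v) == SK_TYPE_DIRECT);
	assert(sk_v2_offset(v) == 4097);
	return 0;
}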
- */ -#define REISERFS_FULL_KEY_LEN 4 -#define REISERFS_SHORT_KEY_LEN 2 - -/* The result of the key compare */ -#define FIRST_GREATER 1 -#define SECOND_GREATER -1 -#define KEYS_IDENTICAL 0 -#define KEY_FOUND 1 -#define KEY_NOT_FOUND 0 - -#define KEY_SIZE (sizeof(struct reiserfs_key)) - -/* return values for search_by_key and clones */ -#define ITEM_FOUND 1 -#define ITEM_NOT_FOUND 0 -#define ENTRY_FOUND 1 -#define ENTRY_NOT_FOUND 0 -#define DIRECTORY_NOT_FOUND -1 -#define REGULAR_FILE_FOUND -2 -#define DIRECTORY_FOUND -3 -#define BYTE_FOUND 1 -#define BYTE_NOT_FOUND 0 -#define FILE_NOT_FOUND -1 - -#define POSITION_FOUND 1 -#define POSITION_NOT_FOUND 0 - -/* return values for reiserfs_find_entry and search_by_entry_key */ -#define NAME_FOUND 1 -#define NAME_NOT_FOUND 0 -#define GOTO_PREVIOUS_ITEM 2 -#define NAME_FOUND_INVISIBLE 3 - -/* - * Everything in the filesystem is stored as a set of items. The - * item head contains the key of the item, its free space (for - * indirect items) and specifies the location of the item itself - * within the block. - */ - -struct item_head { - /* - * Everything in the tree is found by searching for it based on - * its key. - */ - struct reiserfs_key ih_key; - union { - /* - * The free space in the last unformatted node of an - * indirect item if this is an indirect item. This - * equals 0xFFFF iff this is a direct item or stat data - * item. Note that the key, not this field, is used to - * determine the item type, and thus which field this - * union contains. - */ - __le16 ih_free_space_reserved; - - /* - * Iff this is a directory item, this field equals the - * number of directory entries in the directory item. - */ - __le16 ih_entry_count; - } __attribute__ ((__packed__)) u; - __le16 ih_item_len; /* total size of the item body */ - - /* an offset to the item body within the block */ - __le16 ih_item_location; - - /* - * 0 for all old items, 2 for new ones. Highest bit is set by fsck - * temporary, cleaned after all done - */ - __le16 ih_version; -} __attribute__ ((__packed__)); -/* size of item header */ -#define IH_SIZE (sizeof(struct item_head)) - -#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved) -#define ih_version(ih) le16_to_cpu((ih)->ih_version) -#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count) -#define ih_location(ih) le16_to_cpu((ih)->ih_item_location) -#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len) - -#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0) -#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0) -#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0) -#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0) -#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0) - -#define unreachable_item(ih) (ih_version(ih) & (1 << 15)) - -#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) -#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) - -/* - * these operate on indirect items, where you've got an array of ints - * at a possibly unaligned location. These are a noop on ia32 - * - * p is the array of __u32, i is the index into the array, v is the value - * to store there. 
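KEY_SIZE and IH_SIZE follow from the packed layouts above; mirroring them with plain fixed-width types pins the expected values down to 16 and 24 bytes. A compile-time sketch (sk_* names invented here, __le* types replaced by uint*_t):

#include <stdint.h>

struct sk_offset_v1 { uint32_t k_offset; uint32_t k_uniqueness; } __attribute__((packed));
struct sk_offset_v2 { uint64_t v; } __attribute__((packed));

struct sk_key {
	uint32_t k_dir_id;
	uint32_t k_objectid;
	union {
		struct sk_offset_v1 v1;
		struct sk_offset_v2 v2;
	} __attribute__((packed)) u;
} __attribute__((packed));

struct sk_item_head {
	struct sk_key ih_key;
	uint16_t u;			/* free space / entry count union */
	uint16_t ih_item_len;
	uint16_t ih_item_location;
	uint16_t ih_version;
} __attribute__((packed));

_Static_assert(sizeof(struct sk_key) == 16, "KEY_SIZE");
_Static_assert(sizeof(struct sk_item_head) == 24, "IH_SIZE");

int main(void) { return 0; }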
- */ -#define get_block_num(p, i) get_unaligned_le32((p) + (i)) -#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) - -/* * in old version uniqueness field shows key type */ -#define V1_SD_UNIQUENESS 0 -#define V1_INDIRECT_UNIQUENESS 0xfffffffe -#define V1_DIRECT_UNIQUENESS 0xffffffff -#define V1_DIRENTRY_UNIQUENESS 500 -#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */ - -/* here are conversion routines */ -static inline int uniqueness2type(__u32 uniqueness) CONSTF; -static inline int uniqueness2type(__u32 uniqueness) -{ - switch ((int)uniqueness) { - case V1_SD_UNIQUENESS: - return TYPE_STAT_DATA; - case V1_INDIRECT_UNIQUENESS: - return TYPE_INDIRECT; - case V1_DIRECT_UNIQUENESS: - return TYPE_DIRECT; - case V1_DIRENTRY_UNIQUENESS: - return TYPE_DIRENTRY; - case V1_ANY_UNIQUENESS: - default: - return TYPE_ANY; - } -} - -static inline __u32 type2uniqueness(int type) CONSTF; -static inline __u32 type2uniqueness(int type) -{ - switch (type) { - case TYPE_STAT_DATA: - return V1_SD_UNIQUENESS; - case TYPE_INDIRECT: - return V1_INDIRECT_UNIQUENESS; - case TYPE_DIRECT: - return V1_DIRECT_UNIQUENESS; - case TYPE_DIRENTRY: - return V1_DIRENTRY_UNIQUENESS; - case TYPE_ANY: - default: - return V1_ANY_UNIQUENESS; - } -} - -/* - * key is pointer to on disk key which is stored in le, result is cpu, - * there is no way to get version of object from key, so, provide - * version to these defines - */ -static inline loff_t le_key_k_offset(int version, - const struct reiserfs_key *key) -{ - return (version == KEY_FORMAT_3_5) ? - le32_to_cpu(key->u.k_offset_v1.k_offset) : - offset_v2_k_offset(&(key->u.k_offset_v2)); -} - -static inline loff_t le_ih_k_offset(const struct item_head *ih) -{ - return le_key_k_offset(ih_version(ih), &(ih->ih_key)); -} - -static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) -{ - if (version == KEY_FORMAT_3_5) { - loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness); - return uniqueness2type(val); - } else - return offset_v2_k_type(&(key->u.k_offset_v2)); -} - -static inline loff_t le_ih_k_type(const struct item_head *ih) -{ - return le_key_k_type(ih_version(ih), &(ih->ih_key)); -} - -static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, - loff_t offset) -{ - if (version == KEY_FORMAT_3_5) - key->u.k_offset_v1.k_offset = cpu_to_le32(offset); - else - set_offset_v2_k_offset(&key->u.k_offset_v2, offset); -} - -static inline void add_le_key_k_offset(int version, struct reiserfs_key *key, - loff_t offset) -{ - set_le_key_k_offset(version, key, - le_key_k_offset(version, key) + offset); -} - -static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset) -{ - add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); -} - -static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) -{ - set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); -} - -static inline void set_le_key_k_type(int version, struct reiserfs_key *key, - int type) -{ - if (version == KEY_FORMAT_3_5) { - type = type2uniqueness(type); - key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type); - } else - set_offset_v2_k_type(&key->u.k_offset_v2, type); -} - -static inline void set_le_ih_k_type(struct item_head *ih, int type) -{ - set_le_key_k_type(ih_version(ih), &(ih->ih_key), type); -} - -static inline int is_direntry_le_key(int version, struct reiserfs_key *key) -{ - return le_key_k_type(version, key) == TYPE_DIRENTRY; -} - -static inline int is_direct_le_key(int version, struct reiserfs_key *key) -{ - 
return le_key_k_type(version, key) == TYPE_DIRECT; -} - -static inline int is_indirect_le_key(int version, struct reiserfs_key *key) -{ - return le_key_k_type(version, key) == TYPE_INDIRECT; -} - -static inline int is_statdata_le_key(int version, struct reiserfs_key *key) -{ - return le_key_k_type(version, key) == TYPE_STAT_DATA; -} - -/* item header has version. */ -static inline int is_direntry_le_ih(struct item_head *ih) -{ - return is_direntry_le_key(ih_version(ih), &ih->ih_key); -} - -static inline int is_direct_le_ih(struct item_head *ih) -{ - return is_direct_le_key(ih_version(ih), &ih->ih_key); -} - -static inline int is_indirect_le_ih(struct item_head *ih) -{ - return is_indirect_le_key(ih_version(ih), &ih->ih_key); -} - -static inline int is_statdata_le_ih(struct item_head *ih) -{ - return is_statdata_le_key(ih_version(ih), &ih->ih_key); -} - -/* key is pointer to cpu key, result is cpu */ -static inline loff_t cpu_key_k_offset(const struct cpu_key *key) -{ - return key->on_disk_key.k_offset; -} - -static inline loff_t cpu_key_k_type(const struct cpu_key *key) -{ - return key->on_disk_key.k_type; -} - -static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset) -{ - key->on_disk_key.k_offset = offset; -} - -static inline void set_cpu_key_k_type(struct cpu_key *key, int type) -{ - key->on_disk_key.k_type = type; -} - -static inline void cpu_key_k_offset_dec(struct cpu_key *key) -{ - key->on_disk_key.k_offset--; -} - -#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY) -#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT) -#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT) -#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA) - -/* are these used ? */ -#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key))) -#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key))) -#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) -#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) - -#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \ - (!COMP_SHORT_KEYS(ih, key) && \ - I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize)) - -/* maximal length of item */ -#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE) -#define MIN_ITEM_LEN 1 - -/* object identifier for root dir */ -#define REISERFS_ROOT_OBJECTID 2 -#define REISERFS_ROOT_PARENT_OBJECTID 1 - -extern struct reiserfs_key root_key; - -/* - * Picture represents a leaf of the S+tree - * ______________________________________________________ - * | | Array of | | | - * |Block | Object-Item | F r e e | Objects- | - * | head | Headers | S p a c e | Items | - * |______|_______________|___________________|___________| - */ - -/* - * Header of a disk block. More precisely, header of a formatted leaf - * or internal node, and not the header of an unformatted node. - */ -struct block_head { - __le16 blk_level; /* Level of a block in the tree. */ - __le16 blk_nr_item; /* Number of keys/items in a block. */ - __le16 blk_free_space; /* Block free space in bytes. 
*/ - __le16 blk_reserved; - /* dump this in v4/planA */ - - /* kept only for compatibility */ - struct reiserfs_key blk_right_delim_key; -}; - -#define BLKH_SIZE (sizeof(struct block_head)) -#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level)) -#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item)) -#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space)) -#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved)) -#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val)) -#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val)) -#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val)) -#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val)) -#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key) -#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val) - -/* values for blk_level field of the struct block_head */ - -/* - * When node gets removed from the tree its blk_level is set to FREE_LEVEL. - * It is then used to see whether the node is still in the tree - */ -#define FREE_LEVEL 0 - -#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ - -/* - * Given the buffer head of a formatted node, resolve to the - * block head of that node. - */ -#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) -/* Number of items that are in buffer. */ -#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) -#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh))) -#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh))) - -#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0) -#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0) -#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0) - -/* Get right delimiting key. -- little endian */ -#define B_PRIGHT_DELIM_KEY(bh) (&(blk_right_delim_key(B_BLK_HEAD(bh)))) - -/* Does the buffer contain a disk leaf. */ -#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL) - -/* Does the buffer contain a disk internal node */ -#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ - && B_LEVEL(bh) <= MAX_HEIGHT) - -/*************************************************************************** - * STAT DATA * - ***************************************************************************/ - -/* - * old stat data is 32 bytes long. We are going to distinguish new one by - * different size -*/ -struct stat_data_v1 { - __le16 sd_mode; /* file type, permissions */ - __le16 sd_nlink; /* number of hard links */ - __le16 sd_uid; /* owner */ - __le16 sd_gid; /* group */ - __le32 sd_size; /* file size */ - __le32 sd_atime; /* time of last access */ - __le32 sd_mtime; /* time file was last modified */ - - /* - * time inode (stat data) was last changed - * (except changes to sd_atime and sd_mtime) - */ - __le32 sd_ctime; - union { - __le32 sd_rdev; - __le32 sd_blocks; /* number of blocks file uses */ - } __attribute__ ((__packed__)) u; - - /* - * first byte of file which is stored in a direct item: except that if - * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no - * direct item. The existence of this field really grates on me. - * Let's replace it with a macro based on sd_size and our tail - * suppression policy. Someday. 
-Hans - */ - __le32 sd_first_direct_byte; -} __attribute__ ((__packed__)); - -#define SD_V1_SIZE (sizeof(struct stat_data_v1)) -#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5) -#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) -#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) -#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink)) -#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v)) -#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid)) -#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v)) -#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid)) -#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v)) -#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size)) -#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v)) -#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) -#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) -#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) -#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) -#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) -#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) -#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) -#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) -#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks)) -#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v)) -#define sd_v1_first_direct_byte(sdp) \ - (le32_to_cpu((sdp)->sd_first_direct_byte)) -#define set_sd_v1_first_direct_byte(sdp,v) \ - ((sdp)->sd_first_direct_byte = cpu_to_le32(v)) - -/* inode flags stored in sd_attrs (nee sd_reserved) */ - -/* - * we want common flags to have the same values as in ext2, - * so chattr(1) will work without problems - */ -#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL -#define REISERFS_APPEND_FL FS_APPEND_FL -#define REISERFS_SYNC_FL FS_SYNC_FL -#define REISERFS_NOATIME_FL FS_NOATIME_FL -#define REISERFS_NODUMP_FL FS_NODUMP_FL -#define REISERFS_SECRM_FL FS_SECRM_FL -#define REISERFS_UNRM_FL FS_UNRM_FL -#define REISERFS_COMPR_FL FS_COMPR_FL -#define REISERFS_NOTAIL_FL FS_NOTAIL_FL - -/* persistent flags that file inherits from the parent directory */ -#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ - REISERFS_SYNC_FL | \ - REISERFS_NOATIME_FL | \ - REISERFS_NODUMP_FL | \ - REISERFS_SECRM_FL | \ - REISERFS_COMPR_FL | \ - REISERFS_NOTAIL_FL ) - -/* - * Stat Data on disk (reiserfs version of UFS disk inode minus the - * address blocks) - */ -struct stat_data { - __le16 sd_mode; /* file type, permissions */ - __le16 sd_attrs; /* persistent inode flags */ - __le32 sd_nlink; /* number of hard links */ - __le64 sd_size; /* file size */ - __le32 sd_uid; /* owner */ - __le32 sd_gid; /* group */ - __le32 sd_atime; /* time of last access */ - __le32 sd_mtime; /* time file was last modified */ - - /* - * time inode (stat data) was last changed - * (except changes to sd_atime and sd_mtime) - */ - __le32 sd_ctime; - __le32 sd_blocks; - union { - __le32 sd_rdev; - __le32 sd_generation; - } __attribute__ ((__packed__)) u; -} __attribute__ ((__packed__)); - -/* this is 44 bytes long */ -#define SD_SIZE (sizeof(struct stat_data)) -#define SD_V2_SIZE SD_SIZE -#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6) -#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode)) -#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v)) -/* sd_reserved */ -/* set_sd_reserved */ -#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink)) -#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = 
cpu_to_le32(v)) -#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size)) -#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v)) -#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid)) -#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v)) -#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid)) -#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v)) -#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime)) -#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v)) -#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime)) -#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v)) -#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime)) -#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v)) -#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks)) -#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v)) -#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev)) -#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v)) -#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation)) -#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v)) -#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs)) -#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v)) - -/*************************************************************************** - * DIRECTORY STRUCTURE * - ***************************************************************************/ -/* - * Picture represents the structure of directory items - * ________________________________________________ - * | Array of | | | | | | - * | directory |N-1| N-2 | .... | 1st |0th| - * | entry headers | | | | | | - * |_______________|___|_____|________|_______|___| - * <---- directory entries ------> - * - * First directory item has k_offset component 1. We store "." and ".." - * in one item, always, we never split "." and ".." into differing - * items. This makes, among other things, the code for removing - * directories simpler. - */ -#define SD_OFFSET 0 -#define SD_UNIQUENESS 0 -#define DOT_OFFSET 1 -#define DOT_DOT_OFFSET 2 -#define DIRENTRY_UNIQUENESS 500 - -#define FIRST_ITEM_OFFSET 1 - -/* - * Q: How to get key of object pointed to by entry from entry? - * - * A: Each directory entry has its header. 
This header has deh_dir_id - * and deh_objectid fields, those are key of object, entry points to - */ - -/* - * NOT IMPLEMENTED: - * Directory will someday contain stat data of object - */ - -struct reiserfs_de_head { - __le32 deh_offset; /* third component of the directory entry key */ - - /* - * objectid of the parent directory of the object, that is referenced - * by directory entry - */ - __le32 deh_dir_id; - - /* objectid of the object, that is referenced by directory entry */ - __le32 deh_objectid; - __le16 deh_location; /* offset of name in the whole item */ - - /* - * whether 1) entry contains stat data (for future), and - * 2) whether entry is hidden (unlinked) - */ - __le16 deh_state; -} __attribute__ ((__packed__)); -#define DEH_SIZE sizeof(struct reiserfs_de_head) -#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset)) -#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id)) -#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid)) -#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location)) -#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state)) - -#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v))) -#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v))) -#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v))) -#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v))) -#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v))) - -/* empty directory contains two entries "." and ".." and their headers */ -#define EMPTY_DIR_SIZE \ -(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1)) - -/* old format directories have this size when empty */ -#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) - -#define DEH_Statdata 0 /* not used now */ -#define DEH_Visible 2 - -/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */ -#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__) -# define ADDR_UNALIGNED_BITS (3) -#endif - -/* - * These are only used to manipulate deh_state. 
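For the directory entry header above, DEH_SIZE works out to 16 bytes, and deh_state carries the Visible flag at bit 2 (DEH_Visible). A userspace rendering, with invented sk_* names and plain shifts standing in for the unaligned little-endian bitops defined just below, of what mark_de_visible()/de_visible() amount to:

#include <assert.h>
#include <stdint.h>

struct sk_de_head {
	uint32_t deh_offset;
	uint32_t deh_dir_id;	/* with deh_objectid: key of the referenced object */
	uint32_t deh_objectid;
	uint16_t deh_location;
	uint16_t deh_state;
} __attribute__((packed));

_Static_assert(sizeof(struct sk_de_head) == 16, "DEH_SIZE");

#define SK_DEH_VISIBLE 2	/* mirrors DEH_Visible above */

int main(void)
{
	struct sk_de_head deh = { 0 };

	deh.deh_state |= 1u << SK_DEH_VISIBLE;		/* mark_de_visible() */
	assert(deh.deh_state & (1u << SK_DEH_VISIBLE));	/* de_visible()      */
	return 0;
}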
- * Because of this, we'll use the ext2_ bit routines, - * since they are little endian - */ -#ifdef ADDR_UNALIGNED_BITS - -# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) -# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3) - -# define set_bit_unaligned(nr, addr) \ - __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) -# define clear_bit_unaligned(nr, addr) \ - __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) -# define test_bit_unaligned(nr, addr) \ - test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) - -#else - -# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr) -# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr) -# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr) - -#endif - -#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) -#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) -#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state)) -#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state)) - -#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) -#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) -#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) - -extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid); -extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid); - -/* two entries per block (at least) */ -#define REISERFS_MAX_NAME(block_size) 255 - -/* - * this structure is used for operations on directory entries. It is - * not a disk structure. - * - * When reiserfs_find_entry or search_by_entry_key find directory - * entry, they return filled reiserfs_dir_entry structure - */ -struct reiserfs_dir_entry { - struct buffer_head *de_bh; - int de_item_num; - struct item_head *de_ih; - int de_entry_num; - struct reiserfs_de_head *de_deh; - int de_entrylen; - int de_namelen; - char *de_name; - unsigned long *de_gen_number_bit_string; - - __u32 de_dir_id; - __u32 de_objectid; - - struct cpu_key de_entry_key; -}; - -/* - * these defines are useful when a particular member of - * a reiserfs_dir_entry is needed - */ - -/* pointer to file name, stored in entry */ -#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \ - (ih_item_body(bh, ih) + deh_location(deh)) - -/* length of name */ -#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ -(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? 
SD_SIZE : 0)) - -/* hash value occupies bits from 7 up to 30 */ -#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL) -/* generation number occupies 7 bits starting from 0 up to 6 */ -#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL) -#define MAX_GENERATION_NUMBER 127 - -#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number)) - -/* - * Picture represents an internal node of the reiserfs tree - * ______________________________________________________ - * | | Array of | Array of | Free | - * |block | keys | pointers | space | - * | head | N | N+1 | | - * |______|_______________|___________________|___________| - */ - -/*************************************************************************** - * DISK CHILD * - ***************************************************************************/ -/* - * Disk child pointer: - * The pointer from an internal node of the tree to a node that is on disk. - */ -struct disk_child { - __le32 dc_block_number; /* Disk child's block number. */ - __le16 dc_size; /* Disk child's used space. */ - __le16 dc_reserved; -}; - -#define DC_SIZE (sizeof(struct disk_child)) -#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number)) -#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size)) -#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0) -#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0) - -/* Get disk child by buffer header and position in the tree node. */ -#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\ -((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos))) - -/* Get disk child number by buffer header and position in the tree node. */ -#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos))) -#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \ - (put_dc_block_number(B_N_CHILD(bh, n_pos), val)) - - /* maximal value of field child_size in structure disk_child */ - /* child size is the combined size of all items and their headers */ -#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE )) - -/* amount of used space in buffer (not including block head) */ -#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur))) - -/* max and min number of keys in internal node */ -#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) -#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) - -/*************************************************************************** - * PATH STRUCTURES AND DEFINES * - ***************************************************************************/ - -/* - * search_by_key fills up the path from the root to the leaf as it descends - * the tree looking for the key. It uses reiserfs_bread to try to find - * buffers in the cache given their block number. If it does not find - * them in the cache it reads them from disk. For each node search_by_key - * finds using reiserfs_bread it then uses bin_search to look through that - * node. bin_search will find the position of the block_number of the next - * node if it is looking through an internal node. If it is looking through - * a leaf node bin_search will find the position of the item which has key - * either equal to given key, or which is the maximal key less than the - * given key. - */ - -struct path_element { - /* Pointer to the buffer at the path in the tree. */ - struct buffer_head *pe_buffer; - /* Position in the tree node which is placed in the buffer above. 
*/ - int pe_position; -}; - -/* - * maximal height of a tree. don't change this without - * changing JOURNAL_PER_BALANCE_CNT - */ -#define MAX_HEIGHT 5 - -/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ -#define EXTENDED_MAX_HEIGHT 7 - -/* Must be equal to at least 2. */ -#define FIRST_PATH_ELEMENT_OFFSET 2 - -/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ -#define ILLEGAL_PATH_ELEMENT_OFFSET 1 - -/* this MUST be MAX_HEIGHT + 1. See about FEB below */ -#define MAX_FEB_SIZE 6 - -/* - * We need to keep track of who the ancestors of nodes are. When we - * perform a search we record which nodes were visited while - * descending the tree looking for the node we searched for. This list - * of nodes is called the path. This information is used while - * performing balancing. Note that this path information may become - * invalid, and this means we must check it when using it to see if it - * is still valid. You'll need to read search_by_key and the comments - * in it, especially about decrement_counters_in_path(), to understand - * this structure. - * - * Paths make the code so much harder to work with and debug.... An - * enormous number of bugs are due to them, and trying to write or modify - * code that uses them just makes my head hurt. They are based on an - * excessive effort to avoid disturbing the precious VFS code.:-( The - * gods only know how we are going to SMP the code that uses them. - * znodes are the way! - */ - -#define PATH_READA 0x1 /* do read ahead */ -#define PATH_READA_BACK 0x2 /* read backwards */ - -struct treepath { - int path_length; /* Length of the array above. */ - int reada; - /* Array of the path elements. */ - struct path_element path_elements[EXTENDED_MAX_HEIGHT]; - int pos_in_item; -}; - -#define pos_in_item(path) ((path)->pos_in_item) - -#define INITIALIZE_PATH(var) \ -struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} - -/* Get path element by path and path position. */ -#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset)) - -/* Get buffer header at the path by path and path position. */ -#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer) - -/* Get position in the element at the path by path and path position. */ -#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) - -#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) - -/* - * you know, to the person who didn't write this the macro name does not - * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or - * maybe we should just focus on dumping paths... -Hans - */ -#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) - -/* - * in do_balance leaf has h == 0 in contrast with path structure, - * where root has level == 0. 
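The path constants above fix the layout of a treepath: slot 1 (ILLEGAL_PATH_ELEMENT_OFFSET) is never valid, the root-to-leaf chain starts at slot 2, and the PATH_H_* helpers that follow map balancing level h to slot path_length - h. A small sketch of that index arithmetic, with an assumed path_length:

#include <assert.h>

#define SK_MAX_HEIGHT 5
#define SK_FIRST_PATH_ELEMENT_OFFSET 2
#define SK_ILLEGAL_PATH_ELEMENT_OFFSET 1
#define SK_EXTENDED_MAX_HEIGHT (SK_MAX_HEIGHT + SK_FIRST_PATH_ELEMENT_OFFSET)

int main(void)
{
	int path_length = 4;	/* hypothetical 3-level tree: slots 2, 3, 4 in use */

	assert(SK_EXTENDED_MAX_HEIGHT == 7);
	assert(SK_ILLEGAL_PATH_ELEMENT_OFFSET == SK_FIRST_PATH_ELEMENT_OFFSET - 1);
	/* PATH_H_PBUFFER(path, h) resolves to slot path_length - h */
	assert(path_length - 0 == 4);				/* h == 0: the leaf */
	assert(path_length - 2 == SK_FIRST_PATH_ELEMENT_OFFSET);/* h == 2: the root */
	return 0;
}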
That is why we need these defines - */ - -/* tb->S[h] */ -#define PATH_H_PBUFFER(path, h) \ - PATH_OFFSET_PBUFFER(path, path->path_length - (h)) - -/* tb->F[h] or tb->S[0]->b_parent */ -#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1) - -#define PATH_H_POSITION(path, h) \ - PATH_OFFSET_POSITION(path, path->path_length - (h)) - -/* tb->S[h]->b_item_order */ -#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) - -#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) - -static inline void *reiserfs_node_data(const struct buffer_head *bh) -{ - return bh->b_data + sizeof(struct block_head); -} - -/* get key from internal node */ -static inline struct reiserfs_key *internal_key(struct buffer_head *bh, - int item_num) -{ - struct reiserfs_key *key = reiserfs_node_data(bh); - - return &key[item_num]; -} - -/* get the item header from leaf node */ -static inline struct item_head *item_head(const struct buffer_head *bh, - int item_num) -{ - struct item_head *ih = reiserfs_node_data(bh); - - return &ih[item_num]; -} - -/* get the key from leaf node */ -static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh, - int item_num) -{ - return &item_head(bh, item_num)->ih_key; -} - -static inline void *ih_item_body(const struct buffer_head *bh, - const struct item_head *ih) -{ - return bh->b_data + ih_location(ih); -} - -/* get item body from leaf node */ -static inline void *item_body(const struct buffer_head *bh, int item_num) -{ - return ih_item_body(bh, item_head(bh, item_num)); -} - -static inline struct item_head *tp_item_head(const struct treepath *path) -{ - return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); -} - -static inline void *tp_item_body(const struct treepath *path) -{ - return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); -} - -#define get_last_bh(path) PATH_PLAST_BUFFER(path) -#define get_item_pos(path) PATH_LAST_POSITION(path) -#define item_moved(ih,path) comp_items(ih, path) -#define path_changed(ih,path) comp_items (ih, path) - -/* array of the entry headers */ - /* get item body */ -#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih))) - -/* - * length of the directory entry in directory item. This define - * calculates length of i-th directory entry using directory entry - * locations from dir entry head. When it calculates length of 0-th - * directory entry, it uses length of whole item in place of entry - * location of the non-existent following entry in the calculation. - * See picture above. - */ -static inline int entry_length(const struct buffer_head *bh, - const struct item_head *ih, int pos_in_item) -{ - struct reiserfs_de_head *deh; - - deh = B_I_DEH(bh, ih) + pos_in_item; - if (pos_in_item) - return deh_location(deh - 1) - deh_location(deh); - - return ih_item_len(ih) - deh_location(deh); -} - -/*************************************************************************** - * MISC * - ***************************************************************************/ - -/* Size of pointer to the unformatted node. */ -#define UNFM_P_SIZE (sizeof(unp_t)) -#define UNFM_P_SHIFT 2 - -/* in in-core inode key is stored on le form */ -#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) - -#define MAX_UL_INT 0xffffffff -#define MAX_INT 0x7ffffff -#define MAX_US_INT 0xffff - -// reiserfs version 2 has max offset 60 bits. 
Version 1 - 32 bit offset -static inline loff_t max_reiserfs_offset(struct inode *inode) -{ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) - return (loff_t) U32_MAX; - - return (loff_t) ((~(__u64) 0) >> 4); -} - -#define MAX_KEY_OBJECTID MAX_UL_INT - -#define MAX_B_NUM MAX_UL_INT -#define MAX_FC_NUM MAX_US_INT - -/* the purpose is to detect overflow of an unsigned short */ -#define REISERFS_LINK_MAX (MAX_US_INT - 1000) - -/* - * The following defines are used in reiserfs_insert_item - * and reiserfs_append_item - */ -#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */ -#define REISERFS_USER_MEM 1 /* user memory mode */ - -#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) -#define get_generation(s) atomic_read (&fs_generation(s)) -#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) -#define __fs_changed(gen,s) (gen != get_generation (s)) -#define fs_changed(gen,s) \ -({ \ - reiserfs_cond_resched(s); \ - __fs_changed(gen, s); \ -}) - -/*************************************************************************** - * FIXATE NODES * - ***************************************************************************/ - -#define VI_TYPE_LEFT_MERGEABLE 1 -#define VI_TYPE_RIGHT_MERGEABLE 2 - -/* - * To make any changes in the tree we always first find node, that - * contains item to be changed/deleted or place to insert a new - * item. We call this node S. To do balancing we need to decide what - * we will shift to left/right neighbor, or to a new node, where new - * item will be etc. To make this analysis simpler we build virtual - * node. Virtual node is an array of items, that will replace items of - * node S. (For instance if we are going to delete an item, virtual - * node does not contain it). Virtual node keeps information about - * item sizes and types, mergeability of first and last items, sizes - * of all entries in directory item. We use this array of items when - * calculating what we can shift to neighbors and how many nodes we - * have to have if we do not any shiftings, if we shift to left/right - * neighbor or to both. 
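max_reiserfs_offset() above gives the two limits directly: 3.5-format keys carry a 32-bit offset, 3.6-format keys 60 bits. For reference, a standalone sketch printing both values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t max_v1 = UINT32_MAX;		/* KEY_FORMAT_3_5: 32-bit offsets */
	uint64_t max_v2 = ~UINT64_C(0) >> 4;	/* KEY_FORMAT_3_6: 60-bit offsets */

	printf("3.5 max offset: %" PRIu64 "\n", max_v1);	/* 4294967295 */
	printf("3.6 max offset: %" PRIu64 "\n", max_v2);	/* 2^60 - 1   */
	return 0;
}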
- */ -struct virtual_item { - int vi_index; /* index in the array of item operations */ - unsigned short vi_type; /* left/right mergeability */ - - /* length of item that it will have after balancing */ - unsigned short vi_item_len; - - struct item_head *vi_ih; - const char *vi_item; /* body of item (old or new) */ - const void *vi_new_data; /* 0 always but paste mode */ - void *vi_uarea; /* item specific area */ -}; - -struct virtual_node { - /* this is a pointer to the free space in the buffer */ - char *vn_free_ptr; - - unsigned short vn_nr_item; /* number of items in virtual node */ - - /* - * size of node , that node would have if it has - * unlimited size and no balancing is performed - */ - short vn_size; - - /* mode of balancing (paste, insert, delete, cut) */ - short vn_mode; - - short vn_affected_item_num; - short vn_pos_in_item; - - /* item header of inserted item, 0 for other modes */ - struct item_head *vn_ins_ih; - const void *vn_data; - - /* array of items (including a new one, excluding item to be deleted) */ - struct virtual_item *vn_vi; -}; - -/* used by directory items when creating virtual nodes */ -struct direntry_uarea { - int flags; - __u16 entry_count; - __u16 entry_sizes[]; -} __attribute__ ((__packed__)); - -/*************************************************************************** - * TREE BALANCE * - ***************************************************************************/ - -/* - * This temporary structure is used in tree balance algorithms, and - * constructed as we go to the extent that its various parts are - * needed. It contains arrays of nodes that can potentially be - * involved in the balancing of node S, and parameters that define how - * each of the nodes must be balanced. Note that in these algorithms - * for balancing the worst case is to need to balance the current node - * S and the left and right neighbors and all of their parents plus - * create a new node. We implement S1 balancing for the leaf nodes - * and S0 balancing for the internal nodes (S1 and S0 are defined in - * our papers.) - */ - -/* size of the array of buffers to free at end of do_balance */ -#define MAX_FREE_BLOCK 7 - -/* maximum number of FEB blocknrs on a single level */ -#define MAX_AMOUNT_NEEDED 2 - -/* someday somebody will prefix every field in this struct with tb_ */ -struct tree_balance { - int tb_mode; - int need_balance_dirty; - struct super_block *tb_sb; - struct reiserfs_transaction_handle *transaction_handle; - struct treepath *tb_path; - - /* array of left neighbors of nodes in the path */ - struct buffer_head *L[MAX_HEIGHT]; - - /* array of right neighbors of nodes in the path */ - struct buffer_head *R[MAX_HEIGHT]; - - /* array of fathers of the left neighbors */ - struct buffer_head *FL[MAX_HEIGHT]; - - /* array of fathers of the right neighbors */ - struct buffer_head *FR[MAX_HEIGHT]; - /* array of common parents of center node and its left neighbor */ - struct buffer_head *CFL[MAX_HEIGHT]; - - /* array of common parents of center node and its right neighbor */ - struct buffer_head *CFR[MAX_HEIGHT]; - - /* - * array of empty buffers. Number of buffers in array equals - * cur_blknum. 
- */ - struct buffer_head *FEB[MAX_FEB_SIZE]; - struct buffer_head *used[MAX_FEB_SIZE]; - struct buffer_head *thrown[MAX_FEB_SIZE]; - - /* - * array of number of items which must be shifted to the left in - * order to balance the current node; for leaves includes item that - * will be partially shifted; for internal nodes, it is the number - * of child pointers rather than items. It includes the new item - * being created. The code sometimes subtracts one to get the - * number of wholly shifted items for other purposes. - */ - int lnum[MAX_HEIGHT]; - - /* substitute right for left in comment above */ - int rnum[MAX_HEIGHT]; - - /* - * array indexed by height h mapping the key delimiting L[h] and - * S[h] to its item number within the node CFL[h] - */ - int lkey[MAX_HEIGHT]; - - /* substitute r for l in comment above */ - int rkey[MAX_HEIGHT]; - - /* - * the number of bytes by we are trying to add or remove from - * S[h]. A negative value means removing. - */ - int insert_size[MAX_HEIGHT]; - - /* - * number of nodes that will replace node S[h] after balancing - * on the level h of the tree. If 0 then S is being deleted, - * if 1 then S is remaining and no new nodes are being created, - * if 2 or 3 then 1 or 2 new nodes is being created - */ - int blknum[MAX_HEIGHT]; - - /* fields that are used only for balancing leaves of the tree */ - - /* number of empty blocks having been already allocated */ - int cur_blknum; - - /* number of items that fall into left most node when S[0] splits */ - int s0num; - - /* - * number of bytes which can flow to the left neighbor from the left - * most liquid item that cannot be shifted from S[0] entirely - * if -1 then nothing will be partially shifted - */ - int lbytes; - - /* - * number of bytes which will flow to the right neighbor from the right - * most liquid item that cannot be shifted from S[0] entirely - * if -1 then nothing will be partially shifted - */ - int rbytes; - - - /* - * index into the array of item headers in - * S[0] of the affected item - */ - int item_pos; - - /* new nodes allocated to hold what could not fit into S */ - struct buffer_head *S_new[2]; - - /* - * number of items that will be placed into nodes in S_new - * when S[0] splits - */ - int snum[2]; - - /* - * number of bytes which flow to nodes in S_new when S[0] splits - * note: if S[0] splits into 3 nodes, then items do not need to be cut - */ - int sbytes[2]; - - int pos_in_item; - int zeroes_num; - - /* - * buffers which are to be freed after do_balance finishes - * by unfix_nodes - */ - struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; - - /* - * kmalloced memory. Used to create virtual node and keep - * map of dirtied bitmap blocks - */ - char *vn_buf; - - int vn_buf_size; /* size of the vn_buf */ - - /* VN starts after bitmap of bitmap blocks */ - struct virtual_node *tb_vn; - - /* - * saved value of `reiserfs_generation' counter see - * FILESYSTEM_CHANGED() macro in reiserfs_fs.h - */ - int fs_gen; - -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* - * key pointer, to pass to block allocator or - * another low-level subsystem - */ - struct in_core_key key; -#endif -}; - -/* These are modes of balancing */ - -/* When inserting an item. */ -#define M_INSERT 'i' -/* - * When inserting into (directories only) or appending onto an already - * existent item. - */ -#define M_PASTE 'p' -/* When deleting an item. */ -#define M_DELETE 'd' -/* When truncating an item or removing an entry from a (directory) item. 
*/ -#define M_CUT 'c' - -/* used when balancing on leaf level skipped (in reiserfsck) */ -#define M_INTERNAL 'n' - -/* - * When further balancing is not needed, then do_balance does not need - * to be called. - */ -#define M_SKIP_BALANCING 's' -#define M_CONVERT 'v' - -/* modes of leaf_move_items */ -#define LEAF_FROM_S_TO_L 0 -#define LEAF_FROM_S_TO_R 1 -#define LEAF_FROM_R_TO_L 2 -#define LEAF_FROM_L_TO_R 3 -#define LEAF_FROM_S_TO_SNEW 4 - -#define FIRST_TO_LAST 0 -#define LAST_TO_FIRST 1 - -/* - * used in do_balance for passing parent of node information that has - * been gotten from tb struct - */ -struct buffer_info { - struct tree_balance *tb; - struct buffer_head *bi_bh; - struct buffer_head *bi_parent; - int bi_position; -}; - -static inline struct super_block *sb_from_tb(struct tree_balance *tb) -{ - return tb ? tb->tb_sb : NULL; -} - -static inline struct super_block *sb_from_bi(struct buffer_info *bi) -{ - return bi ? sb_from_tb(bi->tb) : NULL; -} - -/* - * there are 4 types of items: stat data, directory item, indirect, direct. - * +-------------------+------------+--------------+------------+ - * | | k_offset | k_uniqueness | mergeable? | - * +-------------------+------------+--------------+------------+ - * | stat data | 0 | 0 | no | - * +-------------------+------------+--------------+------------+ - * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no | - * | non 1st directory | hash value | UNIQUENESS | yes | - * | item | | | | - * +-------------------+------------+--------------+------------+ - * | indirect item | offset + 1 |TYPE_INDIRECT | [1] | - * +-------------------+------------+--------------+------------+ - * | direct item | offset + 1 |TYPE_DIRECT | [2] | - * +-------------------+------------+--------------+------------+ - * - * [1] if this is not the first indirect item of the object - * [2] if this is not the first direct item of the object -*/ - -struct item_operations { - int (*bytes_number) (struct item_head * ih, int block_size); - void (*decrement_key) (struct cpu_key *); - int (*is_left_mergeable) (struct reiserfs_key * ih, - unsigned long bsize); - void (*print_item) (struct item_head *, char *item); - void (*check_item) (struct item_head *, char *item); - - int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, - int is_affected, int insert_size); - int (*check_left) (struct virtual_item * vi, int free, - int start_skip, int end_skip); - int (*check_right) (struct virtual_item * vi, int free); - int (*part_size) (struct virtual_item * vi, int from, int to); - int (*unit_num) (struct virtual_item * vi); - void (*print_vi) (struct virtual_item * vi); -}; - -extern struct item_operations *item_ops[TYPE_ANY + 1]; - -#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) -#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) -#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item) -#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item) -#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size) -#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip) -#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free) -#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to) -#define op_unit_num(vi) 
item_ops[(vi)->vi_index]->unit_num (vi) -#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi) - -#define COMP_SHORT_KEYS comp_short_keys - -/* number of blocks pointed to by the indirect item */ -#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) - -/* - * the used space within the unformatted node corresponding - * to pos within the item pointed to by ih - */ -#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size)) - -/* - * number of bytes contained by the direct item or the - * unformatted nodes the indirect item points to - */ - -/* following defines use reiserfs buffer header and item header */ - -/* get stat-data */ -#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) - -/* this is 3976 for size==4096 */ -#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) - -/* - * indirect items consist of entries which contain blocknrs, pos - * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the - * blocknr contained by the entry pos points to - */ -#define B_I_POS_UNFM_POINTER(bh, ih, pos) \ - le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos))) -#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \ - (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val)) - -struct reiserfs_iget_args { - __u32 objectid; - __u32 dirid; -}; - -/*************************************************************************** - * FUNCTION DECLARATIONS * - ***************************************************************************/ - -#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) - -#define journal_trans_half(blocksize) \ - ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32)) - -/* journal.c see journal.c for all the comments here */ - -/* first block written in a commit. */ -struct reiserfs_journal_desc { - __le32 j_trans_id; /* id of commit */ - - /* length of commit. len +1 is the commit block */ - __le32 j_len; - - __le32 j_mount_id; /* mount id of this trans */ - __le32 j_realblock[]; /* real locations for each block */ -}; - -#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) -#define get_desc_trans_len(d) le32_to_cpu((d)->j_len) -#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id) - -#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0) -#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0) -#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0) - -/* last block written in a commit */ -struct reiserfs_journal_commit { - __le32 j_trans_id; /* must match j_trans_id from the desc block */ - __le32 j_len; /* ditto */ - __le32 j_realblock[]; /* real locations for each block */ -}; - -#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) -#define get_commit_trans_len(c) le32_to_cpu((c)->j_len) -#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id) - -#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) -#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0) - -/* - * this header block gets written whenever a transaction is considered - * fully flushed, and is more recent than the last fully flushed transaction. - * fully flushed means all the log blocks and all the real blocks are on - * disk, and this transaction does not need to be replayed. 
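journal_trans_half() above says how many block numbers fit into one descriptor or commit block: the three 32-bit header words plus the 12 bytes at the end of the block (where get_journal_desc_magic() points) are subtracted first, which for 4 KiB blocks leaves room for 1018 entries. A userspace sketch of that arithmetic with a mirrored struct (sk_* name invented here):

#include <assert.h>
#include <stdint.h>

struct sk_journal_desc {
	uint32_t j_trans_id;
	uint32_t j_len;
	uint32_t j_mount_id;
	uint32_t j_realblock[];		/* flexible array, not counted by sizeof */
};

int main(void)
{
	unsigned blocksize = 4096;
	unsigned half = (blocksize - sizeof(struct sk_journal_desc) - 12) /
			sizeof(uint32_t);

	assert(sizeof(struct sk_journal_desc) == 12);
	assert(half == 1018);	/* journal_trans_half(4096) */
	return 0;
}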
- */ -struct reiserfs_journal_header { - /* id of last fully flushed transaction */ - __le32 j_last_flush_trans_id; - - /* offset in the log of where to start replay after a crash */ - __le32 j_first_unflushed_offset; - - __le32 j_mount_id; - /* 12 */ struct journal_params jh_journal; -}; - -/* biggest tunable defines are right here */ -#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ - -/* biggest possible single transaction, don't change for now (8/3/99) */ -#define JOURNAL_TRANS_MAX_DEFAULT 1024 -#define JOURNAL_TRANS_MIN_DEFAULT 256 - -/* - * max blocks to batch into one transaction, - * don't make this any bigger than 900 - */ -#define JOURNAL_MAX_BATCH_DEFAULT 900 -#define JOURNAL_MIN_RATIO 2 -#define JOURNAL_MAX_COMMIT_AGE 30 -#define JOURNAL_MAX_TRANS_AGE 30 -#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) -#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \ - 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \ - REISERFS_QUOTA_TRANS_BLOCKS(sb))) - -#ifdef CONFIG_QUOTA -#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA)) -/* We need to update data and inode (atime) */ -#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0) -/* 1 balancing, 1 bitmap, 1 data per write + stat data update */ -#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ -(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0) -/* same as with INIT */ -#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ -(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0) -#else -#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0 -#define REISERFS_QUOTA_INIT_BLOCKS(s) 0 -#define REISERFS_QUOTA_DEL_BLOCKS(s) 0 -#endif - -/* - * both of these can be as low as 1, or as high as you want. The min is the - * number of 4k bitmap nodes preallocated on mount. New nodes are allocated - * as needed, and released when transactions are committed. On release, if - * the current number of nodes is > max, the node is freed, otherwise, - * it is put on a free list for faster use later. -*/ -#define REISERFS_MIN_BITMAP_NODES 10 -#define REISERFS_MAX_BITMAP_NODES 100 - -/* these are based on journal hash size of 8192 */ -#define JBH_HASH_SHIFT 13 -#define JBH_HASH_MASK 8191 - -#define _jhashfn(sb,block) \ - (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \ - (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) -#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) - -/* We need these to make journal.c code more readable */ -#define journal_find_get_block(s, block) __find_get_block(\ - file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ - block, s->s_blocksize) -#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\ - block, s->s_blocksize) - -enum reiserfs_bh_state_bits { - BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ - BH_JDirty_wait, - /* - * disk block was taken off free list before being in a - * finished transaction, or written to disk. Can be reused immed. 
- */ - BH_JNew, - BH_JPrepared, - BH_JRestore_dirty, - BH_JTest, /* debugging only will go away */ -}; - -BUFFER_FNS(JDirty, journaled); -TAS_BUFFER_FNS(JDirty, journaled); -BUFFER_FNS(JDirty_wait, journal_dirty); -TAS_BUFFER_FNS(JDirty_wait, journal_dirty); -BUFFER_FNS(JNew, journal_new); -TAS_BUFFER_FNS(JNew, journal_new); -BUFFER_FNS(JPrepared, journal_prepared); -TAS_BUFFER_FNS(JPrepared, journal_prepared); -BUFFER_FNS(JRestore_dirty, journal_restore_dirty); -TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); -BUFFER_FNS(JTest, journal_test); -TAS_BUFFER_FNS(JTest, journal_test); - -/* transaction handle which is passed around for all journal calls */ -struct reiserfs_transaction_handle { - /* - * super for this FS when journal_begin was called. saves calls to - * reiserfs_get_super also used by nested transactions to make - * sure they are nesting on the right FS _must_ be first - * in the handle - */ - struct super_block *t_super; - - int t_refcount; - int t_blocks_logged; /* number of blocks this writer has logged */ - int t_blocks_allocated; /* number of blocks this writer allocated */ - - /* sanity check, equals the current trans id */ - unsigned int t_trans_id; - - void *t_handle_save; /* save existing current->journal_info */ - - /* - * if new block allocation occurres, that block - * should be displaced from others - */ - unsigned displace_new_blocks:1; - - struct list_head t_list; -}; - -/* - * used to keep track of ordered and tail writes, attached to the buffer - * head through b_journal_head. - */ -struct reiserfs_jh { - struct reiserfs_journal_list *jl; - struct buffer_head *bh; - struct list_head list; -}; - -void reiserfs_free_jh(struct buffer_head *bh); -int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); -int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); -int journal_mark_dirty(struct reiserfs_transaction_handle *, - struct buffer_head *bh); - -static inline int reiserfs_file_data_log(struct inode *inode) -{ - if (reiserfs_data_log(inode->i_sb) || - (REISERFS_I(inode)->i_flags & i_data_log)) - return 1; - return 0; -} - -static inline int reiserfs_transaction_running(struct super_block *s) -{ - struct reiserfs_transaction_handle *th = current->journal_info; - if (th && th->t_super == s) - return 1; - if (th && th->t_super == NULL) - BUG(); - return 0; -} - -static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th) -{ - return th->t_blocks_allocated - th->t_blocks_logged; -} - -struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct - super_block - *, - int count); -int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); -void reiserfs_vfs_truncate_file(struct inode *inode); -int reiserfs_commit_page(struct inode *inode, struct page *page, - unsigned from, unsigned to); -void reiserfs_flush_old_commits(struct super_block *); -int reiserfs_commit_for_inode(struct inode *); -int reiserfs_inode_needs_commit(struct inode *); -void reiserfs_update_inode_transaction(struct inode *); -void reiserfs_wait_on_write_block(struct super_block *s); -void reiserfs_block_writes(struct reiserfs_transaction_handle *th); -void reiserfs_allow_writes(struct super_block *s); -void reiserfs_check_lock_depth(struct super_block *s, char *caller); -int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, - int wait); -void reiserfs_restore_prepared_buffer(struct super_block *, - struct buffer_head *bh); -int journal_init(struct super_block *, const 
char *j_dev_name, int old_format, - unsigned int); -int journal_release(struct reiserfs_transaction_handle *, struct super_block *); -int journal_release_error(struct reiserfs_transaction_handle *, - struct super_block *); -int journal_end(struct reiserfs_transaction_handle *); -int journal_end_sync(struct reiserfs_transaction_handle *); -int journal_mark_freed(struct reiserfs_transaction_handle *, - struct super_block *, b_blocknr_t blocknr); -int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); -int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, - int bit_nr, int searchall, b_blocknr_t *next); -int journal_begin(struct reiserfs_transaction_handle *, - struct super_block *sb, unsigned long); -int journal_join_abort(struct reiserfs_transaction_handle *, - struct super_block *sb); -void reiserfs_abort_journal(struct super_block *sb, int errno); -void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); -int reiserfs_allocate_list_bitmaps(struct super_block *s, - struct reiserfs_list_bitmap *, unsigned int); - -void reiserfs_schedule_old_flush(struct super_block *s); -void reiserfs_cancel_old_flush(struct super_block *s); -void add_save_link(struct reiserfs_transaction_handle *th, - struct inode *inode, int truncate); -int remove_save_link(struct inode *inode, int truncate); - -/* objectid.c */ -__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th); -void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, - __u32 objectid_to_release); -int reiserfs_convert_objectid_map_v1(struct super_block *); - -/* stree.c */ -int B_IS_IN_TREE(const struct buffer_head *); -extern void copy_item_head(struct item_head *to, - const struct item_head *from); - -/* first key is in cpu form, second - le */ -extern int comp_short_keys(const struct reiserfs_key *le_key, - const struct cpu_key *cpu_key); -extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); - -/* both are in le form */ -extern int comp_le_keys(const struct reiserfs_key *, - const struct reiserfs_key *); -extern int comp_short_le_keys(const struct reiserfs_key *, - const struct reiserfs_key *); - -/* * get key version from on disk key - kludge */ -static inline int le_key_version(const struct reiserfs_key *key) -{ - int type; - - type = offset_v2_k_type(&(key->u.k_offset_v2)); - if (type != TYPE_DIRECT && type != TYPE_INDIRECT - && type != TYPE_DIRENTRY) - return KEY_FORMAT_3_5; - - return KEY_FORMAT_3_6; - -} - -static inline void copy_key(struct reiserfs_key *to, - const struct reiserfs_key *from) -{ - memcpy(to, from, KEY_SIZE); -} - -int comp_items(const struct item_head *stored_ih, const struct treepath *path); -const struct reiserfs_key *get_rkey(const struct treepath *chk_path, - const struct super_block *sb); -int search_by_key(struct super_block *, const struct cpu_key *, - struct treepath *, int); -#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) -int search_for_position_by_key(struct super_block *sb, - const struct cpu_key *cpu_key, - struct treepath *search_path); -extern void decrement_bcount(struct buffer_head *bh); -void decrement_counters_in_path(struct treepath *search_path); -void pathrelse(struct treepath *search_path); -int reiserfs_check_path(struct treepath *p); -void pathrelse_and_restore(struct super_block *s, struct treepath *search_path); - -int reiserfs_insert_item(struct reiserfs_transaction_handle *th, - struct treepath *path, - const struct cpu_key *key, - 
struct item_head *ih, - struct inode *inode, const char *body); - -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, - struct treepath *path, - const struct cpu_key *key, - struct inode *inode, - const char *body, int paste_size); - -int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, - struct treepath *path, - struct cpu_key *key, - struct inode *inode, - struct page *page, loff_t new_file_size); - -int reiserfs_delete_item(struct reiserfs_transaction_handle *th, - struct treepath *path, - const struct cpu_key *key, - struct inode *inode, struct buffer_head *un_bh); - -void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, - struct inode *inode, struct reiserfs_key *key); -int reiserfs_delete_object(struct reiserfs_transaction_handle *th, - struct inode *inode); -int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, - struct inode *inode, struct page *, - int update_timestamps); - -#define i_block_size(inode) ((inode)->i_sb->s_blocksize) -#define file_size(inode) ((inode)->i_size) -#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1)) - -#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\ -!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 ) - -void padd_item(char *item, int total_length, int length); - -/* inode.c */ -/* args for the create parameter of reiserfs_get_block */ -#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ -#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ -#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ -#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ -#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ -#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ - -void reiserfs_read_locked_inode(struct inode *inode, - struct reiserfs_iget_args *args); -int reiserfs_find_actor(struct inode *inode, void *p); -int reiserfs_init_locked_inode(struct inode *inode, void *p); -void reiserfs_evict_inode(struct inode *inode); -int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int reiserfs_get_block(struct inode *inode, sector_t block, - struct buffer_head *bh_result, int create); -struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, - struct inode *parent); - -int reiserfs_truncate_file(struct inode *, int update_timestamps); -void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, - int type, int key_length); -void make_le_item_head(struct item_head *ih, const struct cpu_key *key, - int version, - loff_t offset, int type, int length, int entry_count); -struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key); - -struct reiserfs_security_handle; -int reiserfs_new_inode(struct reiserfs_transaction_handle *th, - struct inode *dir, umode_t mode, - const char *symname, loff_t i_size, - struct dentry *dentry, struct inode *inode, - struct reiserfs_security_handle *security); - -void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, - struct inode *inode, loff_t size); - -static inline void 
reiserfs_update_sd(struct reiserfs_transaction_handle *th, - struct inode *inode) -{ - reiserfs_update_sd_size(th, inode, inode->i_size); -} - -void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr); - -int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); - -/* namei.c */ -void reiserfs_init_priv_inode(struct inode *inode); -void set_de_name_and_namelen(struct reiserfs_dir_entry *de); -int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, - struct treepath *path, struct reiserfs_dir_entry *de); -struct dentry *reiserfs_get_parent(struct dentry *); - -#ifdef CONFIG_REISERFS_PROC_INFO -int reiserfs_proc_info_init(struct super_block *sb); -int reiserfs_proc_info_done(struct super_block *sb); -int reiserfs_proc_info_global_init(void); -int reiserfs_proc_info_global_done(void); - -#define PROC_EXP( e ) e - -#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data -#define PROC_INFO_MAX( sb, field, value ) \ - __PINFO( sb ).field = \ - max( REISERFS_SB( sb ) -> s_proc_info_data.field, value ) -#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) ) -#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) ) -#define PROC_INFO_BH_STAT( sb, bh, level ) \ - PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \ - PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \ - PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) ) -#else -static inline int reiserfs_proc_info_init(struct super_block *sb) -{ - return 0; -} - -static inline int reiserfs_proc_info_done(struct super_block *sb) -{ - return 0; -} - -static inline int reiserfs_proc_info_global_init(void) -{ - return 0; -} - -static inline int reiserfs_proc_info_global_done(void) -{ - return 0; -} - -#define PROC_EXP( e ) -#define VOID_V ( ( void ) 0 ) -#define PROC_INFO_MAX( sb, field, value ) VOID_V -#define PROC_INFO_INC( sb, field ) VOID_V -#define PROC_INFO_ADD( sb, field, val ) VOID_V -#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V -#endif - -/* dir.c */ -extern const struct inode_operations reiserfs_dir_inode_operations; -extern const struct inode_operations reiserfs_symlink_inode_operations; -extern const struct inode_operations reiserfs_special_inode_operations; -extern const struct file_operations reiserfs_dir_operations; -int reiserfs_readdir_inode(struct inode *, struct dir_context *); - -/* tail_conversion.c */ -int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, - struct treepath *, struct buffer_head *, loff_t); -int indirect2direct(struct reiserfs_transaction_handle *, struct inode *, - struct page *, struct treepath *, const struct cpu_key *, - loff_t, char *); -void reiserfs_unmap_buffer(struct buffer_head *); - -/* file.c */ -extern const struct inode_operations reiserfs_file_inode_operations; -extern const struct inode_operations reiserfs_priv_file_inode_operations; -extern const struct file_operations reiserfs_file_operations; -extern const struct address_space_operations reiserfs_address_space_operations; - -/* fix_nodes.c */ - -int fix_nodes(int n_op_mode, struct tree_balance *tb, - struct item_head *ins_ih, const void *); -void unfix_nodes(struct tree_balance *); - -/* prints.c */ -void __reiserfs_panic(struct super_block *s, const char *id, - const char *function, const char *fmt, ...) - __attribute__ ((noreturn)); -#define reiserfs_panic(s, id, fmt, args...) 
\ - __reiserfs_panic(s, id, __func__, fmt, ##args) -void __reiserfs_error(struct super_block *s, const char *id, - const char *function, const char *fmt, ...); -#define reiserfs_error(s, id, fmt, args...) \ - __reiserfs_error(s, id, __func__, fmt, ##args) -void reiserfs_info(struct super_block *s, const char *fmt, ...); -void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); -void print_indirect_item(struct buffer_head *bh, int item_num); -void store_print_tb(struct tree_balance *tb); -void print_cur_tb(char *mes); -void print_de(struct reiserfs_dir_entry *de); -void print_bi(struct buffer_info *bi, char *mes); -#define PRINT_LEAF_ITEMS 1 /* print all items */ -#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */ -#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */ -void print_block(struct buffer_head *bh, ...); -void print_bmap(struct super_block *s, int silent); -void print_bmap_block(int i, char *data, int size, int silent); -/*void print_super_block (struct super_block * s, char * mes);*/ -void print_objectid_map(struct super_block *s); -void print_block_head(struct buffer_head *bh, char *mes); -void check_leaf(struct buffer_head *bh); -void check_internal(struct buffer_head *bh); -void print_statistics(struct super_block *s); -char *reiserfs_hashname(int code); - -/* lbalance.c */ -int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, - int mov_bytes, struct buffer_head *Snew); -int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes); -int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); -void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, - int del_num, int del_bytes); -void leaf_insert_into_buf(struct buffer_info *bi, int before, - struct item_head * const inserted_item_ih, - const char * const inserted_item_body, - int zeros_number); -void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, - int pos_in_item, int paste_size, - const char * const body, int zeros_number); -void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, - int pos_in_item, int cut_size); -void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, - int new_entry_count, struct reiserfs_de_head *new_dehs, - const char *records, int paste_size); -/* ibalance.c */ -int balance_internal(struct tree_balance *, int, int, struct item_head *, - struct buffer_head **); - -/* do_balance.c */ -void do_balance_mark_leaf_dirty(struct tree_balance *tb, - struct buffer_head *bh, int flag); -#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty -#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty - -void do_balance(struct tree_balance *tb, struct item_head *ih, - const char *body, int flag); -void reiserfs_invalidate_buffer(struct tree_balance *tb, - struct buffer_head *bh); - -int get_left_neighbor_position(struct tree_balance *tb, int h); -int get_right_neighbor_position(struct tree_balance *tb, int h); -void replace_key(struct tree_balance *tb, struct buffer_head *, int, - struct buffer_head *, int); -void make_empty_node(struct buffer_info *); -struct buffer_head *get_FEB(struct tree_balance *); - -/* bitmap.c */ - -/* - * structure contains hints for block allocator, and it is a container for - * arguments, such as node, search path, transaction_handle, etc. - */ -struct __reiserfs_blocknr_hint { - /* inode passed to allocator, if we allocate unf. 
nodes */ - struct inode *inode; - - sector_t block; /* file offset, in blocks */ - struct in_core_key key; - - /* - * search path, used by allocator to deternine search_start by - * various ways - */ - struct treepath *path; - - /* - * transaction handle is needed to log super blocks - * and bitmap blocks changes - */ - struct reiserfs_transaction_handle *th; - - b_blocknr_t beg, end; - - /* - * a field used to transfer search start value (block number) - * between different block allocator procedures - * (determine_search_start() and others) - */ - b_blocknr_t search_start; - - /* - * is set in determine_prealloc_size() function, - * used by underlayed function that do actual allocation - */ - int prealloc_size; - - /* - * the allocator uses different polices for getting disk - * space for formatted/unformatted blocks with/without preallocation - */ - unsigned formatted_node:1; - unsigned preallocate:1; -}; - -typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t; - -int reiserfs_parse_alloc_options(struct super_block *, char *); -void reiserfs_init_alloc_options(struct super_block *s); - -/* - * given a directory, this will tell you what packing locality - * to use for a new object underneat it. The locality is returned - * in disk byte order (le). - */ -__le32 reiserfs_choose_packing(struct inode *dir); - -void show_alloc_options(struct seq_file *seq, struct super_block *s); -int reiserfs_init_bitmap_cache(struct super_block *sb); -void reiserfs_free_bitmap_cache(struct super_block *sb); -void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); -struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap); -int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); -void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *, - b_blocknr_t, int for_unformatted); -int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int, - int); -static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb, - b_blocknr_t * new_blocknrs, - int amount_needed) -{ - reiserfs_blocknr_hint_t hint = { - .th = tb->transaction_handle, - .path = tb->tb_path, - .inode = NULL, - .key = tb->key, - .block = 0, - .formatted_node = 1 - }; - return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed, - 0); -} - -static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle - *th, struct inode *inode, - b_blocknr_t * new_blocknrs, - struct treepath *path, - sector_t block) -{ - reiserfs_blocknr_hint_t hint = { - .th = th, - .path = path, - .inode = inode, - .block = block, - .formatted_node = 0, - .preallocate = 0 - }; - return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); -} - -#ifdef REISERFS_PREALLOCATE -static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle - *th, struct inode *inode, - b_blocknr_t * new_blocknrs, - struct treepath *path, - sector_t block) -{ - reiserfs_blocknr_hint_t hint = { - .th = th, - .path = path, - .inode = inode, - .block = block, - .formatted_node = 0, - .preallocate = 1 - }; - return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); -} - -void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, - struct inode *inode); -void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th); -#endif - -/* hashes.c */ -__u32 keyed_hash(const signed char *msg, int len); -__u32 yura_hash(const signed char *msg, int len); -__u32 r5_hash(const signed 
char *msg, int len); - -#define reiserfs_set_le_bit __set_bit_le -#define reiserfs_test_and_set_le_bit __test_and_set_bit_le -#define reiserfs_clear_le_bit __clear_bit_le -#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le -#define reiserfs_test_le_bit test_bit_le -#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le - -/* - * sometimes reiserfs_truncate may require to allocate few new blocks - * to perform indirect2direct conversion. People probably used to - * think, that truncate should work without problems on a filesystem - * without free disk space. They may complain that they can not - * truncate due to lack of free disk space. This spare space allows us - * to not worry about it. 500 is probably too much, but it should be - * absolutely safe - */ -#define SPARE_SPACE 500 - -/* prototypes from ioctl.c */ -int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int reiserfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -long reiserfs_compat_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg); -int reiserfs_unpack(struct inode *inode); diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c deleted file mode 100644 index 7b498a0d060b..000000000000 --- a/fs/reiserfs/resize.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -/* - * Written by Alexander Zarochentcev. - * - * The kernel part of the (on-line) reiserfs resizer. - */ - -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/string.h> -#include <linux/errno.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> - -int reiserfs_resize(struct super_block *s, unsigned long block_count_new) -{ - int err = 0; - struct reiserfs_super_block *sb; - struct reiserfs_bitmap_info *bitmap; - struct reiserfs_bitmap_info *info; - struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); - struct buffer_head *bh; - struct reiserfs_transaction_handle th; - unsigned int bmap_nr_new, bmap_nr; - unsigned int block_r_new, block_r; - - struct reiserfs_list_bitmap *jb; - struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; - - unsigned long int block_count, free_blocks; - int i; - int copy_size; - int depth; - - sb = SB_DISK_SUPER_BLOCK(s); - - if (SB_BLOCK_COUNT(s) >= block_count_new) { - printk("can\'t shrink filesystem on-line\n"); - return -EINVAL; - } - - /* check the device size */ - depth = reiserfs_write_unlock_nested(s); - bh = sb_bread(s, block_count_new - 1); - reiserfs_write_lock_nested(s, depth); - if (!bh) { - printk("reiserfs_resize: can\'t read last block\n"); - return -EINVAL; - } - bforget(bh); - - /* - * old disk layout detection; those partitions can be mounted, but - * cannot be resized - */ - if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size - != REISERFS_DISK_OFFSET_IN_BYTES) { - printk - ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); - return -ENOTSUPP; - } - - /* count used bits in last bitmap block */ - block_r = SB_BLOCK_COUNT(s) - - (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8; - - /* count bitmap blocks in new fs */ - bmap_nr_new = block_count_new / (s->s_blocksize * 8); - block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; - if (block_r_new) - bmap_nr_new++; - else - block_r_new = s->s_blocksize * 8; - - /* save old values */ - block_count = 
SB_BLOCK_COUNT(s); - bmap_nr = reiserfs_bmap_count(s); - - /* resizing of reiserfs bitmaps (journal and real), if needed */ - if (bmap_nr_new > bmap_nr) { - /* reallocate journal bitmaps */ - if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { - printk - ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); - return -ENOMEM; - } - /* - * the new journal bitmaps are zero filled, now we copy i - * the bitmap node pointers from the old journal bitmap - * structs, and then transfer the new data structures - * into the journal struct. - * - * using the copy_size var below allows this code to work for - * both shrinking and expanding the FS. - */ - copy_size = min(bmap_nr_new, bmap_nr); - copy_size = - copy_size * sizeof(struct reiserfs_list_bitmap_node *); - for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { - struct reiserfs_bitmap_node **node_tmp; - jb = SB_JOURNAL(s)->j_list_bitmap + i; - memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); - - /* - * just in case vfree schedules on us, copy the new - * pointer into the journal struct before freeing the - * old one - */ - node_tmp = jb->bitmaps; - jb->bitmaps = jbitmap[i].bitmaps; - vfree(node_tmp); - } - - /* - * allocate additional bitmap blocks, reallocate - * array of bitmap block pointers - */ - bitmap = - vzalloc(array_size(bmap_nr_new, - sizeof(struct reiserfs_bitmap_info))); - if (!bitmap) { - /* - * Journal bitmaps are still supersized, but the - * memory isn't leaked, so I guess it's ok - */ - printk("reiserfs_resize: unable to allocate memory.\n"); - return -ENOMEM; - } - for (i = 0; i < bmap_nr; i++) - bitmap[i] = old_bitmap[i]; - - /* - * This doesn't go through the journal, but it doesn't have to. - * The changes are still atomic: We're synced up when the - * journal transaction begins, and the new bitmaps don't - * matter if the transaction fails. - */ - for (i = bmap_nr; i < bmap_nr_new; i++) { - int depth; - /* - * don't use read_bitmap_block since it will cache - * the uninitialized bitmap - */ - depth = reiserfs_write_unlock_nested(s); - bh = sb_bread(s, i * s->s_blocksize * 8); - reiserfs_write_lock_nested(s, depth); - if (!bh) { - vfree(bitmap); - return -EIO; - } - memset(bh->b_data, 0, sb_blocksize(sb)); - reiserfs_set_le_bit(0, bh->b_data); - reiserfs_cache_bitmap_metadata(s, bh, bitmap + i); - - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - depth = reiserfs_write_unlock_nested(s); - sync_dirty_buffer(bh); - reiserfs_write_lock_nested(s, depth); - /* update bitmap_info stuff */ - bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; - brelse(bh); - } - /* free old bitmap blocks array */ - SB_AP_BITMAP(s) = bitmap; - vfree(old_bitmap); - } - - /* - * begin transaction, if there was an error, it's fine. Yes, we have - * incorrect bitmaps now, but none of it is ever going to touch the - * disk anyway. 
- */ - err = journal_begin(&th, s, 10); - if (err) - return err; - - /* Extend old last bitmap block - new blocks have been made available */ - info = SB_AP_BITMAP(s) + bmap_nr - 1; - bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); - if (!bh) { - int jerr = journal_end(&th); - if (jerr) - return jerr; - return -EIO; - } - - reiserfs_prepare_for_journal(s, bh, 1); - for (i = block_r; i < s->s_blocksize * 8; i++) - reiserfs_clear_le_bit(i, bh->b_data); - info->free_count += s->s_blocksize * 8 - block_r; - - journal_mark_dirty(&th, bh); - brelse(bh); - - /* Correct new last bitmap block - It may not be full */ - info = SB_AP_BITMAP(s) + bmap_nr_new - 1; - bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); - if (!bh) { - int jerr = journal_end(&th); - if (jerr) - return jerr; - return -EIO; - } - - reiserfs_prepare_for_journal(s, bh, 1); - for (i = block_r_new; i < s->s_blocksize * 8; i++) - reiserfs_set_le_bit(i, bh->b_data); - journal_mark_dirty(&th, bh); - brelse(bh); - - info->free_count -= s->s_blocksize * 8 - block_r_new; - /* update super */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - free_blocks = SB_FREE_BLOCKS(s); - PUT_SB_FREE_BLOCKS(s, - free_blocks + (block_count_new - block_count - - (bmap_nr_new - bmap_nr))); - PUT_SB_BLOCK_COUNT(s, block_count_new); - PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); - - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - - SB_JOURNAL(s)->j_must_wait = 1; - return journal_end(&th); -} diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c deleted file mode 100644 index 5faf702f8d15..000000000000 --- a/fs/reiserfs/stree.c +++ /dev/null @@ -1,2280 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - */ - -/* - * Written by Anatoly P. Pinchuk pap@namesys.botik.ru - * Programm System Institute - * Pereslavl-Zalessky Russia - */ - -#include <linux/time.h> -#include <linux/string.h> -#include <linux/pagemap.h> -#include <linux/bio.h> -#include "reiserfs.h" -#include <linux/buffer_head.h> -#include <linux/quotaops.h> - -/* Does the buffer contain a disk block which is in the tree. */ -inline int B_IS_IN_TREE(const struct buffer_head *bh) -{ - - RFALSE(B_LEVEL(bh) > MAX_HEIGHT, - "PAP-1010: block (%b) has too big level (%z)", bh, bh); - - return (B_LEVEL(bh) != FREE_LEVEL); -} - -/* to get item head in le form */ -inline void copy_item_head(struct item_head *to, - const struct item_head *from) -{ - memcpy(to, from, IH_SIZE); -} - -/* - * k1 is pointer to on-disk structure which is stored in little-endian - * form. k2 is pointer to cpu variable. For key of items of the same - * object this returns 0. - * Returns: -1 if key1 < key2 - * 0 if key1 == key2 - * 1 if key1 > key2 - */ -inline int comp_short_keys(const struct reiserfs_key *le_key, - const struct cpu_key *cpu_key) -{ - __u32 n; - n = le32_to_cpu(le_key->k_dir_id); - if (n < cpu_key->on_disk_key.k_dir_id) - return -1; - if (n > cpu_key->on_disk_key.k_dir_id) - return 1; - n = le32_to_cpu(le_key->k_objectid); - if (n < cpu_key->on_disk_key.k_objectid) - return -1; - if (n > cpu_key->on_disk_key.k_objectid) - return 1; - return 0; -} - -/* - * k1 is pointer to on-disk structure which is stored in little-endian - * form. k2 is pointer to cpu variable. - * Compare keys using all 4 key fields. 
- * Returns: -1 if key1 < key2 0 - * if key1 = key2 1 if key1 > key2 - */ -static inline int comp_keys(const struct reiserfs_key *le_key, - const struct cpu_key *cpu_key) -{ - int retval; - - retval = comp_short_keys(le_key, cpu_key); - if (retval) - return retval; - if (le_key_k_offset(le_key_version(le_key), le_key) < - cpu_key_k_offset(cpu_key)) - return -1; - if (le_key_k_offset(le_key_version(le_key), le_key) > - cpu_key_k_offset(cpu_key)) - return 1; - - if (cpu_key->key_length == 3) - return 0; - - /* this part is needed only when tail conversion is in progress */ - if (le_key_k_type(le_key_version(le_key), le_key) < - cpu_key_k_type(cpu_key)) - return -1; - - if (le_key_k_type(le_key_version(le_key), le_key) > - cpu_key_k_type(cpu_key)) - return 1; - - return 0; -} - -inline int comp_short_le_keys(const struct reiserfs_key *key1, - const struct reiserfs_key *key2) -{ - __u32 *k1_u32, *k2_u32; - int key_length = REISERFS_SHORT_KEY_LEN; - - k1_u32 = (__u32 *) key1; - k2_u32 = (__u32 *) key2; - for (; key_length--; ++k1_u32, ++k2_u32) { - if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32)) - return -1; - if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32)) - return 1; - } - return 0; -} - -inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) -{ - int version; - to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); - to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); - - /* find out version of the key */ - version = le_key_version(from); - to->version = version; - to->on_disk_key.k_offset = le_key_k_offset(version, from); - to->on_disk_key.k_type = le_key_k_type(version, from); -} - -/* - * this does not say which one is bigger, it only returns 1 if keys - * are not equal, 0 otherwise - */ -inline int comp_le_keys(const struct reiserfs_key *k1, - const struct reiserfs_key *k2) -{ - return memcmp(k1, k2, sizeof(struct reiserfs_key)); -} - -/************************************************************************** - * Binary search toolkit function * - * Search for an item in the array by the item key * - * Returns: 1 if found, 0 if not found; * - * *pos = number of the searched element if found, else the * - * number of the first element that is larger than key. * - **************************************************************************/ -/* - * For those not familiar with binary search: lbound is the leftmost item - * that it could be, rbound the rightmost item that it could be. We examine - * the item halfway between lbound and rbound, and that tells us either - * that we can increase lbound, or decrease rbound, or that we have found it, - * or if lbound <= rbound that there are no possible items, and we have not - * found it. With each examination we cut the number of possible items it - * could be by one more than half rounded down, or we find it. - */ -static inline int bin_search(const void *key, /* Key to search for. */ - const void *base, /* First item in the array. */ - int num, /* Number of items in the array. */ - /* - * Item size in the array. searched. Lest the - * reader be confused, note that this is crafted - * as a general function, and when it is applied - * specifically to the array of item headers in a - * node, width is actually the item header size - * not the item size. - */ - int width, - int *pos /* Number of the searched for element. 
*/ - ) -{ - int rbound, lbound, j; - - for (j = ((rbound = num - 1) + (lbound = 0)) / 2; - lbound <= rbound; j = (rbound + lbound) / 2) - switch (comp_keys - ((struct reiserfs_key *)((char *)base + j * width), - (struct cpu_key *)key)) { - case -1: - lbound = j + 1; - continue; - case 1: - rbound = j - 1; - continue; - case 0: - *pos = j; - return ITEM_FOUND; /* Key found in the array. */ - } - - /* - * bin_search did not find given key, it returns position of key, - * that is minimal and greater than the given one. - */ - *pos = lbound; - return ITEM_NOT_FOUND; -} - - -/* Minimal possible key. It is never in the tree. */ -const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; - -/* Maximal possible key. It is never in the tree. */ -static const struct reiserfs_key MAX_KEY = { - cpu_to_le32(0xffffffff), - cpu_to_le32(0xffffffff), - {{cpu_to_le32(0xffffffff), - cpu_to_le32(0xffffffff)},} -}; - -/* - * Get delimiting key of the buffer by looking for it in the buffers in the - * path, starting from the bottom of the path, and going upwards. We must - * check the path's validity at each step. If the key is not in the path, - * there is no delimiting key in the tree (buffer is first or last buffer - * in tree), and in this case we return a special key, either MIN_KEY or - * MAX_KEY. - */ -static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, - const struct super_block *sb) -{ - int position, path_offset = chk_path->path_length; - struct buffer_head *parent; - - RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, - "PAP-5010: invalid offset in the path"); - - /* While not higher in path than first element. */ - while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { - - RFALSE(!buffer_uptodate - (PATH_OFFSET_PBUFFER(chk_path, path_offset)), - "PAP-5020: parent is not uptodate"); - - /* Parent at the path is not in the tree now. */ - if (!B_IS_IN_TREE - (parent = - PATH_OFFSET_PBUFFER(chk_path, path_offset))) - return &MAX_KEY; - /* Check whether position in the parent is correct. */ - if ((position = - PATH_OFFSET_POSITION(chk_path, - path_offset)) > - B_NR_ITEMS(parent)) - return &MAX_KEY; - /* Check whether parent at the path really points to the child. */ - if (B_N_CHILD_NUM(parent, position) != - PATH_OFFSET_PBUFFER(chk_path, - path_offset + 1)->b_blocknr) - return &MAX_KEY; - /* - * Return delimiting key if position in the parent - * is not equal to zero. - */ - if (position) - return internal_key(parent, position - 1); - } - /* Return MIN_KEY if we are in the root of the buffer tree. */ - if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(sb)) - return &MIN_KEY; - return &MAX_KEY; -} - -/* Get delimiting key of the buffer at the path and its right neighbor. */ -inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, - const struct super_block *sb) -{ - int position, path_offset = chk_path->path_length; - struct buffer_head *parent; - - RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET, - "PAP-5030: invalid offset in the path"); - - while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { - - RFALSE(!buffer_uptodate - (PATH_OFFSET_PBUFFER(chk_path, path_offset)), - "PAP-5040: parent is not uptodate"); - - /* Parent at the path is not in the tree now. */ - if (!B_IS_IN_TREE - (parent = - PATH_OFFSET_PBUFFER(chk_path, path_offset))) - return &MIN_KEY; - /* Check whether position in the parent is correct. 
*/ - if ((position = - PATH_OFFSET_POSITION(chk_path, - path_offset)) > - B_NR_ITEMS(parent)) - return &MIN_KEY; - /* - * Check whether parent at the path really points - * to the child. - */ - if (B_N_CHILD_NUM(parent, position) != - PATH_OFFSET_PBUFFER(chk_path, - path_offset + 1)->b_blocknr) - return &MIN_KEY; - - /* - * Return delimiting key if position in the parent - * is not the last one. - */ - if (position != B_NR_ITEMS(parent)) - return internal_key(parent, position); - } - - /* Return MAX_KEY if we are in the root of the buffer tree. */ - if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> - b_blocknr == SB_ROOT_BLOCK(sb)) - return &MAX_KEY; - return &MIN_KEY; -} - -/* - * Check whether a key is contained in the tree rooted from a buffer at a path. - * This works by looking at the left and right delimiting keys for the buffer - * in the last path_element in the path. These delimiting keys are stored - * at least one level above that buffer in the tree. If the buffer is the - * first or last node in the tree order then one of the delimiting keys may - * be absent, and in this case get_lkey and get_rkey return a special key - * which is MIN_KEY or MAX_KEY. - */ -static inline int key_in_buffer( - /* Path which should be checked. */ - struct treepath *chk_path, - /* Key which should be checked. */ - const struct cpu_key *key, - struct super_block *sb - ) -{ - - RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET - || chk_path->path_length > MAX_HEIGHT, - "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", - key, chk_path->path_length); - RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev, - "PAP-5060: device must not be NODEV"); - - if (comp_keys(get_lkey(chk_path, sb), key) == 1) - /* left delimiting key is bigger, that the key we look for */ - return 0; - /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */ - if (comp_keys(get_rkey(chk_path, sb), key) != 1) - /* key must be less than right delimitiing key */ - return 0; - return 1; -} - -int reiserfs_check_path(struct treepath *p) -{ - RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, - "path not properly relsed"); - return 0; -} - -/* - * Drop the reference to each buffer in a path and restore - * dirty bits clean when preparing the buffer for the log. 
- * This version should only be called from fix_nodes() - */ -void pathrelse_and_restore(struct super_block *sb, - struct treepath *search_path) -{ - int path_offset = search_path->path_length; - - RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, - "clm-4000: invalid path offset"); - - while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { - struct buffer_head *bh; - bh = PATH_OFFSET_PBUFFER(search_path, path_offset--); - reiserfs_restore_prepared_buffer(sb, bh); - brelse(bh); - } - search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; -} - -/* Drop the reference to each buffer in a path */ -void pathrelse(struct treepath *search_path) -{ - int path_offset = search_path->path_length; - - RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, - "PAP-5090: invalid path offset"); - - while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) - brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--)); - - search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; -} - -static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih) -{ - struct reiserfs_de_head *deh; - int i; - - deh = B_I_DEH(bh, ih); - for (i = 0; i < ih_entry_count(ih); i++) { - if (deh_location(&deh[i]) > ih_item_len(ih)) { - reiserfs_warning(NULL, "reiserfs-5094", - "directory entry location seems wrong %h", - &deh[i]); - return 0; - } - } - - return 1; -} - -static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) -{ - struct block_head *blkh; - struct item_head *ih; - int used_space; - int prev_location; - int i; - int nr; - - blkh = (struct block_head *)buf; - if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { - reiserfs_warning(NULL, "reiserfs-5080", - "this should be caught earlier"); - return 0; - } - - nr = blkh_nr_item(blkh); - if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { - /* item number is too big or too small */ - reiserfs_warning(NULL, "reiserfs-5081", - "nr_item seems wrong: %z", bh); - return 0; - } - ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; - used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); - - /* free space does not match to calculated amount of use space */ - if (used_space != blocksize - blkh_free_space(blkh)) { - reiserfs_warning(NULL, "reiserfs-5082", - "free space seems wrong: %z", bh); - return 0; - } - /* - * FIXME: it is_leaf will hit performance too much - we may have - * return 1 here - */ - - /* check tables of item heads */ - ih = (struct item_head *)(buf + BLKH_SIZE); - prev_location = blocksize; - for (i = 0; i < nr; i++, ih++) { - if (le_ih_k_type(ih) == TYPE_ANY) { - reiserfs_warning(NULL, "reiserfs-5083", - "wrong item type for item %h", - ih); - return 0; - } - if (ih_location(ih) >= blocksize - || ih_location(ih) < IH_SIZE * nr) { - reiserfs_warning(NULL, "reiserfs-5084", - "item location seems wrong: %h", - ih); - return 0; - } - if (ih_item_len(ih) < 1 - || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { - reiserfs_warning(NULL, "reiserfs-5085", - "item length seems wrong: %h", - ih); - return 0; - } - if (prev_location - ih_location(ih) != ih_item_len(ih)) { - reiserfs_warning(NULL, "reiserfs-5086", - "item location seems wrong " - "(second one): %h", ih); - return 0; - } - if (is_direntry_le_ih(ih)) { - if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) { - reiserfs_warning(NULL, "reiserfs-5093", - "item entry count seems wrong %h", - ih); - return 0; - } - return has_valid_deh_location(bh, ih); - } - prev_location = ih_location(ih); - } - - /* one may imagine many more checks */ - return 1; -} - -/* returns 1 if 
buf looks like an internal node, 0 otherwise */ -static int is_internal(char *buf, int blocksize, struct buffer_head *bh) -{ - struct block_head *blkh; - int nr; - int used_space; - - blkh = (struct block_head *)buf; - nr = blkh_level(blkh); - if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { - /* this level is not possible for internal nodes */ - reiserfs_warning(NULL, "reiserfs-5087", - "this should be caught earlier"); - return 0; - } - - nr = blkh_nr_item(blkh); - /* for internal which is not root we might check min number of keys */ - if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { - reiserfs_warning(NULL, "reiserfs-5088", - "number of key seems wrong: %z", bh); - return 0; - } - - used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); - if (used_space != blocksize - blkh_free_space(blkh)) { - reiserfs_warning(NULL, "reiserfs-5089", - "free space seems wrong: %z", bh); - return 0; - } - - /* one may imagine many more checks */ - return 1; -} - -/* - * make sure that bh contains formatted node of reiserfs tree of - * 'level'-th level - */ -static int is_tree_node(struct buffer_head *bh, int level) -{ - if (B_LEVEL(bh) != level) { - reiserfs_warning(NULL, "reiserfs-5090", "node level %d does " - "not match to the expected one %d", - B_LEVEL(bh), level); - return 0; - } - if (level == DISK_LEAF_NODE_LEVEL) - return is_leaf(bh->b_data, bh->b_size, bh); - - return is_internal(bh->b_data, bh->b_size, bh); -} - -#define SEARCH_BY_KEY_READA 16 - -/* - * The function is NOT SCHEDULE-SAFE! - * It might unlock the write lock if we needed to wait for a block - * to be read. Note that in this case it won't recover the lock to avoid - * high contention resulting from too much lock requests, especially - * the caller (search_by_key) will perform other schedule-unsafe - * operations just after calling this function. - * - * @return depth of lock to be restored after read completes - */ -static int search_by_key_reada(struct super_block *s, - struct buffer_head **bh, - b_blocknr_t *b, int num) -{ - int i, j; - int depth = -1; - - for (i = 0; i < num; i++) { - bh[i] = sb_getblk(s, b[i]); - } - /* - * We are going to read some blocks on which we - * have a reference. It's safe, though we might be - * reading blocks concurrently changed if we release - * the lock. But it's still fine because we check later - * if the tree changed - */ - for (j = 0; j < i; j++) { - /* - * note, this needs attention if we are getting rid of the BKL - * you have to make sure the prepared bit isn't set on this - * buffer - */ - if (!buffer_uptodate(bh[j])) { - if (depth == -1) - depth = reiserfs_write_unlock_nested(s); - bh_readahead(bh[j], REQ_RAHEAD); - } - brelse(bh[j]); - } - return depth; -} - -/* - * This function fills up the path from the root to the leaf as it - * descends the tree looking for the key. It uses reiserfs_bread to - * try to find buffers in the cache given their block number. If it - * does not find them in the cache it reads them from disk. For each - * node search_by_key finds using reiserfs_bread it then uses - * bin_search to look through that node. bin_search will find the - * position of the block_number of the next node if it is looking - * through an internal node. If it is looking through a leaf node - * bin_search will find the position of the item which has key either - * equal to given key, or which is the maximal key less than the given - * key. 
search_by_key returns a path that must be checked for the - * correctness of the top of the path but need not be checked for the - * correctness of the bottom of the path - */ -/* - * search_by_key - search for key (and item) in stree - * @sb: superblock - * @key: pointer to key to search for - * @search_path: Allocated and initialized struct treepath; Returned filled - * on success. - * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to - * stop at leaf level. - * - * The function is NOT SCHEDULE-SAFE! - */ -int search_by_key(struct super_block *sb, const struct cpu_key *key, - struct treepath *search_path, int stop_level) -{ - b_blocknr_t block_number; - int expected_level; - struct buffer_head *bh; - struct path_element *last_element; - int node_level, retval; - int fs_gen; - struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; - b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; - int reada_count = 0; - -#ifdef CONFIG_REISERFS_CHECK - int repeat_counter = 0; -#endif - - PROC_INFO_INC(sb, search_by_key); - - /* - * As we add each node to a path we increase its count. This means - * that we must be careful to release all nodes in a path before we - * either discard the path struct or re-use the path struct, as we - * do here. - */ - - pathrelse(search_path); - - /* - * With each iteration of this loop we search through the items in the - * current node, and calculate the next current node(next path element) - * for the next iteration of this loop.. - */ - block_number = SB_ROOT_BLOCK(sb); - expected_level = -1; - while (1) { - -#ifdef CONFIG_REISERFS_CHECK - if (!(++repeat_counter % 50000)) - reiserfs_warning(sb, "PAP-5100", - "%s: there were %d iterations of " - "while loop looking for key %K", - current->comm, repeat_counter, - key); -#endif - - /* prep path to have another element added to it. */ - last_element = - PATH_OFFSET_PELEMENT(search_path, - ++search_path->path_length); - fs_gen = get_generation(sb); - - /* - * Read the next tree node, and set the last element - * in the path to have a pointer to it. - */ - if ((bh = last_element->pe_buffer = - sb_getblk(sb, block_number))) { - - /* - * We'll need to drop the lock if we encounter any - * buffers that need to be read. If all of them are - * already up to date, we don't need to drop the lock. - */ - int depth = -1; - - if (!buffer_uptodate(bh) && reada_count > 1) - depth = search_by_key_reada(sb, reada_bh, - reada_blocks, reada_count); - - if (!buffer_uptodate(bh) && depth == -1) - depth = reiserfs_write_unlock_nested(sb); - - bh_read_nowait(bh, 0); - wait_on_buffer(bh); - - if (depth != -1) - reiserfs_write_lock_nested(sb, depth); - if (!buffer_uptodate(bh)) - goto io_error; - } else { -io_error: - search_path->path_length--; - pathrelse(search_path); - return IO_ERROR; - } - reada_count = 0; - if (expected_level == -1) - expected_level = SB_TREE_HEIGHT(sb); - expected_level--; - - /* - * It is possible that schedule occurred. We must check - * whether the key to search is still in the tree rooted - * from the current buffer. If not then repeat search - * from the root. - */ - if (fs_changed(fs_gen, sb) && - (!B_IS_IN_TREE(bh) || - B_LEVEL(bh) != expected_level || - !key_in_buffer(search_path, key, sb))) { - PROC_INFO_INC(sb, search_by_key_fs_changed); - PROC_INFO_INC(sb, search_by_key_restarted); - PROC_INFO_INC(sb, - sbk_restarted[expected_level - 1]); - pathrelse(search_path); - - /* - * Get the root block number so that we can - * repeat the search starting from the root. 
- */ - block_number = SB_ROOT_BLOCK(sb); - expected_level = -1; - - /* repeat search from the root */ - continue; - } - - /* - * only check that the key is in the buffer if key is not - * equal to the MAX_KEY. Latter case is only possible in - * "finish_unfinished()" processing during mount. - */ - RFALSE(comp_keys(&MAX_KEY, key) && - !key_in_buffer(search_path, key, sb), - "PAP-5130: key is not in the buffer"); -#ifdef CONFIG_REISERFS_CHECK - if (REISERFS_SB(sb)->cur_tb) { - print_cur_tb("5140"); - reiserfs_panic(sb, "PAP-5140", - "schedule occurred in do_balance!"); - } -#endif - - /* - * make sure, that the node contents look like a node of - * certain level - */ - if (!is_tree_node(bh, expected_level)) { - reiserfs_error(sb, "vs-5150", - "invalid format found in block %ld. " - "Fsck?", bh->b_blocknr); - pathrelse(search_path); - return IO_ERROR; - } - - /* ok, we have acquired next formatted node in the tree */ - node_level = B_LEVEL(bh); - - PROC_INFO_BH_STAT(sb, bh, node_level - 1); - - RFALSE(node_level < stop_level, - "vs-5152: tree level (%d) is less than stop level (%d)", - node_level, stop_level); - - retval = bin_search(key, item_head(bh, 0), - B_NR_ITEMS(bh), - (node_level == - DISK_LEAF_NODE_LEVEL) ? IH_SIZE : - KEY_SIZE, - &last_element->pe_position); - if (node_level == stop_level) { - return retval; - } - - /* we are not in the stop level */ - /* - * item has been found, so we choose the pointer which - * is to the right of the found one - */ - if (retval == ITEM_FOUND) - last_element->pe_position++; - - /* - * if item was not found we choose the position which is to - * the left of the found item. This requires no code, - * bin_search did it already. - */ - - /* - * So we have chosen a position in the current node which is - * an internal node. Now we calculate child block number by - * position in the node. - */ - block_number = - B_N_CHILD_NUM(bh, last_element->pe_position); - - /* - * if we are going to read leaf nodes, try for read - * ahead as well - */ - if ((search_path->reada & PATH_READA) && - node_level == DISK_LEAF_NODE_LEVEL + 1) { - int pos = last_element->pe_position; - int limit = B_NR_ITEMS(bh); - struct reiserfs_key *le_key; - - if (search_path->reada & PATH_READA_BACK) - limit = 0; - while (reada_count < SEARCH_BY_KEY_READA) { - if (pos == limit) - break; - reada_blocks[reada_count++] = - B_N_CHILD_NUM(bh, pos); - if (search_path->reada & PATH_READA_BACK) - pos--; - else - pos++; - - /* - * check to make sure we're in the same object - */ - le_key = internal_key(bh, pos); - if (le32_to_cpu(le_key->k_objectid) != - key->on_disk_key.k_objectid) { - break; - } - } - } - } -} - -/* - * Form the path to an item and position in this item which contains - * file byte defined by key. If there is no such item - * corresponding to the key, we point the path to the item with - * maximal key less than key, and *pos_in_item is set to one - * past the last entry/byte in the item. If searching for entry in a - * directory item, and it is not found, *pos_in_item is set to one - * entry more than the entry with maximal key which is less than the - * sought key. - * - * Note that if there is no entry in this same node which is one more, - * then we point to an imaginary entry. for direct items, the - * position is in units of bytes, for indirect items the position is - * in units of blocknr entries, for directory items the position is in - * units of directory entries. - */ -/* The function is NOT SCHEDULE-SAFE! 
*/ -int search_for_position_by_key(struct super_block *sb, - /* Key to search (cpu variable) */ - const struct cpu_key *p_cpu_key, - /* Filled up by this function. */ - struct treepath *search_path) -{ - struct item_head *p_le_ih; /* pointer to on-disk structure */ - int blk_size; - loff_t item_offset, offset; - struct reiserfs_dir_entry de; - int retval; - - /* If searching for directory entry. */ - if (is_direntry_cpu_key(p_cpu_key)) - return search_by_entry_key(sb, p_cpu_key, search_path, - &de); - - /* If not searching for directory entry. */ - - /* If item is found. */ - retval = search_item(sb, p_cpu_key, search_path); - if (retval == IO_ERROR) - return retval; - if (retval == ITEM_FOUND) { - - RFALSE(!ih_item_len - (item_head - (PATH_PLAST_BUFFER(search_path), - PATH_LAST_POSITION(search_path))), - "PAP-5165: item length equals zero"); - - pos_in_item(search_path) = 0; - return POSITION_FOUND; - } - - RFALSE(!PATH_LAST_POSITION(search_path), - "PAP-5170: position equals zero"); - - /* Item is not found. Set path to the previous item. */ - p_le_ih = - item_head(PATH_PLAST_BUFFER(search_path), - --PATH_LAST_POSITION(search_path)); - blk_size = sb->s_blocksize; - - if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key)) - return FILE_NOT_FOUND; - - /* FIXME: quite ugly this far */ - - item_offset = le_ih_k_offset(p_le_ih); - offset = cpu_key_k_offset(p_cpu_key); - - /* Needed byte is contained in the item pointed to by the path. */ - if (item_offset <= offset && - item_offset + op_bytes_number(p_le_ih, blk_size) > offset) { - pos_in_item(search_path) = offset - item_offset; - if (is_indirect_le_ih(p_le_ih)) { - pos_in_item(search_path) /= blk_size; - } - return POSITION_FOUND; - } - - /* - * Needed byte is not contained in the item pointed to by the - * path. Set pos_in_item out of the item. - */ - if (is_indirect_le_ih(p_le_ih)) - pos_in_item(search_path) = - ih_item_len(p_le_ih) / UNFM_P_SIZE; - else - pos_in_item(search_path) = ih_item_len(p_le_ih); - - return POSITION_NOT_FOUND; -} - -/* Compare given item and item pointed to by the path. */ -int comp_items(const struct item_head *stored_ih, const struct treepath *path) -{ - struct buffer_head *bh = PATH_PLAST_BUFFER(path); - struct item_head *ih; - - /* Last buffer at the path is not in the tree. */ - if (!B_IS_IN_TREE(bh)) - return 1; - - /* Last path position is invalid. */ - if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh)) - return 1; - - /* we need only to know, whether it is the same item */ - ih = tp_item_head(path); - return memcmp(stored_ih, ih, IH_SIZE); -} - -/* prepare for delete or cut of direct item */ -static inline int prepare_for_direct_item(struct treepath *path, - struct item_head *le_ih, - struct inode *inode, - loff_t new_file_length, int *cut_size) -{ - loff_t round_len; - - if (new_file_length == max_reiserfs_offset(inode)) { - /* item has to be deleted */ - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; - } - /* new file gets truncated */ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { - round_len = ROUND_UP(new_file_length); - /* this was new_file_length < le_ih ... */ - if (round_len < le_ih_k_offset(le_ih)) { - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete this item. */ - } - /* Calculate first position and size for cutting from item. */ - pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); - *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); - - return M_CUT; /* Cut from this item. 
*/ - } - - /* old file: items may have any length */ - - if (new_file_length < le_ih_k_offset(le_ih)) { - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete this item. */ - } - - /* Calculate first position and size for cutting from item. */ - *cut_size = -(ih_item_len(le_ih) - - (pos_in_item(path) = - new_file_length + 1 - le_ih_k_offset(le_ih))); - return M_CUT; /* Cut from this item. */ -} - -static inline int prepare_for_direntry_item(struct treepath *path, - struct item_head *le_ih, - struct inode *inode, - loff_t new_file_length, - int *cut_size) -{ - if (le_ih_k_offset(le_ih) == DOT_OFFSET && - new_file_length == max_reiserfs_offset(inode)) { - RFALSE(ih_entry_count(le_ih) != 2, - "PAP-5220: incorrect empty directory item (%h)", le_ih); - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - /* Delete the directory item containing "." and ".." entry. */ - return M_DELETE; - } - - if (ih_entry_count(le_ih) == 1) { - /* - * Delete the directory item such as there is one record only - * in this item - */ - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; - } - - /* Cut one record from the directory item. */ - *cut_size = - -(DEH_SIZE + - entry_length(get_last_bh(path), le_ih, pos_in_item(path))); - return M_CUT; -} - -#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) - -/* - * If the path points to a directory or direct item, calculate mode - * and the size cut, for balance. - * If the path points to an indirect item, remove some number of its - * unformatted nodes. - * In case of file truncate calculate whether this item must be - * deleted/truncated or last unformatted node of this item will be - * converted to a direct item. - * This function returns a determination of what balance mode the - * calling function should employ. - */ -static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct treepath *path, - const struct cpu_key *item_key, - /* - * Number of unformatted nodes - * which were removed from end - * of the file. - */ - int *removed, - int *cut_size, - /* MAX_KEY_OFFSET in case of delete. */ - unsigned long long new_file_length - ) -{ - struct super_block *sb = inode->i_sb; - struct item_head *p_le_ih = tp_item_head(path); - struct buffer_head *bh = PATH_PLAST_BUFFER(path); - - BUG_ON(!th->t_trans_id); - - /* Stat_data item. */ - if (is_statdata_le_ih(p_le_ih)) { - - RFALSE(new_file_length != max_reiserfs_offset(inode), - "PAP-5210: mode must be M_DELETE"); - - *cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); - return M_DELETE; - } - - /* Directory item. */ - if (is_direntry_le_ih(p_le_ih)) - return prepare_for_direntry_item(path, p_le_ih, inode, - new_file_length, - cut_size); - - /* Direct item. */ - if (is_direct_le_ih(p_le_ih)) - return prepare_for_direct_item(path, p_le_ih, inode, - new_file_length, cut_size); - - /* Case of an indirect item. 
*/ - { - int blk_size = sb->s_blocksize; - struct item_head s_ih; - int need_re_search; - int delete = 0; - int result = M_CUT; - int pos = 0; - - if ( new_file_length == max_reiserfs_offset (inode) ) { - /* - * prepare_for_delete_or_cut() is called by - * reiserfs_delete_item() - */ - new_file_length = 0; - delete = 1; - } - - do { - need_re_search = 0; - *cut_size = 0; - bh = PATH_PLAST_BUFFER(path); - copy_item_head(&s_ih, tp_item_head(path)); - pos = I_UNFM_NUM(&s_ih); - - while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { - __le32 *unfm; - __u32 block; - - /* - * Each unformatted block deletion may involve - * one additional bitmap block into the transaction, - * thereby the initial journal space reservation - * might not be enough. - */ - if (!delete && (*cut_size) != 0 && - reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) - break; - - unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1; - block = get_block_num(unfm, 0); - - if (block != 0) { - reiserfs_prepare_for_journal(sb, bh, 1); - put_block_num(unfm, 0, 0); - journal_mark_dirty(th, bh); - reiserfs_free_block(th, inode, block, 1); - } - - reiserfs_cond_resched(sb); - - if (item_moved (&s_ih, path)) { - need_re_search = 1; - break; - } - - pos --; - (*removed)++; - (*cut_size) -= UNFM_P_SIZE; - - if (pos == 0) { - (*cut_size) -= IH_SIZE; - result = M_DELETE; - break; - } - } - /* - * a trick. If the buffer has been logged, this will - * do nothing. If we've broken the loop without logging - * it, it will restore the buffer - */ - reiserfs_restore_prepared_buffer(sb, bh); - } while (need_re_search && - search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); - pos_in_item(path) = pos * UNFM_P_SIZE; - - if (*cut_size == 0) { - /* - * Nothing was cut. maybe convert last unformatted node to the - * direct item? - */ - result = M_CONVERT; - } - return result; - } -} - -/* Calculate number of bytes which will be deleted or cut during balance */ -static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) -{ - int del_size; - struct item_head *p_le_ih = tp_item_head(tb->tb_path); - - if (is_statdata_le_ih(p_le_ih)) - return 0; - - del_size = - (mode == - M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; - if (is_direntry_le_ih(p_le_ih)) { - /* - * return EMPTY_DIR_SIZE; We delete emty directories only. - * we can't use EMPTY_DIR_SIZE, as old format dirs have a - * different empty size. ick. FIXME, is this right? 
- */ - return del_size; - } - - if (is_indirect_le_ih(p_le_ih)) - del_size = (del_size / UNFM_P_SIZE) * - (PATH_PLAST_BUFFER(tb->tb_path)->b_size); - return del_size; -} - -static void init_tb_struct(struct reiserfs_transaction_handle *th, - struct tree_balance *tb, - struct super_block *sb, - struct treepath *path, int size) -{ - - BUG_ON(!th->t_trans_id); - - memset(tb, '\0', sizeof(struct tree_balance)); - tb->transaction_handle = th; - tb->tb_sb = sb; - tb->tb_path = path; - PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; - PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; - tb->insert_size[0] = size; -} - -void padd_item(char *item, int total_length, int length) -{ - int i; - - for (i = total_length; i > length;) - item[--i] = 0; -} - -#ifdef REISERQUOTA_DEBUG -char key2type(struct reiserfs_key *ih) -{ - if (is_direntry_le_key(2, ih)) - return 'd'; - if (is_direct_le_key(2, ih)) - return 'D'; - if (is_indirect_le_key(2, ih)) - return 'i'; - if (is_statdata_le_key(2, ih)) - return 's'; - return 'u'; -} - -char head2type(struct item_head *ih) -{ - if (is_direntry_le_ih(ih)) - return 'd'; - if (is_direct_le_ih(ih)) - return 'D'; - if (is_indirect_le_ih(ih)) - return 'i'; - if (is_statdata_le_ih(ih)) - return 's'; - return 'u'; -} -#endif - -/* - * Delete object item. - * th - active transaction handle - * path - path to the deleted item - * item_key - key to search for the deleted item - * indode - used for updating i_blocks and quotas - * un_bh - NULL or unformatted node pointer - */ -int reiserfs_delete_item(struct reiserfs_transaction_handle *th, - struct treepath *path, const struct cpu_key *item_key, - struct inode *inode, struct buffer_head *un_bh) -{ - struct super_block *sb = inode->i_sb; - struct tree_balance s_del_balance; - struct item_head s_ih; - struct item_head *q_ih; - int quota_cut_bytes; - int ret_value, del_size, removed; - int depth; - -#ifdef CONFIG_REISERFS_CHECK - char mode; -#endif - - BUG_ON(!th->t_trans_id); - - init_tb_struct(th, &s_del_balance, sb, path, - 0 /*size is unknown */ ); - - while (1) { - removed = 0; - -#ifdef CONFIG_REISERFS_CHECK - mode = -#endif - prepare_for_delete_or_cut(th, inode, path, - item_key, &removed, - &del_size, - max_reiserfs_offset(inode)); - - RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); - - copy_item_head(&s_ih, tp_item_head(path)); - s_del_balance.insert_size[0] = del_size; - - ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); - if (ret_value != REPEAT_SEARCH) - break; - - PROC_INFO_INC(sb, delete_item_restarted); - - /* file system changed, repeat search */ - ret_value = - search_for_position_by_key(sb, item_key, path); - if (ret_value == IO_ERROR) - break; - if (ret_value == FILE_NOT_FOUND) { - reiserfs_warning(sb, "vs-5340", - "no items of the file %K found", - item_key); - break; - } - } /* while (1) */ - - if (ret_value != CARRY_ON) { - unfix_nodes(&s_del_balance); - return 0; - } - - /* reiserfs_delete_item returns item length when success */ - ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); - q_ih = tp_item_head(path); - quota_cut_bytes = ih_item_len(q_ih); - - /* - * hack so the quota code doesn't have to guess if the file has a - * tail. On tail insert, we allocate quota for 1 unformatted node. 
- * We test the offset because the tail might have been - * split into multiple items, and we only want to decrement for - * the unfm node once - */ - if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { - if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { - quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; - } else { - quota_cut_bytes = 0; - } - } - - if (un_bh) { - int off; - char *data; - - /* - * We are in direct2indirect conversion, so move tail contents - * to the unformatted node - */ - /* - * note, we do the copy before preparing the buffer because we - * don't care about the contents of the unformatted node yet. - * the only thing we really care about is the direct item's - * data is in the unformatted node. - * - * Otherwise, we would have to call - * reiserfs_prepare_for_journal on the unformatted node, - * which might schedule, meaning we'd have to loop all the - * way back up to the start of the while loop. - * - * The unformatted node must be dirtied later on. We can't be - * sure here if the entire tail has been deleted yet. - * - * un_bh is from the page cache (all unformatted nodes are - * from the page cache) and might be a highmem page. So, we - * can't use un_bh->b_data. - * -clm - */ - - data = kmap_atomic(un_bh->b_page); - off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1)); - memcpy(data + off, - ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), - ret_value); - kunmap_atomic(data); - } - - /* Perform balancing after all resources have been collected at once. */ - do_balance(&s_del_balance, NULL, NULL, M_DELETE); - -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(sb, REISERFS_DEBUG_CODE, - "reiserquota delete_item(): freeing %u, id=%u type=%c", - quota_cut_bytes, inode->i_uid, head2type(&s_ih)); -#endif - depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_free_space_nodirty(inode, quota_cut_bytes); - reiserfs_write_lock_nested(inode->i_sb, depth); - - /* Return deleted body length */ - return ret_value; -} - -/* - * Summary Of Mechanisms For Handling Collisions Between Processes: - * - * deletion of the body of the object is performed by iput(), with the - * result that if multiple processes are operating on a file, the - * deletion of the body of the file is deferred until the last process - * that has an open inode performs its iput(). - * - * writes and truncates are protected from collisions by use of - * semaphores. - * - * creates, linking, and mknod are protected from collisions with other - * processes by making the reiserfs_add_entry() the last step in the - * creation, and then rolling back all changes if there was a collision. - * - Hans -*/ - -/* this deletes item which never gets split */ -void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, - struct inode *inode, struct reiserfs_key *key) -{ - struct super_block *sb = th->t_super; - struct tree_balance tb; - INITIALIZE_PATH(path); - int item_len = 0; - int tb_init = 0; - struct cpu_key cpu_key = {}; - int retval; - int quota_cut_bytes = 0; - - BUG_ON(!th->t_trans_id); - - le_key2cpu_key(&cpu_key, key); - - while (1) { - retval = search_item(th->t_super, &cpu_key, &path); - if (retval == IO_ERROR) { - reiserfs_error(th->t_super, "vs-5350", - "i/o failure occurred trying " - "to delete %K", &cpu_key); - break; - } - if (retval != ITEM_FOUND) { - pathrelse(&path); - /* - * No need for a warning, if there is just no free - * space to insert '..' item into the - * newly-created subdir - */ - if (! 
- ((unsigned long long) - GET_HASH_VALUE(le_key_k_offset - (le_key_version(key), key)) == 0 - && (unsigned long long) - GET_GENERATION_NUMBER(le_key_k_offset - (le_key_version(key), - key)) == 1)) - reiserfs_warning(th->t_super, "vs-5355", - "%k not found", key); - break; - } - if (!tb_init) { - tb_init = 1; - item_len = ih_item_len(tp_item_head(&path)); - init_tb_struct(th, &tb, th->t_super, &path, - -(IH_SIZE + item_len)); - } - quota_cut_bytes = ih_item_len(tp_item_head(&path)); - - retval = fix_nodes(M_DELETE, &tb, NULL, NULL); - if (retval == REPEAT_SEARCH) { - PROC_INFO_INC(th->t_super, delete_solid_item_restarted); - continue; - } - - if (retval == CARRY_ON) { - do_balance(&tb, NULL, NULL, M_DELETE); - /* - * Should we count quota for item? (we don't - * count quotas for save-links) - */ - if (inode) { - int depth; -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, - "reiserquota delete_solid_item(): freeing %u id=%u type=%c", - quota_cut_bytes, inode->i_uid, - key2type(key)); -#endif - depth = reiserfs_write_unlock_nested(sb); - dquot_free_space_nodirty(inode, - quota_cut_bytes); - reiserfs_write_lock_nested(sb, depth); - } - break; - } - - /* IO_ERROR, NO_DISK_SPACE, etc */ - reiserfs_warning(th->t_super, "vs-5360", - "could not delete %K due to fix_nodes failure", - &cpu_key); - unfix_nodes(&tb); - break; - } - - reiserfs_check_path(&path); -} - -int reiserfs_delete_object(struct reiserfs_transaction_handle *th, - struct inode *inode) -{ - int err; - inode->i_size = 0; - BUG_ON(!th->t_trans_id); - - /* for directory this deletes item containing "." and ".." */ - err = - reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); - if (err) - return err; - -#if defined( USE_INODE_GENERATION_COUNTER ) - if (!old_format_only(th->t_super)) { - __le32 *inode_generation; - - inode_generation = - &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; - le32_add_cpu(inode_generation, 1); - } -/* USE_INODE_GENERATION_COUNTER */ -#endif - reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); - - return err; -} - -static void unmap_buffers(struct page *page, loff_t pos) -{ - struct buffer_head *bh; - struct buffer_head *head; - struct buffer_head *next; - unsigned long tail_index; - unsigned long cur_index; - - if (page) { - if (page_has_buffers(page)) { - tail_index = pos & (PAGE_SIZE - 1); - cur_index = 0; - head = page_buffers(page); - bh = head; - do { - next = bh->b_this_page; - - /* - * we want to unmap the buffers that contain - * the tail, and all the buffers after it - * (since the tail must be at the end of the - * file). We don't want to unmap file data - * before the tail, since it might be dirty - * and waiting to reach disk - */ - cur_index += bh->b_size; - if (cur_index > tail_index) { - reiserfs_unmap_buffer(bh); - } - bh = next; - } while (bh != head); - } - } -} - -static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct page *page, - struct treepath *path, - const struct cpu_key *item_key, - loff_t new_file_size, char *mode) -{ - struct super_block *sb = inode->i_sb; - int block_size = sb->s_blocksize; - int cut_bytes; - BUG_ON(!th->t_trans_id); - BUG_ON(new_file_size != inode->i_size); - - /* - * the page being sent in could be NULL if there was an i/o error - * reading in the last block. 
The user will hit problems trying to - * read the file, but for now we just skip the indirect2direct - */ - if (atomic_read(&inode->i_count) > 1 || - !tail_has_to_be_packed(inode) || - !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { - /* leave tail in an unformatted node */ - *mode = M_SKIP_BALANCING; - cut_bytes = - block_size - (new_file_size & (block_size - 1)); - pathrelse(path); - return cut_bytes; - } - - /* Perform the conversion to a direct_item. */ - return indirect2direct(th, inode, page, path, item_key, - new_file_size, mode); -} - -/* - * We did an indirect_to_direct conversion and inserted the direct item - * successfully, but there was no disk space to cut the unfm pointer - * being converted. Therefore we have to delete the inserted direct - * item(s). - */ -static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, - struct inode *inode, struct treepath *path) -{ - struct cpu_key tail_key; - int tail_len; - int removed; - BUG_ON(!th->t_trans_id); - - make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); - tail_key.key_length = 4; - - tail_len = - (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; - while (tail_len) { - /* look for the last byte of the tail */ - if (search_for_position_by_key(inode->i_sb, &tail_key, path) == - POSITION_NOT_FOUND) - reiserfs_panic(inode->i_sb, "vs-5615", - "found invalid item"); - RFALSE(path->pos_in_item != - ih_item_len(tp_item_head(path)) - 1, - "vs-5616: appended bytes found"); - PATH_LAST_POSITION(path)--; - - removed = - reiserfs_delete_item(th, path, &tail_key, inode, - NULL /*unbh not needed */ ); - RFALSE(removed <= 0 - || removed > tail_len, - "vs-5617: there was tail %d bytes, removed item length %d bytes", - tail_len, removed); - tail_len -= removed; - set_cpu_key_k_offset(&tail_key, - cpu_key_k_offset(&tail_key) - removed); - } - reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " - "conversion has been rolled back due to " - "lack of disk space"); - mark_inode_dirty(inode); -} - -/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ -int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, - struct treepath *path, - struct cpu_key *item_key, - struct inode *inode, - struct page *page, loff_t new_file_size) -{ - struct super_block *sb = inode->i_sb; - /* - * Every function which is going to call do_balance must first - * create a tree_balance structure. Then it must fill up this - * structure by using the init_tb_struct and fix_nodes functions. - * After that we can make tree balancing. - */ - struct tree_balance s_cut_balance; - struct item_head *p_le_ih; - int cut_size = 0; /* Amount to be cut. */ - int ret_value = CARRY_ON; - int removed = 0; /* Number of the removed unformatted nodes. */ - int is_inode_locked = 0; - char mode; /* Mode of the balance. */ - int retval2 = -1; - int quota_cut_bytes; - loff_t tail_pos = 0; - int depth; - - BUG_ON(!th->t_trans_id); - - init_tb_struct(th, &s_cut_balance, inode->i_sb, path, - cut_size); - - /* - * Repeat this loop until we either cut the item without needing - * to balance, or we fix_nodes without schedule occurring - */ - while (1) { - /* - * Determine the balance mode, position of the first byte to - * be cut, and size to be cut. In case of the indirect item - * free unformatted nodes which are pointed to by the cut - * pointers.
- */ - - mode = - prepare_for_delete_or_cut(th, inode, path, - item_key, &removed, - &cut_size, new_file_size); - if (mode == M_CONVERT) { - /* - * convert last unformatted node to direct item or - * leave tail in the unformatted node - */ - RFALSE(ret_value != CARRY_ON, - "PAP-5570: can not convert twice"); - - ret_value = - maybe_indirect_to_direct(th, inode, page, - path, item_key, - new_file_size, &mode); - if (mode == M_SKIP_BALANCING) - /* tail has been left in the unformatted node */ - return ret_value; - - is_inode_locked = 1; - - /* - * removing of last unformatted node will - * change value we have to return to truncate. - * Save it - */ - retval2 = ret_value; - - /* - * So, we have performed the first part of the - * conversion: - * inserting the new direct item. Now we are - * removing the last unformatted node pointer. - * Set key to search for it. - */ - set_cpu_key_k_type(item_key, TYPE_INDIRECT); - item_key->key_length = 4; - new_file_size -= - (new_file_size & (sb->s_blocksize - 1)); - tail_pos = new_file_size; - set_cpu_key_k_offset(item_key, new_file_size + 1); - if (search_for_position_by_key - (sb, item_key, - path) == POSITION_NOT_FOUND) { - print_block(PATH_PLAST_BUFFER(path), 3, - PATH_LAST_POSITION(path) - 1, - PATH_LAST_POSITION(path) + 1); - reiserfs_panic(sb, "PAP-5580", "item to " - "convert does not exist (%K)", - item_key); - } - continue; - } - if (cut_size == 0) { - pathrelse(path); - return 0; - } - - s_cut_balance.insert_size[0] = cut_size; - - ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL); - if (ret_value != REPEAT_SEARCH) - break; - - PROC_INFO_INC(sb, cut_from_item_restarted); - - ret_value = - search_for_position_by_key(sb, item_key, path); - if (ret_value == POSITION_FOUND) - continue; - - reiserfs_warning(sb, "PAP-5610", "item %K not found", - item_key); - unfix_nodes(&s_cut_balance); - return (ret_value == IO_ERROR) ? -EIO : -ENOENT; - } /* while */ - - /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */ - if (ret_value != CARRY_ON) { - if (is_inode_locked) { - /* - * FIXME: this seems to be not needed: we are always - * able to cut item - */ - indirect_to_direct_roll_back(th, inode, path); - } - if (ret_value == NO_DISK_SPACE) - reiserfs_warning(sb, "reiserfs-5092", - "NO_DISK_SPACE"); - unfix_nodes(&s_cut_balance); - return -EIO; - } - - /* go ahead and perform balancing */ - - RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode"); - - /* Calculate number of bytes that need to be cut from the item. */ - quota_cut_bytes = - (mode == - M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance. - insert_size[0]; - if (retval2 == -1) - ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); - else - ret_value = retval2; - - /* - * For direct items, we only change the quota when deleting the last - * item. - */ - p_le_ih = tp_item_head(s_cut_balance.tb_path); - if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { - if (mode == M_DELETE && - (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == - 1) { - /* FIXME: this is to keep 3.5 happy */ - REISERFS_I(inode)->i_first_direct_byte = U32_MAX; - quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; - } else { - quota_cut_bytes = 0; - } - } -#ifdef CONFIG_REISERFS_CHECK - if (is_inode_locked) { - struct item_head *le_ih = - tp_item_head(s_cut_balance.tb_path); - /* - * we are going to complete indirect2direct conversion. 
Make - * sure, that we exactly remove last unformatted node pointer - * of the item - */ - if (!is_indirect_le_ih(le_ih)) - reiserfs_panic(sb, "vs-5652", - "item must be indirect %h", le_ih); - - if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) - reiserfs_panic(sb, "vs-5653", "completing " - "indirect2direct conversion indirect " - "item %h being deleted must be of " - "4 byte long", le_ih); - - if (mode == M_CUT - && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { - reiserfs_panic(sb, "vs-5654", "can not complete " - "indirect2direct conversion of %h " - "(CUT, insert_size==%d)", - le_ih, s_cut_balance.insert_size[0]); - } - /* - * it would be useful to make sure, that right neighboring - * item is direct item of this file - */ - } -#endif - - do_balance(&s_cut_balance, NULL, NULL, mode); - if (is_inode_locked) { - /* - * we've done an indirect->direct conversion. when the - * data block was freed, it was removed from the list of - * blocks that must be flushed before the transaction - * commits, make sure to unmap and invalidate it - */ - unmap_buffers(page, tail_pos); - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - } -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, - "reiserquota cut_from_item(): freeing %u id=%u type=%c", - quota_cut_bytes, inode->i_uid, '?'); -#endif - depth = reiserfs_write_unlock_nested(sb); - dquot_free_space_nodirty(inode, quota_cut_bytes); - reiserfs_write_lock_nested(sb, depth); - return ret_value; -} - -static void truncate_directory(struct reiserfs_transaction_handle *th, - struct inode *inode) -{ - BUG_ON(!th->t_trans_id); - if (inode->i_nlink) - reiserfs_error(inode->i_sb, "vs-5655", "link count != 0"); - - set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); - set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); - reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); - reiserfs_update_sd(th, inode); - set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET); - set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); -} - -/* - * Truncate file to the new size. Note, this must be called with a - * transaction already started - */ -int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, - struct inode *inode, /* ->i_size contains new size */ - struct page *page, /* up to date for last block */ - /* - * when it is called by file_release to convert - * the tail - no timestamps should be updated - */ - int update_timestamps - ) -{ - INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ - struct item_head *p_le_ih; /* Pointer to an item header. */ - - /* Key to search for a previous file item. */ - struct cpu_key s_item_key; - loff_t file_size, /* Old file size. */ - new_file_size; /* New file size. */ - int deleted; /* Number of deleted or truncated bytes. */ - int retval; - int err = 0; - - BUG_ON(!th->t_trans_id); - if (! - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) - || S_ISLNK(inode->i_mode))) - return 0; - - /* deletion of directory - no need to update timestamps */ - if (S_ISDIR(inode->i_mode)) { - truncate_directory(th, inode); - return 0; - } - - /* Get new file size. 
*/ - new_file_size = inode->i_size; - - /* FIXME: note, that key type is unimportant here */ - make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), - TYPE_DIRECT, 3); - - retval = - search_for_position_by_key(inode->i_sb, &s_item_key, - &s_search_path); - if (retval == IO_ERROR) { - reiserfs_error(inode->i_sb, "vs-5657", - "i/o failure occurred trying to truncate %K", - &s_item_key); - err = -EIO; - goto out; - } - if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { - reiserfs_error(inode->i_sb, "PAP-5660", - "wrong result %d of search for %K", retval, - &s_item_key); - - err = -EIO; - goto out; - } - - s_search_path.pos_in_item--; - - /* Get real file size (total length of all file items) */ - p_le_ih = tp_item_head(&s_search_path); - if (is_statdata_le_ih(p_le_ih)) - file_size = 0; - else { - loff_t offset = le_ih_k_offset(p_le_ih); - int bytes = - op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); - - /* - * this may mismatch with real file size: if last direct item - * had no padding zeros and last unformatted node had no free - * space, this file would have this file size - */ - file_size = offset + bytes - 1; - } - /* - * are we doing a full truncate or delete, if so - * kick in the reada code - */ - if (new_file_size == 0) - s_search_path.reada = PATH_READA | PATH_READA_BACK; - - if (file_size == 0 || file_size < new_file_size) { - goto update_and_out; - } - - /* Update key to search for the last file item. */ - set_cpu_key_k_offset(&s_item_key, file_size); - - do { - /* Cut or delete file item. */ - deleted = - reiserfs_cut_from_item(th, &s_search_path, &s_item_key, - inode, page, new_file_size); - if (deleted < 0) { - reiserfs_warning(inode->i_sb, "vs-5665", - "reiserfs_cut_from_item failed"); - reiserfs_check_path(&s_search_path); - return 0; - } - - RFALSE(deleted > file_size, - "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", - deleted, file_size, &s_item_key); - - /* Change key to search the last file item. */ - file_size -= deleted; - - set_cpu_key_k_offset(&s_item_key, file_size); - - /* - * While there are bytes to truncate and previous - * file item is presented in the tree. - */ - - /* - * This loop could take a really long time, and could log - * many more blocks than a transaction can hold. 
So, we do - * a polite journal end here, and if the transaction needs - * ending, we make sure the file is consistent before ending - * the current trans and starting a new one - */ - if (journal_transaction_should_end(th, 0) || - reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { - pathrelse(&s_search_path); - - if (update_timestamps) { - inode_set_mtime_to_ts(inode, - current_time(inode)); - inode_set_ctime_current(inode); - } - reiserfs_update_sd(th, inode); - - err = journal_end(th); - if (err) - goto out; - err = journal_begin(th, inode->i_sb, - JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ; - if (err) - goto out; - reiserfs_update_inode_transaction(inode); - } - } while (file_size > ROUND_UP(new_file_size) && - search_for_position_by_key(inode->i_sb, &s_item_key, - &s_search_path) == POSITION_FOUND); - - RFALSE(file_size > ROUND_UP(new_file_size), - "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", - new_file_size, file_size, s_item_key.on_disk_key.k_objectid); - -update_and_out: - if (update_timestamps) { - /* this is truncate, not file closing */ - inode_set_mtime_to_ts(inode, current_time(inode)); - inode_set_ctime_current(inode); - } - reiserfs_update_sd(th, inode); - -out: - pathrelse(&s_search_path); - return err; -} - -#ifdef CONFIG_REISERFS_CHECK -/* this makes sure, that we __append__, not overwrite or add holes */ -static void check_research_for_paste(struct treepath *path, - const struct cpu_key *key) -{ - struct item_head *found_ih = tp_item_head(path); - - if (is_direct_le_ih(found_ih)) { - if (le_ih_k_offset(found_ih) + - op_bytes_number(found_ih, - get_last_bh(path)->b_size) != - cpu_key_k_offset(key) - || op_bytes_number(found_ih, - get_last_bh(path)->b_size) != - pos_in_item(path)) - reiserfs_panic(NULL, "PAP-5720", "found direct item " - "%h or position (%d) does not match " - "to key %K", found_ih, - pos_in_item(path), key); - } - if (is_indirect_le_ih(found_ih)) { - if (le_ih_k_offset(found_ih) + - op_bytes_number(found_ih, - get_last_bh(path)->b_size) != - cpu_key_k_offset(key) - || I_UNFM_NUM(found_ih) != pos_in_item(path) - || get_ih_free_space(found_ih) != 0) - reiserfs_panic(NULL, "PAP-5730", "found indirect " - "item (%h) or position (%d) does not " - "match to key (%K)", - found_ih, pos_in_item(path), key); - } -} -#endif /* config reiserfs check */ - -/* - * Paste bytes to the existing item. - * Returns bytes number pasted into the item. - */ -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, - /* Path to the pasted item. */ - struct treepath *search_path, - /* Key to search for the needed item. */ - const struct cpu_key *key, - /* Inode item belongs to */ - struct inode *inode, - /* Pointer to the bytes to paste. */ - const char *body, - /* Size of pasted bytes. 
*/ - int pasted_size) -{ - struct super_block *sb = inode->i_sb; - struct tree_balance s_paste_balance; - int retval; - int fs_gen; - int depth; - - BUG_ON(!th->t_trans_id); - - fs_gen = get_generation(inode->i_sb); - -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, - "reiserquota paste_into_item(): allocating %u id=%u type=%c", - pasted_size, inode->i_uid, - key2type(&key->on_disk_key)); -#endif - - depth = reiserfs_write_unlock_nested(sb); - retval = dquot_alloc_space_nodirty(inode, pasted_size); - reiserfs_write_lock_nested(sb, depth); - if (retval) { - pathrelse(search_path); - return retval; - } - init_tb_struct(th, &s_paste_balance, th->t_super, search_path, - pasted_size); -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - s_paste_balance.key = key->on_disk_key; -#endif - - /* DQUOT_* can schedule, must check before the fix_nodes */ - if (fs_changed(fs_gen, inode->i_sb)) { - goto search_again; - } - - while ((retval = - fix_nodes(M_PASTE, &s_paste_balance, NULL, - body)) == REPEAT_SEARCH) { -search_again: - /* file system changed while we were in the fix_nodes */ - PROC_INFO_INC(th->t_super, paste_into_item_restarted); - retval = - search_for_position_by_key(th->t_super, key, - search_path); - if (retval == IO_ERROR) { - retval = -EIO; - goto error_out; - } - if (retval == POSITION_FOUND) { - reiserfs_warning(inode->i_sb, "PAP-5710", - "entry or pasted byte (%K) exists", - key); - retval = -EEXIST; - goto error_out; - } -#ifdef CONFIG_REISERFS_CHECK - check_research_for_paste(search_path, key); -#endif - } - - /* - * Perform balancing after all resources are collected by fix_nodes, - * and accessing them will not risk triggering schedule. - */ - if (retval == CARRY_ON) { - do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); - return 0; - } - retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; -error_out: - /* this also releases the path */ - unfix_nodes(&s_paste_balance); -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, - "reiserquota paste_into_item(): freeing %u id=%u type=%c", - pasted_size, inode->i_uid, - key2type(&key->on_disk_key)); -#endif - depth = reiserfs_write_unlock_nested(sb); - dquot_free_space_nodirty(inode, pasted_size); - reiserfs_write_lock_nested(sb, depth); - return retval; -} - -/* - * Insert new item into the buffer at the path. - * th - active transaction handle - * path - path to the inserted item - * ih - pointer to the item header to insert - * body - pointer to the bytes to insert - */ -int reiserfs_insert_item(struct reiserfs_transaction_handle *th, - struct treepath *path, const struct cpu_key *key, - struct item_head *ih, struct inode *inode, - const char *body) -{ - struct tree_balance s_ins_balance; - int retval; - int fs_gen = 0; - int quota_bytes = 0; - - BUG_ON(!th->t_trans_id); - - if (inode) { /* Do we count quotas for item? */ - int depth; - fs_gen = get_generation(inode->i_sb); - quota_bytes = ih_item_len(ih); - - /* - * hack so the quota code doesn't have to guess - * if the file has a tail, links are always tails, - * so there's no guessing needed - */ - if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) - quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; -#ifdef REISERQUOTA_DEBUG - reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, - "reiserquota insert_item(): allocating %u id=%u type=%c", - quota_bytes, inode->i_uid, head2type(ih)); -#endif - /* - * We can't dirty inode here. It would be immediately - * written but appropriate stat item isn't inserted yet... 
- */ - depth = reiserfs_write_unlock_nested(inode->i_sb); - retval = dquot_alloc_space_nodirty(inode, quota_bytes); - reiserfs_write_lock_nested(inode->i_sb, depth); - if (retval) { - pathrelse(path); - return retval; - } - } - init_tb_struct(th, &s_ins_balance, th->t_super, path, - IH_SIZE + ih_item_len(ih)); -#ifdef DISPLACE_NEW_PACKING_LOCALITIES - s_ins_balance.key = key->on_disk_key; -#endif - /* - * DQUOT_* can schedule, must check to be sure calling - * fix_nodes is safe - */ - if (inode && fs_changed(fs_gen, inode->i_sb)) { - goto search_again; - } - - while ((retval = - fix_nodes(M_INSERT, &s_ins_balance, ih, - body)) == REPEAT_SEARCH) { -search_again: - /* file system changed while we were in the fix_nodes */ - PROC_INFO_INC(th->t_super, insert_item_restarted); - retval = search_item(th->t_super, key, path); - if (retval == IO_ERROR) { - retval = -EIO; - goto error_out; - } - if (retval == ITEM_FOUND) { - reiserfs_warning(th->t_super, "PAP-5760", - "key %K already exists in the tree", - key); - retval = -EEXIST; - goto error_out; - } - } - - /* Perform balancing after all resources have been collected by fix_nodes. */ - if (retval == CARRY_ON) { - do_balance(&s_ins_balance, ih, body, M_INSERT); - return 0; - } - - retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; -error_out: - /* also releases the path */ - unfix_nodes(&s_ins_balance); -#ifdef REISERQUOTA_DEBUG - if (inode) - reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, - "reiserquota insert_item(): freeing %u id=%u type=%c", - quota_bytes, inode->i_uid, head2type(ih)); -#endif - if (inode) { - int depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_free_space_nodirty(inode, quota_bytes); - reiserfs_write_lock_nested(inode->i_sb, depth); - } - return retval; -} diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c deleted file mode 100644 index ab76468da02d..000000000000 --- a/fs/reiserfs/super.c +++ /dev/null @@ -1,2646 +0,0 @@ -/* - * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README - * - * Trivial changes by Alan Cox to add the LFS fixes - * - * Trivial Changes: - * Rights granted to Hans Reiser to redistribute under other terms providing - * he accepts all liability including but not limited to patent, fitness - * for purpose, and direct or indirect claims arising from failure to perform.
- * - * NO WARRANTY - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/time.h> -#include <linux/uaccess.h> -#include "reiserfs.h" -#include "acl.h" -#include "xattr.h" -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/buffer_head.h> -#include <linux/exportfs.h> -#include <linux/quotaops.h> -#include <linux/vfs.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <linux/crc32.h> -#include <linux/seq_file.h> - -struct file_system_type reiserfs_fs_type; - -static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; -static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; -static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; - -int is_reiserfs_3_5(struct reiserfs_super_block *rs) -{ - return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, - strlen(reiserfs_3_5_magic_string)); -} - -int is_reiserfs_3_6(struct reiserfs_super_block *rs) -{ - return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, - strlen(reiserfs_3_6_magic_string)); -} - -int is_reiserfs_jr(struct reiserfs_super_block *rs) -{ - return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, - strlen(reiserfs_jr_magic_string)); -} - -static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) -{ - return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || - is_reiserfs_jr(rs)); -} - -static int reiserfs_remount(struct super_block *s, int *flags, char *data); -static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); - -static int reiserfs_sync_fs(struct super_block *s, int wait) -{ - struct reiserfs_transaction_handle th; - - /* - * Writeback quota in non-journalled quota case - journalled quota has - * no dirty dquots - */ - dquot_writeback_dquots(s, -1); - reiserfs_write_lock(s); - if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th)) - reiserfs_flush_old_commits(s); - reiserfs_write_unlock(s); - return 0; -} - -static void flush_old_commits(struct work_struct *work) -{ - struct reiserfs_sb_info *sbi; - struct super_block *s; - - sbi = container_of(work, struct reiserfs_sb_info, old_work.work); - s = sbi->s_journal->j_work_sb; - - /* - * We need s_umount for protecting quota writeback. We have to use - * trylock as reiserfs_cancel_old_flush() may be waiting for this work - * to complete with s_umount held. - */ - if (!down_read_trylock(&s->s_umount)) { - /* Requeue work if we are not cancelling it */ - spin_lock(&sbi->old_work_lock); - if (sbi->work_queued == 1) - queue_delayed_work(system_long_wq, &sbi->old_work, HZ); - spin_unlock(&sbi->old_work_lock); - return; - } - spin_lock(&sbi->old_work_lock); - /* Avoid clobbering the cancel state... */ - if (sbi->work_queued == 1) - sbi->work_queued = 0; - spin_unlock(&sbi->old_work_lock); - - reiserfs_sync_fs(s, 1); - up_read(&s->s_umount); -} - -void reiserfs_schedule_old_flush(struct super_block *s) -{ - struct reiserfs_sb_info *sbi = REISERFS_SB(s); - unsigned long delay; - - /* - * Avoid scheduling flush when sb is being shut down. It can race - * with journal shutdown and free still queued delayed work. 
- */ - if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE)) - return; - - spin_lock(&sbi->old_work_lock); - if (!sbi->work_queued) { - delay = msecs_to_jiffies(dirty_writeback_interval * 10); - queue_delayed_work(system_long_wq, &sbi->old_work, delay); - sbi->work_queued = 1; - } - spin_unlock(&sbi->old_work_lock); -} - -void reiserfs_cancel_old_flush(struct super_block *s) -{ - struct reiserfs_sb_info *sbi = REISERFS_SB(s); - - spin_lock(&sbi->old_work_lock); - /* Make sure no new flushes will be queued */ - sbi->work_queued = 2; - spin_unlock(&sbi->old_work_lock); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); -} - -static int reiserfs_freeze(struct super_block *s) -{ - struct reiserfs_transaction_handle th; - - reiserfs_cancel_old_flush(s); - - reiserfs_write_lock(s); - if (!sb_rdonly(s)) { - int err = journal_begin(&th, s, 1); - if (err) { - reiserfs_block_writes(&th); - } else { - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), - 1); - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - reiserfs_block_writes(&th); - journal_end_sync(&th); - } - } - reiserfs_write_unlock(s); - return 0; -} - -static int reiserfs_unfreeze(struct super_block *s) -{ - struct reiserfs_sb_info *sbi = REISERFS_SB(s); - - reiserfs_allow_writes(s); - spin_lock(&sbi->old_work_lock); - /* Allow old_work to run again */ - sbi->work_queued = 0; - spin_unlock(&sbi->old_work_lock); - return 0; -} - -extern const struct in_core_key MAX_IN_CORE_KEY; - -/* - * This is used to delete a "save link" when there are no items left of the - * file it points to. That can happen either when an unlink completed but the - * "save unlink" link was not removed, or when a file has both an unlink and a - * truncate pending and the unlink completes first (because the key of the - * "save link" protecting the unlink is bigger than the key of the "save link" - * protecting the truncate), so no items are left on which to complete the - * truncate. - */ -static int remove_save_link_only(struct super_block *s, - struct reiserfs_key *key, int oid_free) -{ - struct reiserfs_transaction_handle th; - int err; - - /* we are going to do one balancing */ - err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); - if (err) - return err; - - reiserfs_delete_solid_item(&th, NULL, key); - if (oid_free) - /* removals are protected by direct items */ - reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); - - return journal_end(&th); -} - -#ifdef CONFIG_QUOTA -static int reiserfs_quota_on_mount(struct super_block *, int); -#endif - -/* - * Look for uncompleted unlinks and truncates and complete them - * - * Called with superblock write locked. If quotas are enabled, we have to - * release/retake it lest we call dquot_quota_on_mount(), proceed to - * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per - * cpu worklets to complete flush_async_commits() that in turn wait for the - * superblock write lock.
- */ -static int finish_unfinished(struct super_block *s) -{ - INITIALIZE_PATH(path); - struct cpu_key max_cpu_key, obj_key; - struct reiserfs_key save_link_key, last_inode_key; - int retval = 0; - struct item_head *ih; - struct buffer_head *bh; - int item_pos; - char *item; - int done; - struct inode *inode; - int truncate; -#ifdef CONFIG_QUOTA - int i; - int ms_active_set; - int quota_enabled[REISERFS_MAXQUOTAS]; -#endif - - /* compose key to look for "save" links */ - max_cpu_key.version = KEY_FORMAT_3_5; - max_cpu_key.on_disk_key.k_dir_id = ~0U; - max_cpu_key.on_disk_key.k_objectid = ~0U; - set_cpu_key_k_offset(&max_cpu_key, ~0U); - max_cpu_key.key_length = 3; - - memset(&last_inode_key, 0, sizeof(last_inode_key)); - -#ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & SB_ACTIVE) { - ms_active_set = 0; - } else { - ms_active_set = 1; - s->s_flags |= SB_ACTIVE; - } - /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < REISERFS_MAXQUOTAS; i++) { - quota_enabled[i] = 1; - if (REISERFS_SB(s)->s_qf_names[i]) { - int ret; - - if (sb_has_quota_active(s, i)) { - quota_enabled[i] = 0; - continue; - } - reiserfs_write_unlock(s); - ret = reiserfs_quota_on_mount(s, i); - reiserfs_write_lock(s); - if (ret < 0) - reiserfs_warning(s, "reiserfs-2500", - "cannot turn on journaled " - "quota: error %d", ret); - } - } -#endif - - done = 0; - REISERFS_SB(s)->s_is_unlinked_ok = 1; - while (!retval) { - int depth; - retval = search_item(s, &max_cpu_key, &path); - if (retval != ITEM_NOT_FOUND) { - reiserfs_error(s, "vs-2140", - "search_by_key returned %d", retval); - break; - } - - bh = get_last_bh(&path); - item_pos = get_item_pos(&path); - if (item_pos != B_NR_ITEMS(bh)) { - reiserfs_warning(s, "vs-2060", - "wrong position found"); - break; - } - item_pos--; - ih = item_head(bh, item_pos); - - if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) - /* there are no "save" links anymore */ - break; - - save_link_key = ih->ih_key; - if (is_indirect_le_ih(ih)) - truncate = 1; - else - truncate = 0; - - /* reiserfs_iget needs k_dirid and k_objectid only */ - item = ih_item_body(bh, ih); - obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); - obj_key.on_disk_key.k_objectid = - le32_to_cpu(ih->ih_key.k_objectid); - obj_key.on_disk_key.k_offset = 0; - obj_key.on_disk_key.k_type = 0; - - pathrelse(&path); - - inode = reiserfs_iget(s, &obj_key); - if (IS_ERR_OR_NULL(inode)) { - /* - * the unlink almost completed, it just did not - * manage to remove "save" link and release objectid - */ - reiserfs_warning(s, "vs-2180", "iget failed for %K", - &obj_key); - retval = remove_save_link_only(s, &save_link_key, 1); - continue; - } - - if (!truncate && inode->i_nlink) { - /* file is not unlinked */ - reiserfs_warning(s, "vs-2185", - "file %K is not unlinked", - &obj_key); - retval = remove_save_link_only(s, &save_link_key, 0); - continue; - } - depth = reiserfs_write_unlock_nested(inode->i_sb); - dquot_initialize(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); - - if (truncate && S_ISDIR(inode->i_mode)) { - /* - * We got a truncate request for a dir which - * is impossible. The only imaginable way is to - * execute unfinished truncate request then boot - * into old kernel, remove the file and create dir - * with the same key. - */ - reiserfs_warning(s, "green-2101", - "impossible truncate on a " - "directory %k. 
Please report", - INODE_PKEY(inode)); - retval = remove_save_link_only(s, &save_link_key, 0); - truncate = 0; - iput(inode); - continue; - } - - if (truncate) { - REISERFS_I(inode)->i_flags |= - i_link_saved_truncate_mask; - /* - * not completed truncate found. New size was - * committed together with "save" link - */ - reiserfs_info(s, "Truncating %k to %lld ..", - INODE_PKEY(inode), inode->i_size); - - /* don't update modification time */ - reiserfs_truncate_file(inode, 0); - - retval = remove_save_link(inode, truncate); - } else { - REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; - /* not completed unlink (rmdir) found */ - reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); - if (memcmp(&last_inode_key, INODE_PKEY(inode), - sizeof(last_inode_key))){ - last_inode_key = *INODE_PKEY(inode); - /* removal gets completed in iput */ - retval = 0; - } else { - reiserfs_warning(s, "super-2189", "Dead loop " - "in finish_unfinished " - "detected, just remove " - "save link\n"); - retval = remove_save_link_only(s, - &save_link_key, 0); - } - } - - iput(inode); - printk("done\n"); - done++; - } - REISERFS_SB(s)->s_is_unlinked_ok = 0; - -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - reiserfs_write_unlock(s); - for (i = 0; i < REISERFS_MAXQUOTAS; i++) { - if (sb_dqopt(s)->files[i] && quota_enabled[i]) - dquot_quota_off(s, i); - } - reiserfs_write_lock(s); - if (ms_active_set) - /* Restore the flag back */ - s->s_flags &= ~SB_ACTIVE; -#endif - pathrelse(&path); - if (done) - reiserfs_info(s, "There were %d uncompleted unlinks/truncates. " - "Completed\n", done); - return retval; -} - -/* - * to protect file being unlinked from getting lost we "safe" link files - * being unlinked. This link will be deleted in the same transaction with last - * item of file. mounting the filesystem we scan all these links and remove - * files which almost got lost - */ -void add_save_link(struct reiserfs_transaction_handle *th, - struct inode *inode, int truncate) -{ - INITIALIZE_PATH(path); - int retval; - struct cpu_key key; - struct item_head ih; - __le32 link; - - BUG_ON(!th->t_trans_id); - - /* file can only get one "save link" of each kind */ - RFALSE(truncate && - (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), - "saved link already exists for truncated inode %lx", - (long)inode->i_ino); - RFALSE(!truncate && - (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), - "saved link already exists for unlinked inode %lx", - (long)inode->i_ino); - - /* setup key of "save" link */ - key.version = KEY_FORMAT_3_5; - key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; - key.on_disk_key.k_objectid = inode->i_ino; - if (!truncate) { - /* unlink, rmdir, rename */ - set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); - set_cpu_key_k_type(&key, TYPE_DIRECT); - - /* item head of "safe" link */ - make_le_item_head(&ih, &key, key.version, - 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, - 4 /*length */ , 0xffff /*free space */ ); - } else { - /* truncate */ - if (S_ISDIR(inode->i_mode)) - reiserfs_warning(inode->i_sb, "green-2102", - "Adding a truncate savelink for " - "a directory %k! 
Please report", - INODE_PKEY(inode)); - set_cpu_key_k_offset(&key, 1); - set_cpu_key_k_type(&key, TYPE_INDIRECT); - - /* item head of "safe" link */ - make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, - 4 /*length */ , 0 /*free space */ ); - } - key.key_length = 3; - - /* look for its place in the tree */ - retval = search_item(inode->i_sb, &key, &path); - if (retval != ITEM_NOT_FOUND) { - if (retval != -ENOSPC) - reiserfs_error(inode->i_sb, "vs-2100", - "search_by_key (%K) returned %d", &key, - retval); - pathrelse(&path); - return; - } - - /* body of "save" link */ - link = INODE_PKEY(inode)->k_dir_id; - - /* put "save" link into tree, don't charge quota to anyone */ - retval = - reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); - if (retval) { - if (retval != -ENOSPC) - reiserfs_error(inode->i_sb, "vs-2120", - "insert_item returned %d", retval); - } else { - if (truncate) - REISERFS_I(inode)->i_flags |= - i_link_saved_truncate_mask; - else - REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; - } -} - -/* this opens transaction unlike add_save_link */ -int remove_save_link(struct inode *inode, int truncate) -{ - struct reiserfs_transaction_handle th; - struct reiserfs_key key; - int err; - - /* we are going to do one balancing only */ - err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); - if (err) - return err; - - /* setup key of "save" link */ - key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); - key.k_objectid = INODE_PKEY(inode)->k_objectid; - if (!truncate) { - /* unlink, rmdir, rename */ - set_le_key_k_offset(KEY_FORMAT_3_5, &key, - 1 + inode->i_sb->s_blocksize); - set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); - } else { - /* truncate */ - set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); - set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); - } - - if ((truncate && - (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || - (!truncate && - (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) - /* don't take quota bytes from anywhere */ - reiserfs_delete_solid_item(&th, NULL, &key); - if (!truncate) { - reiserfs_release_objectid(&th, inode->i_ino); - REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; - } else - REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; - - return journal_end(&th); -} - -static void reiserfs_kill_sb(struct super_block *s) -{ - if (REISERFS_SB(s)) { - reiserfs_proc_info_done(s); - /* - * Force any pending inode evictions to occur now. Any - * inodes to be removed that have extended attributes - * associated with them need to clean them up before - * we can release the extended attribute root dentries. - * shrink_dcache_for_umount will BUG if we don't release - * those before it's called so ->put_super is too late. 
- */ - shrink_dcache_sb(s); - - dput(REISERFS_SB(s)->xattr_root); - REISERFS_SB(s)->xattr_root = NULL; - dput(REISERFS_SB(s)->priv_root); - REISERFS_SB(s)->priv_root = NULL; - } - - kill_block_super(s); -} - -#ifdef CONFIG_QUOTA -static int reiserfs_quota_off(struct super_block *sb, int type); - -static void reiserfs_quota_off_umount(struct super_block *s) -{ - int type; - - for (type = 0; type < REISERFS_MAXQUOTAS; type++) - reiserfs_quota_off(s, type); -} -#else -static inline void reiserfs_quota_off_umount(struct super_block *s) -{ -} -#endif - -static void reiserfs_put_super(struct super_block *s) -{ - struct reiserfs_transaction_handle th; - th.t_trans_id = 0; - - reiserfs_quota_off_umount(s); - - reiserfs_write_lock(s); - - /* - * change file system state to current state if it was mounted - * with read-write permissions - */ - if (!sb_rdonly(s)) { - if (!journal_begin(&th, s, 10)) { - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), - 1); - set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), - REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - } - } - - /* - * note, journal_release checks for readonly mount, and can - * decide not to do a journal_end - */ - journal_release(&th, s); - - reiserfs_free_bitmap_cache(s); - - brelse(SB_BUFFER_WITH_SB(s)); - - print_statistics(s); - - if (REISERFS_SB(s)->reserved_blocks != 0) { - reiserfs_warning(s, "green-2005", "reserved blocks left %d", - REISERFS_SB(s)->reserved_blocks); - } - - reiserfs_write_unlock(s); - mutex_destroy(&REISERFS_SB(s)->lock); - destroy_workqueue(REISERFS_SB(s)->commit_wq); - kfree(REISERFS_SB(s)->s_jdev); - kfree(s->s_fs_info); - s->s_fs_info = NULL; -} - -static struct kmem_cache *reiserfs_inode_cachep; - -static struct inode *reiserfs_alloc_inode(struct super_block *sb) -{ - struct reiserfs_inode_info *ei; - ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - atomic_set(&ei->openers, 0); - mutex_init(&ei->tailpack); -#ifdef CONFIG_QUOTA - memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); -#endif - - return &ei->vfs_inode; -} - -static void reiserfs_free_inode(struct inode *inode) -{ - kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); -} - -static void init_once(void *foo) -{ - struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; - - INIT_LIST_HEAD(&ei->i_prealloc_list); - inode_init_once(&ei->vfs_inode); -} - -static int __init init_inodecache(void) -{ - reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", - sizeof(struct - reiserfs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_ACCOUNT), - init_once); - if (reiserfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. 
- */ - rcu_barrier(); - kmem_cache_destroy(reiserfs_inode_cachep); -} - -/* we don't mark inodes dirty, we just log them */ -static void reiserfs_dirty_inode(struct inode *inode, int flags) -{ - struct reiserfs_transaction_handle th; - - int err = 0; - - if (sb_rdonly(inode->i_sb)) { - reiserfs_warning(inode->i_sb, "clm-6006", - "writing inode %lu on readonly FS", - inode->i_ino); - return; - } - reiserfs_write_lock(inode->i_sb); - - /* - * this is really only used for atime updates, so they don't have - * to be included in O_SYNC or fsync - */ - err = journal_begin(&th, inode->i_sb, 1); - if (err) - goto out; - - reiserfs_update_sd(&th, inode); - journal_end(&th); - -out: - reiserfs_write_unlock(inode->i_sb); -} - -static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) -{ - struct super_block *s = root->d_sb; - struct reiserfs_journal *journal = SB_JOURNAL(s); - long opts = REISERFS_SB(s)->s_mount_opt; - - if (opts & (1 << REISERFS_LARGETAIL)) - seq_puts(seq, ",tails=on"); - else if (!(opts & (1 << REISERFS_SMALLTAIL))) - seq_puts(seq, ",notail"); - /* tails=small is default so we don't show it */ - - if (!(opts & (1 << REISERFS_BARRIER_FLUSH))) - seq_puts(seq, ",barrier=none"); - /* barrier=flush is default so we don't show it */ - - if (opts & (1 << REISERFS_ERROR_CONTINUE)) - seq_puts(seq, ",errors=continue"); - else if (opts & (1 << REISERFS_ERROR_PANIC)) - seq_puts(seq, ",errors=panic"); - /* errors=ro is default so we don't show it */ - - if (opts & (1 << REISERFS_DATA_LOG)) - seq_puts(seq, ",data=journal"); - else if (opts & (1 << REISERFS_DATA_WRITEBACK)) - seq_puts(seq, ",data=writeback"); - /* data=ordered is default so we don't show it */ - - if (opts & (1 << REISERFS_ATTRS)) - seq_puts(seq, ",attrs"); - - if (opts & (1 << REISERFS_XATTRS_USER)) - seq_puts(seq, ",user_xattr"); - - if (opts & (1 << REISERFS_EXPOSE_PRIVROOT)) - seq_puts(seq, ",expose_privroot"); - - if (opts & (1 << REISERFS_POSIXACL)) - seq_puts(seq, ",acl"); - - if (REISERFS_SB(s)->s_jdev) - seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); - - if (journal->j_max_commit_age != journal->j_default_max_commit_age) - seq_printf(seq, ",commit=%d", journal->j_max_commit_age); - -#ifdef CONFIG_QUOTA - if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", - REISERFS_SB(s)->s_qf_names[USRQUOTA]); - else if (opts & (1 << REISERFS_USRQUOTA)) - seq_puts(seq, ",usrquota"); - if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", - REISERFS_SB(s)->s_qf_names[GRPQUOTA]); - else if (opts & (1 << REISERFS_GRPQUOTA)) - seq_puts(seq, ",grpquota"); - if (REISERFS_SB(s)->s_jquota_fmt) { - if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD) - seq_puts(seq, ",jqfmt=vfsold"); - else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0) - seq_puts(seq, ",jqfmt=vfsv0"); - } -#endif - - /* Block allocator options */ - if (opts & (1 << REISERFS_NO_BORDER)) - seq_puts(seq, ",block-allocator=noborder"); - if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION)) - seq_puts(seq, ",block-allocator=no_unhashed_relocation"); - if (opts & (1 << REISERFS_HASHED_RELOCATION)) - seq_puts(seq, ",block-allocator=hashed_relocation"); - if (opts & (1 << REISERFS_TEST4)) - seq_puts(seq, ",block-allocator=test4"); - show_alloc_options(seq, s); - return 0; -} - -#ifdef CONFIG_QUOTA -static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, - size_t, loff_t); -static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, - loff_t); - -static struct dquot __rcu 
**reiserfs_get_dquots(struct inode *inode) -{ - return REISERFS_I(inode)->i_dquot; -} -#endif - -static const struct super_operations reiserfs_sops = { - .alloc_inode = reiserfs_alloc_inode, - .free_inode = reiserfs_free_inode, - .write_inode = reiserfs_write_inode, - .dirty_inode = reiserfs_dirty_inode, - .evict_inode = reiserfs_evict_inode, - .put_super = reiserfs_put_super, - .sync_fs = reiserfs_sync_fs, - .freeze_fs = reiserfs_freeze, - .unfreeze_fs = reiserfs_unfreeze, - .statfs = reiserfs_statfs, - .remount_fs = reiserfs_remount, - .show_options = reiserfs_show_options, -#ifdef CONFIG_QUOTA - .quota_read = reiserfs_quota_read, - .quota_write = reiserfs_quota_write, - .get_dquots = reiserfs_get_dquots, -#endif -}; - -#ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") - -static int reiserfs_write_dquot(struct dquot *); -static int reiserfs_acquire_dquot(struct dquot *); -static int reiserfs_release_dquot(struct dquot *); -static int reiserfs_mark_dquot_dirty(struct dquot *); -static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, const struct path *); - -static const struct dquot_operations reiserfs_quota_operations = { - .write_dquot = reiserfs_write_dquot, - .acquire_dquot = reiserfs_acquire_dquot, - .release_dquot = reiserfs_release_dquot, - .mark_dirty = reiserfs_mark_dquot_dirty, - .write_info = reiserfs_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, - .get_next_id = dquot_get_next_id, -}; - -static const struct quotactl_ops reiserfs_qctl_operations = { - .quota_on = reiserfs_quota_on, - .quota_off = reiserfs_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk, -}; -#endif - -static const struct export_operations reiserfs_export_ops = { - .encode_fh = reiserfs_encode_fh, - .fh_to_dentry = reiserfs_fh_to_dentry, - .fh_to_parent = reiserfs_fh_to_parent, - .get_parent = reiserfs_get_parent, -}; - -/* - * this struct is used in reiserfs_getopt () for containing the value for - * those mount options that have values rather than being toggles. - */ -typedef struct { - char *value; - /* - * bitmask which is to set on mount_options bitmask - * when this value is found, 0 is no bits are to be changed. - */ - int setmask; - /* - * bitmask which is to clear on mount_options bitmask - * when this value is found, 0 is no bits are to be changed. - * This is applied BEFORE setmask - */ - int clrmask; -} arg_desc_t; - -/* Set this bit in arg_required to allow empty arguments */ -#define REISERFS_OPT_ALLOWEMPTY 31 - -/* - * this struct is used in reiserfs_getopt() for describing the - * set of reiserfs mount options - */ -typedef struct { - char *option_name; - - /* 0 if argument is not required, not 0 otherwise */ - int arg_required; - - /* list of values accepted by an option */ - const arg_desc_t *values; - - /* - * bitmask which is to set on mount_options bitmask - * when this value is found, 0 is no bits are to be changed. - */ - int setmask; - - /* - * bitmask which is to clear on mount_options bitmask - * when this value is found, 0 is no bits are to be changed. 
- * This is applied BEFORE setmask - */ - int clrmask; -} opt_desc_t; - -/* possible values for -o data= */ -static const arg_desc_t logging_mode[] = { - {"ordered", 1 << REISERFS_DATA_ORDERED, - (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, - {"journal", 1 << REISERFS_DATA_LOG, - (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, - {"writeback", 1 << REISERFS_DATA_WRITEBACK, - (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, - {.value = NULL} -}; - -/* possible values for -o barrier= */ -static const arg_desc_t barrier_mode[] = { - {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, - {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, - {.value = NULL} -}; - -/* - * possible values for "-o block-allocator=" and bits which are to be set in - * s_mount_opt of reiserfs specific part of in-core super block - */ -static const arg_desc_t balloc[] = { - {"noborder", 1 << REISERFS_NO_BORDER, 0}, - {"border", 0, 1 << REISERFS_NO_BORDER}, - {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, - {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0}, - {"test4", 1 << REISERFS_TEST4, 0}, - {"notest4", 0, 1 << REISERFS_TEST4}, - {NULL, 0, 0} -}; - -static const arg_desc_t tails[] = { - {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, - {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, - {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, - {NULL, 0, 0} -}; - -static const arg_desc_t error_actions[] = { - {"panic", 1 << REISERFS_ERROR_PANIC, - (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, - {"ro-remount", 1 << REISERFS_ERROR_RO, - (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, -#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG - {"continue", 1 << REISERFS_ERROR_CONTINUE, - (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, -#endif - {NULL, 0, 0}, -}; - -/* - * proceed only one option from a list *cur - string containing of mount - * options - * opts - array of options which are accepted - * opt_arg - if option is found and requires an argument and if it is specifed - * in the input - pointer to the argument is stored here - * bit_flags - if option requires to set a certain bit - it is set here - * return -1 if unknown option is found, opt->arg_required otherwise - */ -static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, - char **opt_arg, unsigned long *bit_flags) -{ - char *p; - /* - * foo=bar, - * ^ ^ ^ - * | | +-- option_end - * | +-- arg_start - * +-- option_start - */ - const opt_desc_t *opt; - const arg_desc_t *arg; - - p = *cur; - - /* assume argument cannot contain commas */ - *cur = strchr(p, ','); - if (*cur) { - *(*cur) = '\0'; - (*cur)++; - } - - if (!strncmp(p, "alloc=", 6)) { - /* - * Ugly special case, probably we should redo options - * parser so that it can understand several arguments for - * some options, also so that it can fill several bitfields - * with option values. 
- */ - if (reiserfs_parse_alloc_options(s, p + 6)) { - return -1; - } else { - return 0; - } - } - - /* for every option in the list */ - for (opt = opts; opt->option_name; opt++) { - if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { - if (bit_flags) { - if (opt->clrmask == - (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning(s, "super-6500", - "%s not supported.\n", - p); - else - *bit_flags &= ~opt->clrmask; - if (opt->setmask == - (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning(s, "super-6501", - "%s not supported.\n", - p); - else - *bit_flags |= opt->setmask; - } - break; - } - } - if (!opt->option_name) { - reiserfs_warning(s, "super-6502", - "unknown mount option \"%s\"", p); - return -1; - } - - p += strlen(opt->option_name); - switch (*p) { - case '=': - if (!opt->arg_required) { - reiserfs_warning(s, "super-6503", - "the option \"%s\" does not " - "require an argument\n", - opt->option_name); - return -1; - } - break; - - case 0: - if (opt->arg_required) { - reiserfs_warning(s, "super-6504", - "the option \"%s\" requires an " - "argument\n", opt->option_name); - return -1; - } - break; - default: - reiserfs_warning(s, "super-6505", - "head of option \"%s\" is only correct\n", - opt->option_name); - return -1; - } - - /* - * move to the argument, or to next option if argument is not - * required - */ - p++; - - if (opt->arg_required - && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) - && !strlen(p)) { - /* this catches "option=," if not allowed */ - reiserfs_warning(s, "super-6506", - "empty argument for \"%s\"\n", - opt->option_name); - return -1; - } - - if (!opt->values) { - /* *=NULLopt_arg contains pointer to argument */ - *opt_arg = p; - return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); - } - - /* values possible for this option are listed in opt->values */ - for (arg = opt->values; arg->value; arg++) { - if (!strcmp(p, arg->value)) { - if (bit_flags) { - *bit_flags &= ~arg->clrmask; - *bit_flags |= arg->setmask; - } - return opt->arg_required; - } - } - - reiserfs_warning(s, "super-6506", - "bad value \"%s\" for option \"%s\"\n", p, - opt->option_name); - return -1; -} - -/* returns 0 if something is wrong in option string, 1 - otherwise */ -static int reiserfs_parse_options(struct super_block *s, - - /* string given via mount's -o */ - char *options, - - /* - * after the parsing phase, contains the - * collection of bitflags defining what - * mount options were selected. 
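A simplified, self-contained sketch of the cursor-advancing tokenizer that reiserfs_getopt() implements with strchr(): each call isolates the next name[=value] token by overwriting the first comma with a NUL and moving the cursor past it, on the stated assumption that arguments never contain commas:

	#include <string.h>

	static char *sketch_next_token(char **cur)
	{
		char *tok = *cur;

		if (!tok)
			return NULL;
		*cur = strchr(tok, ',');
		if (*cur) {
			**cur = '\0';	/* terminate the current token        */
			(*cur)++;	/* next call starts after the comma   */
		}
		return tok;
	}
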
- */ - unsigned long *mount_options, - - /* strtol-ed from NNN of resize=NNN */ - unsigned long *blocks, - char **jdev_name, - unsigned int *commit_max_age, - char **qf_names, - unsigned int *qfmt) -{ - int c; - char *arg = NULL; - char *pos; - opt_desc_t opts[] = { - /* - * Compatibility stuff, so that -o notail for old - * setups still work - */ - {"tails",.arg_required = 't',.values = tails}, - {"notail",.clrmask = - (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, - {"conv",.setmask = 1 << REISERFS_CONVERT}, - {"attrs",.setmask = 1 << REISERFS_ATTRS}, - {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, - {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, -#ifdef CONFIG_REISERFS_FS_XATTR - {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, - {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, -#else - {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, - {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, -#endif -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - {"acl",.setmask = 1 << REISERFS_POSIXACL}, - {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, -#else - {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, - {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, -#endif - {.option_name = "nolog"}, - {"replayonly",.setmask = 1 << REPLAYONLY}, - {"block-allocator",.arg_required = 'a',.values = balloc}, - {"data",.arg_required = 'd',.values = logging_mode}, - {"barrier",.arg_required = 'b',.values = barrier_mode}, - {"resize",.arg_required = 'r',.values = NULL}, - {"jdev",.arg_required = 'j',.values = NULL}, - {"nolargeio",.arg_required = 'w',.values = NULL}, - {"commit",.arg_required = 'c',.values = NULL}, - {"usrquota",.setmask = 1 << REISERFS_USRQUOTA}, - {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA}, - {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA}, - {"errors",.arg_required = 'e',.values = error_actions}, - {"usrjquota",.arg_required = - 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, - {"grpjquota",.arg_required = - 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, - {"jqfmt",.arg_required = 'f',.values = NULL}, - {.option_name = NULL} - }; - - *blocks = 0; - if (!options || !*options) - /* - * use default configuration: create tails, journaling on, no - * conversion to newest format - */ - return 1; - - for (pos = options; pos;) { - c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); - if (c == -1) - /* wrong option is given */ - return 0; - - if (c == 'r') { - char *p; - - p = NULL; - /* "resize=NNN" or "resize=auto" */ - - if (!strcmp(arg, "auto")) { - /* From JFS code, to auto-get the size. */ - *blocks = sb_bdev_nr_blocks(s); - } else { - *blocks = simple_strtoul(arg, &p, 0); - if (*p != '\0') { - /* NNN does not look like a number */ - reiserfs_warning(s, "super-6507", - "bad value %s for " - "-oresize\n", arg); - return 0; - } - } - } - - if (c == 'c') { - char *p = NULL; - unsigned long val = simple_strtoul(arg, &p, 0); - /* commit=NNN (time in seconds) */ - if (*p != '\0' || val >= (unsigned int)-1) { - reiserfs_warning(s, "super-6508", - "bad value %s for -ocommit\n", - arg); - return 0; - } - *commit_max_age = (unsigned int)val; - } - - if (c == 'w') { - reiserfs_warning(s, "super-6509", "nolargeio option " - "is no longer supported"); - return 0; - } - - if (c == 'j') { - if (arg && *arg && jdev_name) { - /* Hm, already assigned? 
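A small sketch of how arg_required in the table above doubles as a return selector and a flag word: the low bits carry the character handed back to reiserfs_parse_options() ('r', 'c', 'u', ...), while bit 31 (REISERFS_OPT_ALLOWEMPTY) marks options such as usrjquota= that may legally take an empty argument. The decode below mirrors the checks made in reiserfs_getopt():

	#define SKETCH_ALLOWEMPTY_BIT	31	/* same bit as REISERFS_OPT_ALLOWEMPTY */

	static int sketch_decode_arg(int arg_required, const char *arg)
	{
		if (!(arg_required & (1 << SKETCH_ALLOWEMPTY_BIT)) && arg[0] == '\0')
			return -1;	/* "option=," rejected for this option      */
		return arg_required & ~(1 << SKETCH_ALLOWEMPTY_BIT);	/* e.g. 'u' */
	}
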
*/ - if (*jdev_name) { - reiserfs_warning(s, "super-6510", - "journal device was " - "already specified to " - "be %s", *jdev_name); - return 0; - } - *jdev_name = arg; - } - } -#ifdef CONFIG_QUOTA - if (c == 'u' || c == 'g') { - int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - - if (sb_any_quota_loaded(s) && - (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { - reiserfs_warning(s, "super-6511", - "cannot change journaled " - "quota options when quota " - "turned on."); - return 0; - } - if (qf_names[qtype] != - REISERFS_SB(s)->s_qf_names[qtype]) - kfree(qf_names[qtype]); - qf_names[qtype] = NULL; - if (*arg) { /* Some filename specified? */ - if (REISERFS_SB(s)->s_qf_names[qtype] - && strcmp(REISERFS_SB(s)->s_qf_names[qtype], - arg)) { - reiserfs_warning(s, "super-6512", - "%s quota file " - "already specified.", - QTYPE2NAME(qtype)); - return 0; - } - if (strchr(arg, '/')) { - reiserfs_warning(s, "super-6513", - "quotafile must be " - "on filesystem root."); - return 0; - } - qf_names[qtype] = kstrdup(arg, GFP_KERNEL); - if (!qf_names[qtype]) { - reiserfs_warning(s, "reiserfs-2502", - "not enough memory " - "for storing " - "quotafile name."); - return 0; - } - if (qtype == USRQUOTA) - *mount_options |= 1 << REISERFS_USRQUOTA; - else - *mount_options |= 1 << REISERFS_GRPQUOTA; - } else { - if (qtype == USRQUOTA) - *mount_options &= ~(1 << REISERFS_USRQUOTA); - else - *mount_options &= ~(1 << REISERFS_GRPQUOTA); - } - } - if (c == 'f') { - if (!strcmp(arg, "vfsold")) - *qfmt = QFMT_VFS_OLD; - else if (!strcmp(arg, "vfsv0")) - *qfmt = QFMT_VFS_V0; - else { - reiserfs_warning(s, "super-6514", - "unknown quota format " - "specified."); - return 0; - } - if (sb_any_quota_loaded(s) && - *qfmt != REISERFS_SB(s)->s_jquota_fmt) { - reiserfs_warning(s, "super-6515", - "cannot change journaled " - "quota options when quota " - "turned on."); - return 0; - } - } -#else - if (c == 'u' || c == 'g' || c == 'f') { - reiserfs_warning(s, "reiserfs-2503", "journaled " - "quota options not supported."); - return 0; - } -#endif - } - -#ifdef CONFIG_QUOTA - if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt - && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { - reiserfs_warning(s, "super-6515", - "journaled quota format not specified."); - return 0; - } - if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) && - sb_has_quota_loaded(s, USRQUOTA)) || - (!(*mount_options & (1 << REISERFS_GRPQUOTA)) && - sb_has_quota_loaded(s, GRPQUOTA))) { - reiserfs_warning(s, "super-6516", "quota options must " - "be present when quota is turned on."); - return 0; - } -#endif - - return 1; -} - -static void switch_data_mode(struct super_block *s, unsigned long mode) -{ - REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | - (1 << REISERFS_DATA_ORDERED) | - (1 << REISERFS_DATA_WRITEBACK)); - REISERFS_SB(s)->s_mount_opt |= (1 << mode); -} - -static void handle_data_mode(struct super_block *s, unsigned long mount_options) -{ - if (mount_options & (1 << REISERFS_DATA_LOG)) { - if (!reiserfs_data_log(s)) { - switch_data_mode(s, REISERFS_DATA_LOG); - reiserfs_info(s, "switching to journaled data mode\n"); - } - } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { - if (!reiserfs_data_ordered(s)) { - switch_data_mode(s, REISERFS_DATA_ORDERED); - reiserfs_info(s, "switching to ordered data mode\n"); - } - } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { - if (!reiserfs_data_writeback(s)) { - switch_data_mode(s, REISERFS_DATA_WRITEBACK); - reiserfs_info(s, "switching to writeback data mode\n"); - } - } -} - -static void 
handle_barrier_mode(struct super_block *s, unsigned long bits) -{ - int flush = (1 << REISERFS_BARRIER_FLUSH); - int none = (1 << REISERFS_BARRIER_NONE); - int all_barrier = flush | none; - - if (bits & all_barrier) { - REISERFS_SB(s)->s_mount_opt &= ~all_barrier; - if (bits & flush) { - REISERFS_SB(s)->s_mount_opt |= flush; - printk("reiserfs: enabling write barrier flush mode\n"); - } else if (bits & none) { - REISERFS_SB(s)->s_mount_opt |= none; - printk("reiserfs: write barriers turned off\n"); - } - } -} - -static void handle_attrs(struct super_block *s) -{ - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); - - if (reiserfs_attrs(s)) { - if (old_format_only(s)) { - reiserfs_warning(s, "super-6517", "cannot support " - "attributes on 3.5.x disk format"); - REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); - return; - } - if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { - reiserfs_warning(s, "super-6518", "cannot support " - "attributes until flag is set in " - "super-block"); - REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); - } - } -} - -#ifdef CONFIG_QUOTA -static void handle_quota_files(struct super_block *s, char **qf_names, - unsigned int *qfmt) -{ - int i; - - for (i = 0; i < REISERFS_MAXQUOTAS; i++) { - if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) - kfree(REISERFS_SB(s)->s_qf_names[i]); - REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; - } - if (*qfmt) - REISERFS_SB(s)->s_jquota_fmt = *qfmt; -} -#endif - -static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) -{ - struct reiserfs_super_block *rs; - struct reiserfs_transaction_handle th; - unsigned long blocks; - unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; - unsigned long safe_mask = 0; - unsigned int commit_max_age = (unsigned int)-1; - struct reiserfs_journal *journal = SB_JOURNAL(s); - int err; - char *qf_names[REISERFS_MAXQUOTAS]; - unsigned int qfmt = 0; -#ifdef CONFIG_QUOTA - int i; -#endif - - sync_filesystem(s); - reiserfs_write_lock(s); - -#ifdef CONFIG_QUOTA - memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); -#endif - - rs = SB_DISK_SUPER_BLOCK(s); - - if (!reiserfs_parse_options - (s, arg, &mount_options, &blocks, NULL, &commit_max_age, - qf_names, &qfmt)) { -#ifdef CONFIG_QUOTA - for (i = 0; i < REISERFS_MAXQUOTAS; i++) - if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) - kfree(qf_names[i]); -#endif - err = -EINVAL; - goto out_err_unlock; - } -#ifdef CONFIG_QUOTA - handle_quota_files(s, qf_names, &qfmt); -#endif - - handle_attrs(s); - - /* Add options that are safe here */ - safe_mask |= 1 << REISERFS_SMALLTAIL; - safe_mask |= 1 << REISERFS_LARGETAIL; - safe_mask |= 1 << REISERFS_NO_BORDER; - safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; - safe_mask |= 1 << REISERFS_HASHED_RELOCATION; - safe_mask |= 1 << REISERFS_TEST4; - safe_mask |= 1 << REISERFS_ATTRS; - safe_mask |= 1 << REISERFS_XATTRS_USER; - safe_mask |= 1 << REISERFS_POSIXACL; - safe_mask |= 1 << REISERFS_BARRIER_FLUSH; - safe_mask |= 1 << REISERFS_BARRIER_NONE; - safe_mask |= 1 << REISERFS_ERROR_RO; - safe_mask |= 1 << REISERFS_ERROR_CONTINUE; - safe_mask |= 1 << REISERFS_ERROR_PANIC; - safe_mask |= 1 << REISERFS_USRQUOTA; - safe_mask |= 1 << REISERFS_GRPQUOTA; - - /* - * Update the bitmask, taking care to keep - * the bits we're not allowed to change here - */ - REISERFS_SB(s)->s_mount_opt = - (REISERFS_SB(s)-> - s_mount_opt & ~safe_mask) | (mount_options & safe_mask); - - if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { - journal->j_max_commit_age 
= commit_max_age; - journal->j_max_trans_age = commit_max_age; - } else if (commit_max_age == 0) { - /* 0 means restore defaults. */ - journal->j_max_commit_age = journal->j_default_max_commit_age; - journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; - } - - if (blocks) { - err = reiserfs_resize(s, blocks); - if (err != 0) - goto out_err_unlock; - } - - if (*mount_flags & SB_RDONLY) { - reiserfs_write_unlock(s); - reiserfs_xattr_init(s, *mount_flags); - /* remount read-only */ - if (sb_rdonly(s)) - /* it is read-only already */ - goto out_ok_unlocked; - - err = dquot_suspend(s, -1); - if (err < 0) - goto out_err; - - /* try to remount file system with read-only permissions */ - if (sb_umount_state(rs) == REISERFS_VALID_FS - || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { - goto out_ok_unlocked; - } - - reiserfs_write_lock(s); - - err = journal_begin(&th, s, 10); - if (err) - goto out_err_unlock; - - /* Mounting a rw partition read-only. */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - } else { - /* remount read-write */ - if (!sb_rdonly(s)) { - reiserfs_write_unlock(s); - reiserfs_xattr_init(s, *mount_flags); - goto out_ok_unlocked; /* We are read-write already */ - } - - if (reiserfs_is_journal_aborted(journal)) { - err = journal->j_errno; - goto out_err_unlock; - } - - handle_data_mode(s, mount_options); - handle_barrier_mode(s, mount_options); - REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - - /* now it is safe to call journal_begin */ - s->s_flags &= ~SB_RDONLY; - err = journal_begin(&th, s, 10); - if (err) - goto out_err_unlock; - - /* Mount a partition which is read-only, read-write */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~SB_RDONLY; - set_sb_umount_state(rs, REISERFS_ERROR_FS); - if (!old_format_only(s)) - set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); - /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; - } - /* this will force a full flush of all journal lists */ - SB_JOURNAL(s)->j_must_wait = 1; - err = journal_end(&th); - if (err) - goto out_err_unlock; - - reiserfs_write_unlock(s); - if (!(*mount_flags & SB_RDONLY)) { - dquot_resume(s, -1); - reiserfs_write_lock(s); - finish_unfinished(s); - reiserfs_write_unlock(s); - reiserfs_xattr_init(s, *mount_flags); - } - -out_ok_unlocked: - return 0; - -out_err_unlock: - reiserfs_write_unlock(s); -out_err: - return err; -} - -static int read_super_block(struct super_block *s, int offset) -{ - struct buffer_head *bh; - struct reiserfs_super_block *rs; - int fs_blocksize; - - bh = sb_bread(s, offset / s->s_blocksize); - if (!bh) { - reiserfs_warning(s, "sh-2006", - "bread failed (dev %s, block %lu, size %lu)", - s->s_id, offset / s->s_blocksize, - s->s_blocksize); - return 1; - } - - rs = (struct reiserfs_super_block *)bh->b_data; - if (!is_any_reiserfs_magic_string(rs)) { - brelse(bh); - return 1; - } - /* - * ok, reiserfs signature (old or new) found in at the given offset - */ - fs_blocksize = sb_blocksize(rs); - brelse(bh); - sb_set_blocksize(s, fs_blocksize); - - bh = sb_bread(s, offset / s->s_blocksize); - if (!bh) { - reiserfs_warning(s, "sh-2007", - "bread failed (dev %s, block %lu, size %lu)", - s->s_id, offset / s->s_blocksize, - s->s_blocksize); - return 1; - } - - rs = (struct reiserfs_super_block 
*)bh->b_data; - if (sb_blocksize(rs) != s->s_blocksize) { - reiserfs_warning(s, "sh-2011", "can't find a reiserfs " - "filesystem on (dev %s, block %llu, size %lu)", - s->s_id, - (unsigned long long)bh->b_blocknr, - s->s_blocksize); - brelse(bh); - return 1; - } - - if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { - brelse(bh); - reiserfs_warning(s, "super-6519", "Unfinished reiserfsck " - "--rebuild-tree run detected. Please run\n" - "reiserfsck --rebuild-tree and wait for a " - "completion. If that fails\n" - "get newer reiserfsprogs package"); - return 1; - } - - reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and " - "scheduled to be removed from the kernel in 2025"); - SB_BUFFER_WITH_SB(s) = bh; - SB_DISK_SUPER_BLOCK(s) = rs; - - /* - * magic is of non-standard journal filesystem, look at s_version to - * find which format is in use - */ - if (is_reiserfs_jr(rs)) { - if (sb_version(rs) == REISERFS_VERSION_2) - reiserfs_info(s, "found reiserfs format \"3.6\"" - " with non-standard journal\n"); - else if (sb_version(rs) == REISERFS_VERSION_1) - reiserfs_info(s, "found reiserfs format \"3.5\"" - " with non-standard journal\n"); - else { - reiserfs_warning(s, "sh-2012", "found unknown " - "format \"%u\" of reiserfs with " - "non-standard magic", sb_version(rs)); - return 1; - } - } else - /* - * s_version of standard format may contain incorrect - * information, so we just look at the magic string - */ - reiserfs_info(s, - "found reiserfs format \"%s\" with standard journal\n", - is_reiserfs_3_5(rs) ? "3.5" : "3.6"); - - s->s_op = &reiserfs_sops; - s->s_export_op = &reiserfs_export_ops; -#ifdef CONFIG_QUOTA - s->s_qcop = &reiserfs_qctl_operations; - s->dq_op = &reiserfs_quota_operations; - s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; -#endif - - /* - * new format is limited by the 32 bit wide i_blocks field, want to - * be one full block below that. - */ - s->s_maxbytes = (512LL << 32) - s->s_blocksize; - return 0; -} - -/* after journal replay, reread all bitmap and super blocks */ -static int reread_meta_blocks(struct super_block *s) -{ - if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) { - reiserfs_warning(s, "reiserfs-2504", "error reading the super"); - return 1; - } - - return 0; -} - -/* hash detection stuff */ - -/* - * if root directory is empty - we set default - Yura's - hash and - * warn about it - * FIXME: we look for only one name in a directory. 
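An outline (error and magic-string checks omitted) of the two-phase read performed by read_super_block() above: probe with the current block size, take the real block size from the on-disk super block, switch the superblock over to it, and re-read so the cached buffer has the right granularity:

	static struct buffer_head *sketch_probe_sb(struct super_block *s, int offset)
	{
		struct buffer_head *bh = sb_bread(s, offset / s->s_blocksize);
		struct reiserfs_super_block *rs;

		if (!bh)
			return NULL;
		rs = (struct reiserfs_super_block *)bh->b_data;
		sb_set_blocksize(s, sb_blocksize(rs));	/* switch to the on-disk size */
		brelse(bh);
		return sb_bread(s, offset / s->s_blocksize);	/* re-read at that size */
	}
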
If tea and yura - * both have the same value - we ask user to send report to the - * mailing list - */ -static __u32 find_hash_out(struct super_block *s) -{ - int retval; - struct inode *inode; - struct cpu_key key; - INITIALIZE_PATH(path); - struct reiserfs_dir_entry de; - struct reiserfs_de_head *deh; - __u32 hash = DEFAULT_HASH; - __u32 deh_hashval, teahash, r5hash, yurahash; - - inode = d_inode(s->s_root); - - make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); - retval = search_by_entry_key(s, &key, &path, &de); - if (retval == IO_ERROR) { - pathrelse(&path); - return UNSET_HASH; - } - if (retval == NAME_NOT_FOUND) - de.de_entry_num--; - - set_de_name_and_namelen(&de); - deh = de.de_deh + de.de_entry_num; - - if (deh_offset(deh) == DOT_DOT_OFFSET) { - /* allow override in this case */ - if (reiserfs_rupasov_hash(s)) - hash = YURA_HASH; - reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n"); - goto out; - } - - deh_hashval = GET_HASH_VALUE(deh_offset(deh)); - r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); - teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); - yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); - - if ((teahash == r5hash && deh_hashval == r5hash) || - (teahash == yurahash && deh_hashval == yurahash) || - (r5hash == yurahash && deh_hashval == yurahash)) { - reiserfs_warning(s, "reiserfs-2506", - "Unable to automatically detect hash " - "function. Please mount with -o " - "hash={tea,rupasov,r5}"); - hash = UNSET_HASH; - goto out; - } - - if (deh_hashval == yurahash) - hash = YURA_HASH; - else if (deh_hashval == teahash) - hash = TEA_HASH; - else if (deh_hashval == r5hash) - hash = R5_HASH; - else { - reiserfs_warning(s, "reiserfs-2506", - "Unrecognised hash function"); - hash = UNSET_HASH; - } -out: - pathrelse(&path); - return hash; -} - -/* finds out which hash names are sorted with */ -static int what_hash(struct super_block *s) -{ - __u32 code; - - code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); - - /* - * reiserfs_hash_detect() == true if any of the hash mount options - * were used. 
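The decision at the heart of find_hash_out(), pulled out as a sketch: the hash value stored in the sample entry's offset is compared with the three candidate hash functions applied to the same name, and detection gives up when two candidates collide on that sample:

	static __u32 sketch_classify_hash(__u32 stored, __u32 tea, __u32 r5, __u32 yura)
	{
		/* ambiguous sample: two different functions yield the stored value */
		if ((tea == r5 && stored == r5) ||
		    (tea == yura && stored == yura) ||
		    (r5 == yura && stored == yura))
			return UNSET_HASH;
		if (stored == yura)
			return YURA_HASH;
		if (stored == tea)
			return TEA_HASH;
		if (stored == r5)
			return R5_HASH;
		return UNSET_HASH;	/* none matched: unrecognised hash */
	}
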
We must check them to make sure the user isn't - * using a bad hash value - */ - if (code == UNSET_HASH || reiserfs_hash_detect(s)) - code = find_hash_out(s); - - if (code != UNSET_HASH && reiserfs_hash_detect(s)) { - /* - * detection has found the hash, and we must check against the - * mount options - */ - if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { - reiserfs_warning(s, "reiserfs-2507", - "Error, %s hash detected, " - "unable to force rupasov hash", - reiserfs_hashname(code)); - code = UNSET_HASH; - } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { - reiserfs_warning(s, "reiserfs-2508", - "Error, %s hash detected, " - "unable to force tea hash", - reiserfs_hashname(code)); - code = UNSET_HASH; - } else if (reiserfs_r5_hash(s) && code != R5_HASH) { - reiserfs_warning(s, "reiserfs-2509", - "Error, %s hash detected, " - "unable to force r5 hash", - reiserfs_hashname(code)); - code = UNSET_HASH; - } - } else { - /* - * find_hash_out was not called or - * could not determine the hash - */ - if (reiserfs_rupasov_hash(s)) { - code = YURA_HASH; - } else if (reiserfs_tea_hash(s)) { - code = TEA_HASH; - } else if (reiserfs_r5_hash(s)) { - code = R5_HASH; - } - } - - /* - * if we are mounted RW, and we have a new valid hash code, update - * the super - */ - if (code != UNSET_HASH && - !sb_rdonly(s) && - code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { - set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); - } - return code; -} - -/* return pointer to appropriate function */ -static hashf_t hash_function(struct super_block *s) -{ - switch (what_hash(s)) { - case TEA_HASH: - reiserfs_info(s, "Using tea hash to sort names\n"); - return keyed_hash; - case YURA_HASH: - reiserfs_info(s, "Using rupasov hash to sort names\n"); - return yura_hash; - case R5_HASH: - reiserfs_info(s, "Using r5 hash to sort names\n"); - return r5_hash; - } - return NULL; -} - -/* this is used to set up correct value for old partitions */ -static int function2code(hashf_t func) -{ - if (func == keyed_hash) - return TEA_HASH; - if (func == yura_hash) - return YURA_HASH; - if (func == r5_hash) - return R5_HASH; - - BUG(); /* should never happen */ - - return 0; -} - -#define SWARN(silent, s, id, ...) 
\ - if (!(silent)) \ - reiserfs_warning(s, id, __VA_ARGS__) - -static int reiserfs_fill_super(struct super_block *s, void *data, int silent) -{ - struct inode *root_inode; - struct reiserfs_transaction_handle th; - int old_format = 0; - unsigned long blocks; - unsigned int commit_max_age = 0; - int jinit_done = 0; - struct reiserfs_iget_args args; - struct reiserfs_super_block *rs; - char *jdev_name; - struct reiserfs_sb_info *sbi; - int errval = -EINVAL; - char *qf_names[REISERFS_MAXQUOTAS] = {}; - unsigned int qfmt = 0; - - sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - s->s_fs_info = sbi; - /* Set default values for options: non-aggressive tails, RO on errors */ - sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); - sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); - sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); - /* no preallocation minimum, be smart in reiserfs_file_write instead */ - sbi->s_alloc_options.preallocmin = 0; - /* Preallocate by 16 blocks (17-1) at once */ - sbi->s_alloc_options.preallocsize = 17; - /* setup default block allocator options */ - reiserfs_init_alloc_options(s); - - spin_lock_init(&sbi->old_work_lock); - INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits); - mutex_init(&sbi->lock); - sbi->lock_depth = -1; - - sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0, - s->s_id); - if (!sbi->commit_wq) { - SWARN(silent, s, "", "Cannot allocate commit workqueue"); - errval = -ENOMEM; - goto error_unlocked; - } - - jdev_name = NULL; - if (reiserfs_parse_options - (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name, - &commit_max_age, qf_names, &qfmt) == 0) { - goto error_unlocked; - } - if (jdev_name && jdev_name[0]) { - sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL); - if (!sbi->s_jdev) { - SWARN(silent, s, "", "Cannot allocate memory for " - "journal device name"); - goto error_unlocked; - } - } -#ifdef CONFIG_QUOTA - handle_quota_files(s, qf_names, &qfmt); -#endif - - if (blocks) { - SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error_unlocked; - } - - /* - * try old format (undistributed bitmap, super block in 8-th 1k - * block of a device) - */ - if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) - old_format = 1; - - /* - * try new format (64-th 1k block), which can contain reiserfs - * super block - */ - else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { - SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", - s->s_id); - goto error_unlocked; - } - - s->s_time_min = 0; - s->s_time_max = U32_MAX; - - rs = SB_DISK_SUPER_BLOCK(s); - /* - * Let's do basic sanity check to verify that underlying device is not - * smaller than the filesystem. If the check fails then abort and - * scream, because bad stuff will happen otherwise. 
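For reference, a sketch of the two fixed probe locations used above, assuming the usual values of REISERFS_OLD_DISK_OFFSET_IN_BYTES and REISERFS_DISK_OFFSET_IN_BYTES; the old-format location is tried first and the standard location is the fallback:

	enum {
		SKETCH_OLD_SB_OFFSET = 8 * 1024,	/* old format, "8-th 1k block"  */
		SKETCH_STD_SB_OFFSET = 64 * 1024,	/* standard, "64-th 1k block"   */
	};

	static int sketch_find_format(struct super_block *s)
	{
		if (read_super_block(s, SKETCH_OLD_SB_OFFSET) == 0)
			return 1;	/* old format (undistributed bitmap) */
		if (read_super_block(s, SKETCH_STD_SB_OFFSET) == 0)
			return 0;	/* new format                        */
		return -1;		/* no reiserfs signature found       */
	}
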
- */ - if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) { - SWARN(silent, s, "", "Filesystem cannot be " - "mounted because it is bigger than the device"); - SWARN(silent, s, "", "You may need to run fsck " - "or increase size of your LVM partition"); - SWARN(silent, s, "", "Or may be you forgot to " - "reboot after fdisk when it told you to"); - goto error_unlocked; - } - - sbi->s_mount_state = SB_REISERFS_STATE(s); - sbi->s_mount_state = REISERFS_VALID_FS; - - if ((errval = reiserfs_init_bitmap_cache(s))) { - SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error_unlocked; - } - - errval = -EINVAL; -#ifdef CONFIG_REISERFS_CHECK - SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); - SWARN(silent, s, "", "- it is slow mode for debugging."); -#endif - - /* make data=ordered the default */ - if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && - !reiserfs_data_writeback(s)) { - sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); - } - - if (reiserfs_data_log(s)) { - reiserfs_info(s, "using journaled data mode\n"); - } else if (reiserfs_data_ordered(s)) { - reiserfs_info(s, "using ordered data mode\n"); - } else { - reiserfs_info(s, "using writeback data mode\n"); - } - if (reiserfs_barrier_flush(s)) { - printk("reiserfs: using flush barriers\n"); - } - - if (journal_init(s, jdev_name, old_format, commit_max_age)) { - SWARN(silent, s, "sh-2022", - "unable to initialize journal space"); - goto error_unlocked; - } else { - /* - * once this is set, journal_release must be called - * if we error out of the mount - */ - jinit_done = 1; - } - - if (reread_meta_blocks(s)) { - SWARN(silent, s, "jmacd-9", - "unable to reread meta blocks after journal init"); - goto error_unlocked; - } - - if (replay_only(s)) - goto error_unlocked; - - s->s_xattr = reiserfs_xattr_handlers; - - if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { - SWARN(silent, s, "clm-7000", - "Detected readonly device, marking FS readonly"); - s->s_flags |= SB_RDONLY; - } - args.objectid = REISERFS_ROOT_OBJECTID; - args.dirid = REISERFS_ROOT_PARENT_OBJECTID; - root_inode = - iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, - reiserfs_init_locked_inode, (void *)&args); - if (!root_inode) { - SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error_unlocked; - } - - /* - * This path assumed to be called with the BKL in the old times. - * Now we have inherited the big reiserfs lock from it and many - * reiserfs helpers called in the mount path and elsewhere require - * this lock to be held even if it's not always necessary. Let's be - * conservative and hold it early. The window can be reduced after - * careful review of the code. 
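The root inode above is obtained with the generic iget5_locked() idiom; a sketch of that pattern follows, with the sketch_* callbacks standing in for reiserfs_find_actor(), reiserfs_init_locked_inode() and reiserfs_read_locked_inode():

	static int sketch_find_actor(struct inode *inode, void *args);
	static int sketch_init_locked(struct inode *inode, void *args);
	static void sketch_read_inode(struct inode *inode, void *args);

	static struct inode *sketch_iget(struct super_block *sb,
					 unsigned long objectid, void *args)
	{
		struct inode *inode = iget5_locked(sb, objectid, sketch_find_actor,
						   sketch_init_locked, args);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (inode->i_state & I_NEW) {
			sketch_read_inode(inode, args);	/* only new inodes hit the disk */
			unlock_new_inode(inode);
		}
		return inode;
	}
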
- */ - reiserfs_write_lock(s); - - if (root_inode->i_state & I_NEW) { - reiserfs_read_locked_inode(root_inode, &args); - unlock_new_inode(root_inode); - } - - if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) || - !root_inode->i_size) { - SWARN(silent, s, "", "corrupt root inode, run fsck"); - iput(root_inode); - errval = -EUCLEAN; - goto error; - } - - s->s_root = d_make_root(root_inode); - if (!s->s_root) - goto error; - /* define and initialize hash function */ - sbi->s_hash_function = hash_function(s); - if (sbi->s_hash_function == NULL) { - dput(s->s_root); - s->s_root = NULL; - goto error; - } - - if (is_reiserfs_3_5(rs) - || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) - set_bit(REISERFS_3_5, &sbi->s_properties); - else if (old_format) - set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties); - else - set_bit(REISERFS_3_6, &sbi->s_properties); - - if (!sb_rdonly(s)) { - - errval = journal_begin(&th, s, 1); - if (errval) { - dput(s->s_root); - s->s_root = NULL; - goto error; - } - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - - set_sb_umount_state(rs, REISERFS_ERROR_FS); - set_sb_fs_state(rs, 0); - - /* - * Clear out s_bmap_nr if it would wrap. We can handle this - * case, but older revisions can't. This will cause the - * file system to fail mount on those older implementations, - * avoiding corruption. -jeffm - */ - if (bmap_would_wrap(reiserfs_bmap_count(s)) && - sb_bmap_nr(rs) != 0) { - reiserfs_warning(s, "super-2030", "This file system " - "claims to use %u bitmap blocks in " - "its super block, but requires %u. " - "Clearing to zero.", sb_bmap_nr(rs), - reiserfs_bmap_count(s)); - - set_sb_bmap_nr(rs, 0); - } - - if (old_format_only(s)) { - /* - * filesystem of format 3.5 either with standard - * or non-standard journal - */ - if (convert_reiserfs(s)) { - /* and -o conv is given */ - if (!silent) - reiserfs_info(s, - "converting 3.5 filesystem to the 3.6 format"); - - if (is_reiserfs_3_5(rs)) - /* - * put magic string of 3.6 format. - * 2.2 will not be able to - * mount this filesystem anymore - */ - memcpy(rs->s_v1.s_magic, - reiserfs_3_6_magic_string, - sizeof - (reiserfs_3_6_magic_string)); - - set_sb_version(rs, REISERFS_VERSION_2); - reiserfs_convert_objectid_map_v1(s); - set_bit(REISERFS_3_6, &sbi->s_properties); - clear_bit(REISERFS_3_5, &sbi->s_properties); - } else if (!silent) { - reiserfs_info(s, "using 3.5.x disk format\n"); - } - } else - set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); - - - journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); - errval = journal_end(&th); - if (errval) { - dput(s->s_root); - s->s_root = NULL; - goto error; - } - - reiserfs_write_unlock(s); - if ((errval = reiserfs_lookup_privroot(s)) || - (errval = reiserfs_xattr_init(s, s->s_flags))) { - dput(s->s_root); - s->s_root = NULL; - goto error_unlocked; - } - reiserfs_write_lock(s); - - /* - * look for files which were to be removed in previous session - */ - finish_unfinished(s); - } else { - if (old_format_only(s) && !silent) { - reiserfs_info(s, "using 3.5.x disk format\n"); - } - - reiserfs_write_unlock(s); - if ((errval = reiserfs_lookup_privroot(s)) || - (errval = reiserfs_xattr_init(s, s->s_flags))) { - dput(s->s_root); - s->s_root = NULL; - goto error_unlocked; - } - reiserfs_write_lock(s); - } - /* - * mark hash in super block: it could be unset. 
overwrite should be ok - */ - set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); - - handle_attrs(s); - - reiserfs_proc_info_init(s); - - init_waitqueue_head(&(sbi->s_wait)); - spin_lock_init(&sbi->bitmap_lock); - - reiserfs_write_unlock(s); - - return (0); - -error: - reiserfs_write_unlock(s); - -error_unlocked: - /* kill the commit thread, free journal ram */ - if (jinit_done) { - reiserfs_write_lock(s); - journal_release_error(NULL, s); - reiserfs_write_unlock(s); - } - - if (sbi->commit_wq) - destroy_workqueue(sbi->commit_wq); - - reiserfs_cancel_old_flush(s); - - reiserfs_free_bitmap_cache(s); - if (SB_BUFFER_WITH_SB(s)) - brelse(SB_BUFFER_WITH_SB(s)); -#ifdef CONFIG_QUOTA - { - int j; - for (j = 0; j < REISERFS_MAXQUOTAS; j++) - kfree(qf_names[j]); - } -#endif - kfree(sbi->s_jdev); - kfree(sbi); - - s->s_fs_info = NULL; - return errval; -} - -static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); - - buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); - buf->f_bfree = sb_free_blocks(rs); - buf->f_bavail = buf->f_bfree; - buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; - buf->f_bsize = dentry->d_sb->s_blocksize; - /* changed to accommodate gcc folks. */ - buf->f_type = REISERFS_SUPER_MAGIC; - buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2); - buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2, - sizeof(rs->s_uuid)/2); - - return 0; -} - -#ifdef CONFIG_QUOTA -static int reiserfs_write_dquot(struct dquot *dquot) -{ - struct reiserfs_transaction_handle th; - int ret, err; - int depth; - - reiserfs_write_lock(dquot->dq_sb); - ret = - journal_begin(&th, dquot->dq_sb, - REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); - if (ret) - goto out; - depth = reiserfs_write_unlock_nested(dquot->dq_sb); - ret = dquot_commit(dquot); - reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = journal_end(&th); - if (!ret && err) - ret = err; -out: - reiserfs_write_unlock(dquot->dq_sb); - return ret; -} - -static int reiserfs_acquire_dquot(struct dquot *dquot) -{ - struct reiserfs_transaction_handle th; - int ret, err; - int depth; - - reiserfs_write_lock(dquot->dq_sb); - ret = - journal_begin(&th, dquot->dq_sb, - REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); - if (ret) - goto out; - depth = reiserfs_write_unlock_nested(dquot->dq_sb); - ret = dquot_acquire(dquot); - reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = journal_end(&th); - if (!ret && err) - ret = err; -out: - reiserfs_write_unlock(dquot->dq_sb); - return ret; -} - -static int reiserfs_release_dquot(struct dquot *dquot) -{ - struct reiserfs_transaction_handle th; - int ret, err; - - reiserfs_write_lock(dquot->dq_sb); - ret = - journal_begin(&th, dquot->dq_sb, - REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); - reiserfs_write_unlock(dquot->dq_sb); - if (ret) { - /* Release dquot anyway to avoid endless cycle in dqput() */ - dquot_release(dquot); - goto out; - } - ret = dquot_release(dquot); - reiserfs_write_lock(dquot->dq_sb); - err = journal_end(&th); - if (!ret && err) - ret = err; - reiserfs_write_unlock(dquot->dq_sb); -out: - return ret; -} - -static int reiserfs_mark_dquot_dirty(struct dquot *dquot) -{ - /* Are we journaling quotas? 
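The dquot callbacks above all share one shape; a condensed sketch of it, using only functions that appear in this file: open a transaction sized for the operation, drop the tree lock (remembering its nesting depth) around the generic dquot call, then close the transaction and report the first error:

	static int sketch_journalled_dquot_op(struct dquot *dquot,
					      int (*op)(struct dquot *), int blocks)
	{
		struct reiserfs_transaction_handle th;
		int ret, err, depth;

		reiserfs_write_lock(dquot->dq_sb);
		ret = journal_begin(&th, dquot->dq_sb, blocks);
		if (ret)
			goto out;
		depth = reiserfs_write_unlock_nested(dquot->dq_sb);
		ret = op(dquot);				/* e.g. dquot_commit() */
		reiserfs_write_lock_nested(dquot->dq_sb, depth);
		err = journal_end(&th);
		if (!ret && err)
			ret = err;
	out:
		reiserfs_write_unlock(dquot->dq_sb);
		return ret;
	}
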
*/ - if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { - dquot_mark_dquot_dirty(dquot); - return reiserfs_write_dquot(dquot); - } else - return dquot_mark_dquot_dirty(dquot); -} - -static int reiserfs_write_info(struct super_block *sb, int type) -{ - struct reiserfs_transaction_handle th; - int ret, err; - int depth; - - /* Data block + inode block */ - reiserfs_write_lock(sb); - ret = journal_begin(&th, sb, 2); - if (ret) - goto out; - depth = reiserfs_write_unlock_nested(sb); - ret = dquot_commit_info(sb, type); - reiserfs_write_lock_nested(sb, depth); - err = journal_end(&th); - if (!ret && err) - ret = err; -out: - reiserfs_write_unlock(sb); - return ret; -} - -/* - * Turn on quotas during mount time - we need to find the quota file and such... - */ -static int reiserfs_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], - REISERFS_SB(sb)->s_jquota_fmt, type); -} - -/* - * Standard function to be called on quota_on - */ -static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - const struct path *path) -{ - int err; - struct inode *inode; - struct reiserfs_transaction_handle th; - int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; - - reiserfs_write_lock(sb); - if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { - err = -EINVAL; - goto out; - } - - /* Quotafile not on the same filesystem? */ - if (path->dentry->d_sb != sb) { - err = -EXDEV; - goto out; - } - inode = d_inode(path->dentry); - /* - * We must not pack tails for quota files on reiserfs for quota - * IO to work - */ - if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { - err = reiserfs_unpack(inode); - if (err) { - reiserfs_warning(sb, "super-6520", - "Unpacking tail of quota file failed" - " (%d). Cannot turn on quotas.", err); - err = -EINVAL; - goto out; - } - mark_inode_dirty(inode); - } - /* Journaling quota? */ - if (REISERFS_SB(sb)->s_qf_names[type]) { - /* Quotafile not of fs root? */ - if (path->dentry->d_parent != sb->s_root) - reiserfs_warning(sb, "super-6521", - "Quota file not on filesystem root. " - "Journalled quota will not work."); - } - - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... 
- */ - if (reiserfs_file_data_log(inode)) { - /* Just start temporary transaction and finish it */ - err = journal_begin(&th, sb, 1); - if (err) - goto out; - err = journal_end_sync(&th); - if (err) - goto out; - } - reiserfs_write_unlock(sb); - err = dquot_quota_on(sb, type, format_id, path); - if (!err) { - inode_lock(inode); - REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL | - REISERFS_NOATIME_FL; - inode_set_flags(inode, S_IMMUTABLE | S_NOATIME, - S_IMMUTABLE | S_NOATIME); - inode_unlock(inode); - mark_inode_dirty(inode); - } - return err; -out: - reiserfs_write_unlock(sb); - return err; -} - -static int reiserfs_quota_off(struct super_block *sb, int type) -{ - int err; - struct inode *inode = sb_dqopt(sb)->files[type]; - - if (!inode || !igrab(inode)) - goto out; - - err = dquot_quota_off(sb, type); - if (err) - goto out_put; - - inode_lock(inode); - REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL | - REISERFS_NOATIME_FL); - inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME); - inode_unlock(inode); - mark_inode_dirty(inode); -out_put: - iput(inode); - return err; -out: - return dquot_quota_off(sb, type); -} - -/* - * Read data from quotafile - avoid pagecache and such because we cannot afford - * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and no one else should touch the files) - * we don't have to be afraid of races - */ -static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - unsigned long blk = off >> sb->s_blocksize_bits; - int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; - size_t toread; - struct buffer_head tmp_bh, *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off + len > i_size) - len = i_size - off; - toread = len; - while (toread > 0) { - tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); - tmp_bh.b_state = 0; - /* - * Quota files are without tails so we can safely - * use this function - */ - reiserfs_write_lock(sb); - err = reiserfs_get_block(inode, blk, &tmp_bh, 0); - reiserfs_write_unlock(sb); - if (err) - return err; - if (!buffer_mapped(&tmp_bh)) /* A hole? 
*/ - memset(data, 0, tocopy); - else { - bh = sb_bread(sb, tmp_bh.b_blocknr); - if (!bh) - return -EIO; - memcpy(data, bh->b_data + offset, tocopy); - brelse(bh); - } - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* - * Write to quotafile (we know the transaction is already started and has - * enough credits) - */ -static ssize_t reiserfs_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - unsigned long blk = off >> sb->s_blocksize_bits; - int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; - int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; - struct buffer_head tmp_bh, *bh; - - if (!current->journal_info) { - printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - while (towrite > 0) { - tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - tmp_bh.b_state = 0; - reiserfs_write_lock(sb); - err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); - reiserfs_write_unlock(sb); - if (err) - goto out; - if (offset || tocopy != sb->s_blocksize) - bh = sb_bread(sb, tmp_bh.b_blocknr); - else - bh = sb_getblk(sb, tmp_bh.b_blocknr); - if (!bh) { - err = -EIO; - goto out; - } - lock_buffer(bh); - memcpy(bh->b_data + offset, data, tocopy); - flush_dcache_page(bh->b_page); - set_buffer_uptodate(bh); - unlock_buffer(bh); - reiserfs_write_lock(sb); - reiserfs_prepare_for_journal(sb, bh, 1); - journal_mark_dirty(current->journal_info, bh); - if (!journal_quota) - reiserfs_add_ordered_list(inode, bh); - reiserfs_write_unlock(sb); - brelse(bh); - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; - } -out: - if (len == towrite) - return err; - if (inode->i_size < off + len - towrite) - i_size_write(inode, off + len - towrite); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - mark_inode_dirty(inode); - return len - towrite; -} - -#endif - -static struct dentry *get_super_block(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); -} - -static int __init init_reiserfs_fs(void) -{ - int ret; - - ret = init_inodecache(); - if (ret) - return ret; - - reiserfs_proc_info_global_init(); - - ret = register_filesystem(&reiserfs_fs_type); - if (ret) - goto out; - - return 0; -out: - reiserfs_proc_info_global_done(); - destroy_inodecache(); - - return ret; -} - -static void __exit exit_reiserfs_fs(void) -{ - reiserfs_proc_info_global_done(); - unregister_filesystem(&reiserfs_fs_type); - destroy_inodecache(); -} - -struct file_system_type reiserfs_fs_type = { - .owner = THIS_MODULE, - .name = "reiserfs", - .mount = get_super_block, - .kill_sb = reiserfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("reiserfs"); - -MODULE_DESCRIPTION("ReiserFS journaled filesystem"); -MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>"); -MODULE_LICENSE("GPL"); - -module_init(init_reiserfs_fs); -module_exit(exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c deleted file mode 100644 index 2cec61af2a9e..000000000000 --- a/fs/reiserfs/tail_conversion.c +++ /dev/null @@ -1,318 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright - * details - */ - -#include 
<linux/time.h> -#include <linux/pagemap.h> -#include <linux/buffer_head.h> -#include "reiserfs.h" - -/* - * access to tail : when one is going to read tail it must make sure, that is - * not running. direct2indirect and indirect2direct can not run concurrently - */ - -/* - * Converts direct items to an unformatted node. Panics if file has no - * tail. -ENOSPC if no disk space for conversion - */ -/* - * path points to first direct item of the file regardless of how many of - * them are there - */ -int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, - struct treepath *path, struct buffer_head *unbh, - loff_t tail_offset) -{ - struct super_block *sb = inode->i_sb; - struct buffer_head *up_to_date_bh; - struct item_head *p_le_ih = tp_item_head(path); - unsigned long total_tail = 0; - - /* Key to search for the last byte of the converted item. */ - struct cpu_key end_key; - - /* - * new indirect item to be inserted or key - * of unfm pointer to be pasted - */ - struct item_head ind_ih; - int blk_size; - /* returned value for reiserfs_insert_item and clones */ - int retval; - /* Handle on an unformatted node that will be inserted in the tree. */ - unp_t unfm_ptr; - - BUG_ON(!th->t_trans_id); - - REISERFS_SB(sb)->s_direct2indirect++; - - blk_size = sb->s_blocksize; - - /* - * and key to search for append or insert pointer to the new - * unformatted node. - */ - copy_item_head(&ind_ih, p_le_ih); - set_le_ih_k_offset(&ind_ih, tail_offset); - set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); - - /* Set the key to search for the place for new unfm pointer */ - make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); - - /* FIXME: we could avoid this */ - if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { - reiserfs_error(sb, "PAP-14030", - "pasted or inserted byte exists in " - "the tree %K. Use fsck to repair.", &end_key); - pathrelse(path); - return -EIO; - } - - p_le_ih = tp_item_head(path); - - unfm_ptr = cpu_to_le32(unbh->b_blocknr); - - if (is_statdata_le_ih(p_le_ih)) { - /* Insert new indirect item. */ - set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ - put_ih_item_len(&ind_ih, UNFM_P_SIZE); - PATH_LAST_POSITION(path)++; - retval = - reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, - (char *)&unfm_ptr); - } else { - /* Paste into last indirect item of an object. */ - retval = reiserfs_paste_into_item(th, path, &end_key, inode, - (char *)&unfm_ptr, - UNFM_P_SIZE); - } - if (retval) { - return retval; - } - /* - * note: from here there are two keys which have matching first - * three key components. They only differ by the fourth one. - */ - - /* Set the key to search for the direct items of the file */ - make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, - 4); - - /* - * Move bytes from the direct items to the new unformatted node - * and delete them. - */ - while (1) { - int tail_size; - - /* - * end_key.k_offset is set so, that we will always have found - * last item of the file - */ - if (search_for_position_by_key(sb, &end_key, path) == - POSITION_FOUND) - reiserfs_panic(sb, "PAP-14050", - "direct item (%K) not found", &end_key); - p_le_ih = tp_item_head(path); - RFALSE(!is_direct_le_ih(p_le_ih), - "vs-14055: direct item expected(%K), found %h", - &end_key, p_le_ih); - tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) - + ih_item_len(p_le_ih) - 1; - - /* - * we only send the unbh pointer if the buffer is not - * up to date. 
this avoids overwriting good data from - * writepage() with old data from the disk or buffer cache - * Special case: unbh->b_page will be NULL if we are coming - * through DIRECT_IO handler here. - */ - if (!unbh->b_page || buffer_uptodate(unbh) - || PageUptodate(unbh->b_page)) { - up_to_date_bh = NULL; - } else { - up_to_date_bh = unbh; - } - retval = reiserfs_delete_item(th, path, &end_key, inode, - up_to_date_bh); - - total_tail += retval; - - /* done: file does not have direct items anymore */ - if (tail_size == retval) - break; - - } - /* - * if we've copied bytes from disk into the page, we need to zero - * out the unused part of the block (it was not up to date before) - */ - if (up_to_date_bh) { - unsigned pgoff = - (tail_offset + total_tail - 1) & (PAGE_SIZE - 1); - char *kaddr = kmap_atomic(up_to_date_bh->b_page); - memset(kaddr + pgoff, 0, blk_size - total_tail); - kunmap_atomic(kaddr); - } - - REISERFS_I(inode)->i_first_direct_byte = U32_MAX; - - return 0; -} - -/* stolen from fs/buffer.c */ -void reiserfs_unmap_buffer(struct buffer_head *bh) -{ - lock_buffer(bh); - if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { - BUG(); - } - clear_buffer_dirty(bh); - /* - * Remove the buffer from whatever list it belongs to. We are mostly - * interested in removing it from per-sb j_dirty_buffers list, to avoid - * BUG() on attempt to write not mapped buffer - */ - if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { - struct inode *inode = bh->b_folio->mapping->host; - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); - spin_lock(&j->j_dirty_buffers_lock); - list_del_init(&bh->b_assoc_buffers); - reiserfs_free_jh(bh); - spin_unlock(&j->j_dirty_buffers_lock); - } - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - bh->b_bdev = NULL; - unlock_buffer(bh); -} - -/* - * this first locks inode (neither reads nor sync are permitted), - * reads tail through page cache, insert direct item. When direct item - * inserted successfully inode is left locked. Return value is always - * what we expect from it (number of cut bytes). But when tail remains - * in the unformatted node, we set mode to SKIP_BALANCING and unlock - * inode - */ -int indirect2direct(struct reiserfs_transaction_handle *th, - struct inode *inode, struct page *page, - struct treepath *path, /* path to the indirect item. */ - const struct cpu_key *item_key, /* Key to look for - * unformatted node - * pointer to be cut. */ - loff_t n_new_file_size, /* New file size. */ - char *mode) -{ - struct super_block *sb = inode->i_sb; - struct item_head s_ih; - unsigned long block_size = sb->s_blocksize; - char *tail; - int tail_len, round_tail_len; - loff_t pos, pos1; /* position of first byte of the tail */ - struct cpu_key key; - - BUG_ON(!th->t_trans_id); - - REISERFS_SB(sb)->s_indirect2direct++; - - *mode = M_SKIP_BALANCING; - - /* store item head path points to. */ - copy_item_head(&s_ih, tp_item_head(path)); - - tail_len = (n_new_file_size & (block_size - 1)); - if (get_inode_sd_version(inode) == STAT_DATA_V2) - round_tail_len = ROUND_UP(tail_len); - else - round_tail_len = tail_len; - - pos = - le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - - 1) * sb->s_blocksize; - pos1 = pos; - - /* - * we are protected by i_mutex. 
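A sketch of the arithmetic used just above when the tail was copied from a buffer that was not up to date: the end of the copied tail is located within its page with a PAGE_SIZE mask, and the remainder of the block is zeroed because it never held valid data:

	static void sketch_zero_tail_slack(char *kaddr, loff_t tail_offset,
					   unsigned long total_tail, int blk_size)
	{
		unsigned int pgoff = (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);

		memset(kaddr + pgoff, 0, blk_size - total_tail);
	}
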
The tail can not disapper, not - * append can be done either - * we are in truncate or packing tail in file_release - */ - - tail = (char *)kmap(page); /* this can schedule */ - - if (path_changed(&s_ih, path)) { - /* re-search indirect item */ - if (search_for_position_by_key(sb, item_key, path) - == POSITION_NOT_FOUND) - reiserfs_panic(sb, "PAP-5520", - "item to be converted %K does not exist", - item_key); - copy_item_head(&s_ih, tp_item_head(path)); -#ifdef CONFIG_REISERFS_CHECK - pos = le_ih_k_offset(&s_ih) - 1 + - (ih_item_len(&s_ih) / UNFM_P_SIZE - - 1) * sb->s_blocksize; - if (pos != pos1) - reiserfs_panic(sb, "vs-5530", "tail position " - "changed while we were reading it"); -#endif - } - - /* Set direct item header to insert. */ - make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), - pos1 + 1, TYPE_DIRECT, round_tail_len, - 0xffff /*ih_free_space */ ); - - /* - * we want a pointer to the first byte of the tail in the page. - * the page was locked and this part of the page was up to date when - * indirect2direct was called, so we know the bytes are still valid - */ - tail = tail + (pos & (PAGE_SIZE - 1)); - - PATH_LAST_POSITION(path)++; - - key = *item_key; - set_cpu_key_k_type(&key, TYPE_DIRECT); - key.key_length = 4; - /* Insert tail as new direct item in the tree */ - if (reiserfs_insert_item(th, path, &key, &s_ih, inode, - tail ? tail : NULL) < 0) { - /* - * No disk memory. So we can not convert last unformatted node - * to the direct item. In this case we used to adjust - * indirect items's ih_free_space. Now ih_free_space is not - * used, it would be ideal to write zeros to corresponding - * unformatted node. For now i_size is considered as guard for - * going out of file size - */ - kunmap(page); - return block_size - round_tail_len; - } - kunmap(page); - - /* make sure to get the i_blocks changes from reiserfs_insert_item */ - reiserfs_update_sd(th, inode); - - /* - * note: we have now the same as in above direct2indirect - * conversion: there are two keys which have matching first three - * key components. They only differ by the fourth one. - */ - - /* - * We have inserted new direct item and must remove last - * unformatted node. - */ - *mode = M_CUT; - - /* we store position of first direct item in the in-core inode */ - /* mark_file_with_tail (inode, pos1 + 1); */ - REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; - - return block_size - round_tail_len; -} diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c deleted file mode 100644 index 998035a6388e..000000000000 --- a/fs/reiserfs/xattr.c +++ /dev/null @@ -1,1039 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/reiserfs/xattr.c - * - * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com> - * - */ - -/* - * In order to implement EA/ACLs in a clean, backwards compatible manner, - * they are implemented as files in a "private" directory. - * Each EA is in it's own file, with the directory layout like so (/ is assumed - * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, - * directories named using the capital-hex form of the objectid and - * generation number are used. Inside each directory are individual files - * named with the name of the extended attribute. - * - * So, for objectid 12648430, we could have: - * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access - * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default - * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type - * .. or similar. - * - * The file contents are the text of the EA. 
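A sketch of how the per-inode xattr directory name described in the layout above is formed, matching the snprintf("%X.%X", ...) used later in open_xa_dir(): capital-hex object id, a dot, then the generation number:

	static void sketch_xattr_dirname(char *buf, size_t len,
					 __u32 objectid, __u32 generation)
	{
		snprintf(buf, len, "%X.%X", objectid, generation);	/* e.g. "C0FFEE.0" */
	}
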
The size is known based on the - * stat data describing the file. - * - * In the case of system.posix_acl_access and system.posix_acl_default, since - * these are special cases for filesystem ACLs, they are interpreted by the - * kernel, in addition, they are negatively and positively cached and attached - * to the inode so that unnecessary lookups are avoided. - * - * Locking works like so: - * Directory components (xattr root, xattr dir) are protectd by their i_mutex. - * The xattrs themselves are protected by the xattr_sem. - */ - -#include "reiserfs.h" -#include <linux/capability.h> -#include <linux/dcache.h> -#include <linux/namei.h> -#include <linux/errno.h> -#include <linux/gfp.h> -#include <linux/fs.h> -#include <linux/file.h> -#include <linux/pagemap.h> -#include <linux/xattr.h> -#include "xattr.h" -#include "acl.h" -#include <linux/uaccess.h> -#include <net/checksum.h> -#include <linux/stat.h> -#include <linux/quotaops.h> -#include <linux/security.h> -#include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> - -#define PRIVROOT_NAME ".reiserfs_priv" -#define XAROOT_NAME "xattrs" - - -/* - * Helpers for inode ops. We do this so that we don't have all the VFS - * overhead and also for proper i_mutex annotation. - * dir->i_mutex must be held for all of them. - */ -#ifdef CONFIG_REISERFS_FS_XATTR -static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) -{ - BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true); -} -#endif - -static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode); -} - -/* - * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr - * mutation ops aren't called during rename or splace, which are the - * only other users of I_MUTEX_CHILD. It violates the ordering, but that's - * better than allocating another subclass just for this code. 
- */ -static int xattr_unlink(struct inode *dir, struct dentry *dentry) -{ - int error; - - BUG_ON(!inode_is_locked(dir)); - - inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); - error = dir->i_op->unlink(dir, dentry); - inode_unlock(d_inode(dentry)); - - if (!error) - d_delete(dentry); - return error; -} - -static int xattr_rmdir(struct inode *dir, struct dentry *dentry) -{ - int error; - - BUG_ON(!inode_is_locked(dir)); - - inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); - error = dir->i_op->rmdir(dir, dentry); - if (!error) - d_inode(dentry)->i_flags |= S_DEAD; - inode_unlock(d_inode(dentry)); - if (!error) - d_delete(dentry); - - return error; -} - -#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE) - -static struct dentry *open_xa_root(struct super_block *sb, int flags) -{ - struct dentry *privroot = REISERFS_SB(sb)->priv_root; - struct dentry *xaroot; - - if (d_really_is_negative(privroot)) - return ERR_PTR(-EOPNOTSUPP); - - inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); - - xaroot = dget(REISERFS_SB(sb)->xattr_root); - if (!xaroot) - xaroot = ERR_PTR(-EOPNOTSUPP); - else if (d_really_is_negative(xaroot)) { - int err = -ENODATA; - - if (xattr_may_create(flags)) - err = xattr_mkdir(d_inode(privroot), xaroot, 0700); - if (err) { - dput(xaroot); - xaroot = ERR_PTR(err); - } - } - - inode_unlock(d_inode(privroot)); - return xaroot; -} - -static struct dentry *open_xa_dir(const struct inode *inode, int flags) -{ - struct dentry *xaroot, *xadir; - char namebuf[17]; - - xaroot = open_xa_root(inode->i_sb, flags); - if (IS_ERR(xaroot)) - return xaroot; - - snprintf(namebuf, sizeof(namebuf), "%X.%X", - le32_to_cpu(INODE_PKEY(inode)->k_objectid), - inode->i_generation); - - inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); - - xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); - if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { - int err = -ENODATA; - - if (xattr_may_create(flags)) - err = xattr_mkdir(d_inode(xaroot), xadir, 0700); - if (err) { - dput(xadir); - xadir = ERR_PTR(err); - } - } - - inode_unlock(d_inode(xaroot)); - dput(xaroot); - return xadir; -} - -/* - * The following are side effects of other operations that aren't explicitly - * modifying extended attributes. This includes operations such as permissions - * or ownership changes, object deletions, etc. - */ -struct reiserfs_dentry_buf { - struct dir_context ctx; - struct dentry *xadir; - int count; - int err; - struct dentry *dentries[8]; -}; - -static bool -fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct reiserfs_dentry_buf *dbuf = - container_of(ctx, struct reiserfs_dentry_buf, ctx); - struct dentry *dentry; - - WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); - - if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) - return false; - - if (name[0] == '.' && (namelen < 2 || - (namelen == 2 && name[1] == '.'))) - return true; - - dentry = lookup_one_len(name, dbuf->xadir, namelen); - if (IS_ERR(dentry)) { - dbuf->err = PTR_ERR(dentry); - return false; - } else if (d_really_is_negative(dentry)) { - /* A directory entry exists, but no file? 
*/ - reiserfs_error(dentry->d_sb, "xattr-20003", - "Corrupted directory: xattr %pd listed but " - "not found for file %pd.\n", - dentry, dbuf->xadir); - dput(dentry); - dbuf->err = -EIO; - return false; - } - - dbuf->dentries[dbuf->count++] = dentry; - return true; -} - -static void -cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) -{ - int i; - - for (i = 0; i < buf->count; i++) - if (buf->dentries[i]) - dput(buf->dentries[i]); -} - -static int reiserfs_for_each_xattr(struct inode *inode, - int (*action)(struct dentry *, void *), - void *data) -{ - struct dentry *dir; - int i, err = 0; - struct reiserfs_dentry_buf buf = { - .ctx.actor = fill_with_dentries, - }; - - /* Skip out, an xattr has no xattrs associated with it */ - if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) - return 0; - - dir = open_xa_dir(inode, XATTR_REPLACE); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } else if (d_really_is_negative(dir)) { - err = 0; - goto out_dir; - } - - inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); - - buf.xadir = dir; - while (1) { - err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - if (err) - break; - if (buf.err) { - err = buf.err; - break; - } - if (!buf.count) - break; - for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { - struct dentry *dentry = buf.dentries[i]; - - if (!d_is_dir(dentry)) - err = action(dentry, data); - - dput(dentry); - buf.dentries[i] = NULL; - } - if (err) - break; - buf.count = 0; - } - inode_unlock(d_inode(dir)); - - cleanup_dentry_buf(&buf); - - if (!err) { - /* - * We start a transaction here to avoid a ABBA situation - * between the xattr root's i_mutex and the journal lock. - * This doesn't incur much additional overhead since the - * new transaction will just nest inside the - * outer transaction. - */ - int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + - 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); - struct reiserfs_transaction_handle th; - - reiserfs_write_lock(inode->i_sb); - err = journal_begin(&th, inode->i_sb, blocks); - reiserfs_write_unlock(inode->i_sb); - if (!err) { - int jerror; - - inode_lock_nested(d_inode(dir->d_parent), - I_MUTEX_XATTR); - err = action(dir, data); - reiserfs_write_lock(inode->i_sb); - jerror = journal_end(&th); - reiserfs_write_unlock(inode->i_sb); - inode_unlock(d_inode(dir->d_parent)); - err = jerror ?: err; - } - } -out_dir: - dput(dir); -out: - /* - * -ENODATA: this object doesn't have any xattrs - * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk. - * Neither are errors - */ - if (err == -ENODATA || err == -EOPNOTSUPP) - err = 0; - return err; -} - -static int delete_one_xattr(struct dentry *dentry, void *data) -{ - struct inode *dir = d_inode(dentry->d_parent); - - /* This is the xattr dir, handle specially. */ - if (d_is_dir(dentry)) - return xattr_rmdir(dir, dentry); - - return xattr_unlink(dir, dentry); -} - -static int chown_one_xattr(struct dentry *dentry, void *data) -{ - struct iattr *attrs = data; - int ia_valid = attrs->ia_valid; - int err; - - /* - * We only want the ownership bits. Otherwise, we'll do - * things like change a directory to a regular file if - * ATTR_MODE is set. - */ - attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs); - attrs->ia_valid = ia_valid; - - return err; -} - -/* No i_mutex, but the inode is unconnected. 
*/ -int reiserfs_delete_xattrs(struct inode *inode) -{ - int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); - - if (err) - reiserfs_warning(inode->i_sb, "jdm-20004", - "Couldn't delete all xattrs (%d)\n", err); - return err; -} - -/* inode->i_mutex: down */ -int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) -{ - int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); - - if (err) - reiserfs_warning(inode->i_sb, "jdm-20007", - "Couldn't chown all xattrs (%d)\n", err); - return err; -} - -#ifdef CONFIG_REISERFS_FS_XATTR -/* - * Returns a dentry corresponding to a specific extended attribute file - * for the inode. If flags allow, the file is created. Otherwise, a - * valid or negative dentry, or an error is returned. - */ -static struct dentry *xattr_lookup(struct inode *inode, const char *name, - int flags) -{ - struct dentry *xadir, *xafile; - int err = 0; - - xadir = open_xa_dir(inode, flags); - if (IS_ERR(xadir)) - return ERR_CAST(xadir); - - inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); - xafile = lookup_one_len(name, xadir, strlen(name)); - if (IS_ERR(xafile)) { - err = PTR_ERR(xafile); - goto out; - } - - if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) - err = -EEXIST; - - if (d_really_is_negative(xafile)) { - err = -ENODATA; - if (xattr_may_create(flags)) - err = xattr_create(d_inode(xadir), xafile, - 0700|S_IFREG); - } - - if (err) - dput(xafile); -out: - inode_unlock(d_inode(xadir)); - dput(xadir); - if (err) - return ERR_PTR(err); - return xafile; -} - -/* Internal operations on file data */ -static inline void reiserfs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -static struct page *reiserfs_get_page(struct inode *dir, size_t n) -{ - struct address_space *mapping = dir->i_mapping; - struct page *page; - /* - * We can deadlock if we try to free dentries, - * and an unlink/rmdir has just occurred - GFP_NOFS avoids this - */ - mapping_set_gfp_mask(mapping, GFP_NOFS); - page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; -} - -static inline __u32 xattr_hash(const char *msg, int len) -{ - /* - * csum_partial() gives different results for little-endian and - * big endian hosts. Images created on little-endian hosts and - * mounted on big-endian hosts(and vice versa) will see csum mismatches - * when trying to fetch xattrs. Treating the hash as __wsum_t would - * lower the frequency of mismatch. This is an endianness bug in - * reiserfs. The return statement would result in a sparse warning. Do - * not fix the sparse warning so as to not hide a reminder of the bug. 
- */ - return csum_partial(msg, len, 0); -} - -int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to); - -static void update_ctime(struct inode *inode) -{ - struct timespec64 now = current_time(inode); - struct timespec64 ctime = inode_get_ctime(inode); - - if (inode_unhashed(inode) || !inode->i_nlink || - timespec64_equal(&ctime, &now)) - return; - - inode_set_ctime_to_ts(inode, now); - mark_inode_dirty(inode); -} - -static int lookup_and_delete_xattr(struct inode *inode, const char *name) -{ - int err = 0; - struct dentry *dentry, *xadir; - - xadir = open_xa_dir(inode, XATTR_REPLACE); - if (IS_ERR(xadir)) - return PTR_ERR(xadir); - - inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); - dentry = lookup_one_len(name, xadir, strlen(name)); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out_dput; - } - - if (d_really_is_positive(dentry)) { - err = xattr_unlink(d_inode(xadir), dentry); - update_ctime(inode); - } - - dput(dentry); -out_dput: - inode_unlock(d_inode(xadir)); - dput(xadir); - return err; -} - - -/* Generic extended attribute operations that can be used by xa plugins */ - -/* - * inode->i_mutex: down - */ -int -reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, - struct inode *inode, const char *name, - const void *buffer, size_t buffer_size, int flags) -{ - int err = 0; - struct dentry *dentry; - struct page *page; - char *data; - size_t file_pos = 0; - size_t buffer_pos = 0; - size_t new_size; - __u32 xahash = 0; - - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - if (!buffer) { - err = lookup_and_delete_xattr(inode, name); - return err; - } - - dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - - down_write(&REISERFS_I(inode)->i_xattr_sem); - - xahash = xattr_hash(buffer, buffer_size); - while (buffer_pos < buffer_size || buffer_pos == 0) { - size_t chunk; - size_t skip = 0; - size_t page_offset = (file_pos & (PAGE_SIZE - 1)); - - if (buffer_size - buffer_pos > PAGE_SIZE) - chunk = PAGE_SIZE; - else - chunk = buffer_size - buffer_pos; - - page = reiserfs_get_page(d_inode(dentry), file_pos); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out_unlock; - } - - lock_page(page); - data = page_address(page); - - if (file_pos == 0) { - struct reiserfs_xattr_header *rxh; - - skip = file_pos = sizeof(struct reiserfs_xattr_header); - if (chunk + skip > PAGE_SIZE) - chunk = PAGE_SIZE - skip; - rxh = (struct reiserfs_xattr_header *)data; - rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); - rxh->h_hash = cpu_to_le32(xahash); - } - - reiserfs_write_lock(inode->i_sb); - err = __reiserfs_write_begin(page, page_offset, chunk + skip); - if (!err) { - if (buffer) - memcpy(data + skip, buffer + buffer_pos, chunk); - err = reiserfs_commit_write(NULL, page, page_offset, - page_offset + chunk + - skip); - } - reiserfs_write_unlock(inode->i_sb); - unlock_page(page); - reiserfs_put_page(page); - buffer_pos += chunk; - file_pos += chunk; - skip = 0; - if (err || buffer_size == 0 || !buffer) - break; - } - - new_size = buffer_size + sizeof(struct reiserfs_xattr_header); - if (!err && new_size < i_size_read(d_inode(dentry))) { - struct iattr newattrs = { - .ia_ctime = current_time(inode), - .ia_size = new_size, - .ia_valid = ATTR_SIZE | ATTR_CTIME, - }; - - inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); - inode_dio_wait(d_inode(dentry)); - - err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs); - inode_unlock(d_inode(dentry)); - } else - update_ctime(inode); 
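The removed reiserfs_xattr_set_handle() stores each attribute as a small private file: the first bytes are a struct reiserfs_xattr_header carrying a magic value and a checksum of the attribute value (xattr_hash(), built on csum_partial()), and the value itself follows, copied out page by page. A minimal userspace sketch of that byte layout, under stated assumptions, is below: the magic constant and the additive checksum are placeholders standing in for REISERFS_XATTR_MAGIC and csum_partial(), and the little-endian conversion done by the real code (cpu_to_le32) is omitted.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Placeholder value; the real constant is REISERFS_XATTR_MAGIC. */
#define XATTR_MAGIC_PLACEHOLDER 0x52465841u

struct xattr_header {            /* mirrors struct reiserfs_xattr_header */
	uint32_t h_magic;        /* identifies a reiserfs xattr file */
	uint32_t h_hash;         /* checksum of the attribute value */
};

/* Stand-in for csum_partial(); the kernel checksum is different. */
static uint32_t toy_hash(const void *msg, size_t len)
{
	const unsigned char *p = msg;
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

/* Build the byte image of one xattr file: header followed by value. */
static void *build_xattr_image(const void *value, size_t len, size_t *out_len)
{
	struct xattr_header hdr = {
		.h_magic = XATTR_MAGIC_PLACEHOLDER,
		.h_hash  = toy_hash(value, len),
	};
	unsigned char *buf = malloc(sizeof(hdr) + len);

	if (!buf)
		return NULL;
	memcpy(buf, &hdr, sizeof(hdr));
	memcpy(buf + sizeof(hdr), value, len);
	*out_len = sizeof(hdr) + len;
	return buf;
}

int main(void)
{
	size_t n;
	void *img = build_xattr_image("0644", 4, &n);

	printf("xattr image is %zu bytes (8-byte header + value)\n", n);
	free(img);
	return 0;
}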
-out_unlock: - up_write(&REISERFS_I(inode)->i_xattr_sem); - dput(dentry); - return err; -} - -/* We need to start a transaction to maintain lock ordering */ -int reiserfs_xattr_set(struct inode *inode, const char *name, - const void *buffer, size_t buffer_size, int flags) -{ - - struct reiserfs_transaction_handle th; - int error, error2; - size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); - - /* Check before we start a transaction and then do nothing. */ - if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root)) - return -EOPNOTSUPP; - - if (!(flags & XATTR_REPLACE)) - jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); - - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, jbegin_count); - reiserfs_write_unlock(inode->i_sb); - if (error) { - return error; - } - - error = reiserfs_xattr_set_handle(&th, inode, name, - buffer, buffer_size, flags); - - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th); - reiserfs_write_unlock(inode->i_sb); - if (error == 0) - error = error2; - - return error; -} - -/* - * inode->i_mutex: down - */ -int -reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, - size_t buffer_size) -{ - ssize_t err = 0; - struct dentry *dentry; - size_t isize; - size_t file_pos = 0; - size_t buffer_pos = 0; - struct page *page; - __u32 hash = 0; - - if (name == NULL) - return -EINVAL; - - /* - * We can't have xattrs attached to v1 items since they don't have - * generation numbers - */ - if (get_inode_sd_version(inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - /* - * priv_root needn't be initialized during mount so allow initial - * lookups to succeed. - */ - if (!REISERFS_SB(inode->i_sb)->priv_root) - return 0; - - dentry = xattr_lookup(inode, name, XATTR_REPLACE); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out; - } - - down_read(&REISERFS_I(inode)->i_xattr_sem); - - isize = i_size_read(d_inode(dentry)); - - /* Just return the size needed */ - if (buffer == NULL) { - err = isize - sizeof(struct reiserfs_xattr_header); - goto out_unlock; - } - - if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { - err = -ERANGE; - goto out_unlock; - } - - while (file_pos < isize) { - size_t chunk; - char *data; - size_t skip = 0; - - if (isize - file_pos > PAGE_SIZE) - chunk = PAGE_SIZE; - else - chunk = isize - file_pos; - - page = reiserfs_get_page(d_inode(dentry), file_pos); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out_unlock; - } - - lock_page(page); - data = page_address(page); - if (file_pos == 0) { - struct reiserfs_xattr_header *rxh = - (struct reiserfs_xattr_header *)data; - skip = file_pos = sizeof(struct reiserfs_xattr_header); - chunk -= skip; - /* Magic doesn't match up.. 
*/ - if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { - unlock_page(page); - reiserfs_put_page(page); - reiserfs_warning(inode->i_sb, "jdm-20001", - "Invalid magic for xattr (%s) " - "associated with %k", name, - INODE_PKEY(inode)); - err = -EIO; - goto out_unlock; - } - hash = le32_to_cpu(rxh->h_hash); - } - memcpy(buffer + buffer_pos, data + skip, chunk); - unlock_page(page); - reiserfs_put_page(page); - file_pos += chunk; - buffer_pos += chunk; - skip = 0; - } - err = isize - sizeof(struct reiserfs_xattr_header); - - if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != - hash) { - reiserfs_warning(inode->i_sb, "jdm-20002", - "Invalid hash for xattr (%s) associated " - "with %k", name, INODE_PKEY(inode)); - err = -EIO; - } - -out_unlock: - up_read(&REISERFS_I(inode)->i_xattr_sem); - dput(dentry); - -out: - return err; -} - -/* - * In order to implement different sets of xattr operations for each xattr - * prefix with the generic xattr API, a filesystem should create a - * null-terminated array of struct xattr_handler (one for each prefix) and - * hang a pointer to it off of the s_xattr field of the superblock. - * - * The generic_fooxattr() functions will use this list to dispatch xattr - * operations to the correct xattr_handler. - */ -#define for_each_xattr_handler(handlers, handler) \ - for ((handler) = *(handlers)++; \ - (handler) != NULL; \ - (handler) = *(handlers)++) - -static inline bool reiserfs_posix_acl_list(const char *name, - struct dentry *dentry) -{ - return (posix_acl_type(name) >= 0) && - IS_POSIXACL(d_backing_inode(dentry)); -} - -/* This is the implementation for the xattr plugin infrastructure */ -static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers, - const char *name, struct dentry *dentry) -{ - if (handlers) { - const struct xattr_handler *xah = NULL; - - for_each_xattr_handler(handlers, xah) { - const char *prefix = xattr_prefix(xah); - - if (strncmp(prefix, name, strlen(prefix))) - continue; - - if (!xattr_handler_can_list(xah, dentry)) - return false; - - return true; - } - } - - return reiserfs_posix_acl_list(name, dentry); -} - -struct listxattr_buf { - struct dir_context ctx; - size_t size; - size_t pos; - char *buf; - struct dentry *dentry; -}; - -static bool listxattr_filler(struct dir_context *ctx, const char *name, - int namelen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct listxattr_buf *b = - container_of(ctx, struct listxattr_buf, ctx); - size_t size; - - if (name[0] != '.' || - (namelen != 1 && (name[1] != '.' || namelen != 2))) { - if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name, - b->dentry)) - return true; - size = namelen + 1; - if (b->buf) { - if (b->pos + size > b->size) { - b->pos = -ERANGE; - return false; - } - memcpy(b->buf + b->pos, name, namelen); - b->buf[b->pos + namelen] = 0; - } - b->pos += size; - } - return true; -} - -/* - * Inode operation listxattr() - * - * We totally ignore the generic listxattr here because it would be stupid - * not to. Since the xattrs are organized in a directory, we can just - * readdir to find them. - */ -ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) -{ - struct dentry *dir; - int err = 0; - struct listxattr_buf buf = { - .ctx.actor = listxattr_filler, - .dentry = dentry, - .buf = buffer, - .size = buffer ? 
size : 0, - }; - - if (d_really_is_negative(dentry)) - return -EINVAL; - - if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - if (err == -ENODATA) - err = 0; /* Not an error if there aren't any xattrs */ - goto out; - } - - inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); - err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - inode_unlock(d_inode(dir)); - - if (!err) - err = buf.pos; - - dput(dir); -out: - return err; -} - -static int create_privroot(struct dentry *dentry) -{ - int err; - struct inode *inode = d_inode(dentry->d_parent); - - WARN_ON_ONCE(!inode_is_locked(inode)); - - err = xattr_mkdir(inode, dentry, 0700); - if (err || d_really_is_negative(dentry)) { - reiserfs_warning(dentry->d_sb, "jdm-20006", - "xattrs/ACLs enabled and couldn't " - "find/create .reiserfs_priv. " - "Failing mount."); - return -EOPNOTSUPP; - } - - reiserfs_init_priv_inode(d_inode(dentry)); - reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " - "storage.\n", PRIVROOT_NAME); - - return 0; -} - -#else -int __init reiserfs_xattr_register_handlers(void) { return 0; } -void reiserfs_xattr_unregister_handlers(void) {} -static int create_privroot(struct dentry *dentry) { return 0; } -#endif - -/* Actual operations that are exported to VFS-land */ -const struct xattr_handler * const reiserfs_xattr_handlers[] = { -#ifdef CONFIG_REISERFS_FS_XATTR - &reiserfs_xattr_user_handler, - &reiserfs_xattr_trusted_handler, -#endif -#ifdef CONFIG_REISERFS_FS_SECURITY - &reiserfs_xattr_security_handler, -#endif - NULL -}; - -static int xattr_mount_check(struct super_block *s) -{ - /* - * We need generation numbers to ensure that the oid mapping is correct - * v3.5 filesystems don't have them. - */ - if (old_format_only(s)) { - if (reiserfs_xattrs_optional(s)) { - /* - * Old format filesystem, but optional xattrs have - * been enabled. Error out. - */ - reiserfs_warning(s, "jdm-2005", - "xattrs/ACLs not supported " - "on pre-v3.6 format filesystems. " - "Failing mount."); - return -EOPNOTSUPP; - } - } - - return 0; -} - -int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode, - int mask) -{ - /* - * We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. - */ - if (IS_PRIVATE(inode)) - return 0; - - return generic_permission(&nop_mnt_idmap, inode, mask); -} - -static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) -{ - return -EPERM; -} - -static const struct dentry_operations xattr_lookup_poison_ops = { - .d_revalidate = xattr_hide_revalidate, -}; - -int reiserfs_lookup_privroot(struct super_block *s) -{ - struct dentry *dentry; - int err = 0; - - /* If we don't have the privroot located yet - go find it */ - inode_lock(d_inode(s->s_root)); - dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, - strlen(PRIVROOT_NAME)); - if (!IS_ERR(dentry)) { - REISERFS_SB(s)->priv_root = dentry; - d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (d_really_is_positive(dentry)) - reiserfs_init_priv_inode(d_inode(dentry)); - } else - err = PTR_ERR(dentry); - inode_unlock(d_inode(s->s_root)); - - return err; -} - -/* - * We need to take a copy of the mount flags since things like - * SB_RDONLY don't get set until *after* we're called. 
- * mount_flags != mount_options - */ -int reiserfs_xattr_init(struct super_block *s, int mount_flags) -{ - int err = 0; - struct dentry *privroot = REISERFS_SB(s)->priv_root; - - err = xattr_mount_check(s); - if (err) - goto error; - - if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) { - inode_lock(d_inode(s->s_root)); - err = create_privroot(REISERFS_SB(s)->priv_root); - inode_unlock(d_inode(s->s_root)); - } - - if (d_really_is_positive(privroot)) { - inode_lock(d_inode(privroot)); - if (!REISERFS_SB(s)->xattr_root) { - struct dentry *dentry; - - dentry = lookup_one_len(XAROOT_NAME, privroot, - strlen(XAROOT_NAME)); - if (!IS_ERR(dentry)) - REISERFS_SB(s)->xattr_root = dentry; - else - err = PTR_ERR(dentry); - } - inode_unlock(d_inode(privroot)); - } - -error: - if (err) { - clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt); - clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); - } - - /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */ - if (reiserfs_posixacl(s)) - s->s_flags |= SB_POSIXACL; - else - s->s_flags &= ~SB_POSIXACL; - - return err; -} diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h deleted file mode 100644 index 5868a4e990e3..000000000000 --- a/fs/reiserfs/xattr.h +++ /dev/null @@ -1,117 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/reiserfs_xattr.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/rwsem.h> -#include <linux/xattr.h> - -struct inode; -struct dentry; -struct iattr; -struct super_block; - -int reiserfs_xattr_register_handlers(void) __init; -void reiserfs_xattr_unregister_handlers(void); -int reiserfs_xattr_init(struct super_block *sb, int mount_flags); -int reiserfs_lookup_privroot(struct super_block *sb); -int reiserfs_delete_xattrs(struct inode *inode); -int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct mnt_idmap *idmap, - struct inode *inode, int mask); - -#ifdef CONFIG_REISERFS_FS_XATTR -#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); - -int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); -int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); -int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *, - struct inode *, const char *, const void *, - size_t, int); - -extern const struct xattr_handler reiserfs_xattr_user_handler; -extern const struct xattr_handler reiserfs_xattr_trusted_handler; -extern const struct xattr_handler reiserfs_xattr_security_handler; -#ifdef CONFIG_REISERFS_FS_SECURITY -int reiserfs_security_init(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct reiserfs_security_handle *sec); -int reiserfs_security_write(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct reiserfs_security_handle *sec); -void reiserfs_security_free(struct reiserfs_security_handle *sec); -#endif - -static inline int reiserfs_xattrs_initialized(struct super_block *sb) -{ - return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root; -} - -#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) -static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) -{ - loff_t ret = 0; - if (reiserfs_file_data_log(inode)) { - ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize); - ret >>= inode->i_sb->s_blocksize_bits; - } - return ret; -} - -/* - * We may have to create up to 3 
objects: xattr root, xattr dir, xattr file. - * Let's try to be smart about it. - * xattr root: We cache it. If it's not cached, we may need to create it. - * xattr dir: If anything has been loaded for this inode, we can set a flag - * saying so. - * xattr file: Since we don't cache xattrs, we can't tell. We always include - * blocks for it. - * - * However, since root and dir can be created between calls - YOU MUST SAVE - * THIS VALUE. - */ -static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) -{ - size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - - if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { - nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) - nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - } - - return nblocks; -} - -static inline void reiserfs_init_xattr_rwsem(struct inode *inode) -{ - init_rwsem(&REISERFS_I(inode)->i_xattr_sem); -} - -#else - -#define reiserfs_listxattr NULL - -static inline void reiserfs_init_xattr_rwsem(struct inode *inode) -{ -} -#endif /* CONFIG_REISERFS_FS_XATTR */ - -#ifndef CONFIG_REISERFS_FS_SECURITY -static inline int reiserfs_security_init(struct inode *dir, - struct inode *inode, - const struct qstr *qstr, - struct reiserfs_security_handle *sec) -{ - return 0; -} -static inline int -reiserfs_security_write(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct reiserfs_security_handle *sec) -{ - return 0; -} -static inline void reiserfs_security_free(struct reiserfs_security_handle *sec) -{} -#endif diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c deleted file mode 100644 index 064264992b49..000000000000 --- a/fs/reiserfs/xattr_acl.c +++ /dev/null @@ -1,411 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/posix_acl.h> -#include "reiserfs.h" -#include <linux/errno.h> -#include <linux/pagemap.h> -#include <linux/xattr.h> -#include <linux/slab.h> -#include <linux/posix_acl_xattr.h> -#include "xattr.h" -#include "acl.h" -#include <linux/uaccess.h> - -static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, - struct inode *inode, int type, - struct posix_acl *acl); - - -int -reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, - struct posix_acl *acl, int type) -{ - int error, error2; - struct reiserfs_transaction_handle th; - size_t jcreate_blocks; - int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; - int update_mode = 0; - struct inode *inode = d_inode(dentry); - umode_t mode = inode->i_mode; - - /* - * Pessimism: We can't assume that anything from the xattr root up - * has been created. - */ - - jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + - reiserfs_xattr_nblocks(inode, size) * 2; - - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, jcreate_blocks); - reiserfs_write_unlock(inode->i_sb); - if (error == 0) { - if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(&nop_mnt_idmap, inode, - &mode, &acl); - if (error) - goto unlock; - update_mode = 1; - } - error = __reiserfs_set_acl(&th, inode, type, acl); - if (!error && update_mode) - inode->i_mode = mode; -unlock: - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th); - reiserfs_write_unlock(inode->i_sb); - if (error2) - error = error2; - } - - return error; -} - -/* - * Convert from filesystem to in-memory representation. 
- */ -static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) -{ - const char *end = (char *)value + size; - int n, count; - struct posix_acl *acl; - - if (!value) - return NULL; - if (size < sizeof(reiserfs_acl_header)) - return ERR_PTR(-EINVAL); - if (((reiserfs_acl_header *) value)->a_version != - cpu_to_le32(REISERFS_ACL_VERSION)) - return ERR_PTR(-EINVAL); - value = (char *)value + sizeof(reiserfs_acl_header); - count = reiserfs_acl_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - for (n = 0; n < count; n++) { - reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; - if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) - goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); - acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch (acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(reiserfs_acl_entry_short); - break; - - case ACL_USER: - value = (char *)value + sizeof(reiserfs_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_uid = - make_kuid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - case ACL_GROUP: - value = (char *)value + sizeof(reiserfs_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_gid = - make_kgid(&init_user_ns, - le32_to_cpu(entry->e_id)); - break; - - default: - goto fail; - } - } - if (value != end) - goto fail; - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. - */ -static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) -{ - reiserfs_acl_header *ext_acl; - char *e; - int n; - - *size = reiserfs_acl_size(acl->a_count); - ext_acl = kmalloc(sizeof(reiserfs_acl_header) + - acl->a_count * - sizeof(reiserfs_acl_entry), - GFP_NOFS); - if (!ext_acl) - return ERR_PTR(-ENOMEM); - ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); - e = (char *)ext_acl + sizeof(reiserfs_acl_header); - for (n = 0; n < acl->a_count; n++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch (acl->a_entries[n].e_tag) { - case ACL_USER: - entry->e_id = cpu_to_le32( - from_kuid(&init_user_ns, acl_e->e_uid)); - e += sizeof(reiserfs_acl_entry); - break; - case ACL_GROUP: - entry->e_id = cpu_to_le32( - from_kgid(&init_user_ns, acl_e->e_gid)); - e += sizeof(reiserfs_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(reiserfs_acl_entry_short); - break; - - default: - goto fail; - } - } - return (char *)ext_acl; - -fail: - kfree(ext_acl); - return ERR_PTR(-EINVAL); -} - -/* - * Inode operation get_posix_acl(). 
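reiserfs_posix_acl_from_disk() above walks a packed array of variable-size entries, checking the cursor against an end pointer before each read and rejecting the buffer unless the cursor lands exactly on the end. A minimal sketch of that parse pattern is below; the two-field entry format and the names are made up for illustration and are not the reiserfs on-disk ACL layout, and the real code additionally validates the entry count up front via reiserfs_acl_count().

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum { TAG_SHORT = 1, TAG_WITH_ID = 2 };   /* hypothetical tags */

struct short_entry { uint16_t tag; uint16_t perm; };
struct full_entry  { uint16_t tag; uint16_t perm; uint32_t id; };

/* Parse a packed buffer of variable-size entries. Returns the number of
 * entries, or -1 if any entry would run past the end of the buffer or an
 * unknown tag is seen. */
static int parse_entries(const void *buf, size_t size)
{
	const char *p = buf;
	const char *end = p + size;
	int count = 0;

	while (p < end) {
		struct short_entry hdr;

		if ((size_t)(end - p) < sizeof(hdr))
			return -1;                 /* truncated header */
		memcpy(&hdr, p, sizeof(hdr));

		switch (hdr.tag) {
		case TAG_SHORT:
			p += sizeof(struct short_entry);
			break;
		case TAG_WITH_ID:
			if ((size_t)(end - p) < sizeof(struct full_entry))
				return -1;         /* truncated entry */
			p += sizeof(struct full_entry);
			break;
		default:
			return -1;                 /* unknown tag */
		}
		count++;
	}
	/* exact-fit check, mirroring the original's "value != end" test */
	return p == end ? count : -1;
}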
- * - * inode->i_mutex: down - * BKL held [before 2.5.x] - */ -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu) -{ - char *name, *value; - struct posix_acl *acl; - int size; - int retval; - - if (rcu) - return ERR_PTR(-ECHILD); - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_POSIX_ACL_DEFAULT; - break; - default: - BUG(); - } - - size = reiserfs_xattr_get(inode, name, NULL, 0); - if (size < 0) { - if (size == -ENODATA || size == -ENOSYS) - return NULL; - return ERR_PTR(size); - } - - value = kmalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - - retval = reiserfs_xattr_get(inode, name, value, size); - if (retval == -ENODATA || retval == -ENOSYS) { - /* - * This shouldn't actually happen as it should have - * been caught above.. but just in case - */ - acl = NULL; - } else if (retval < 0) { - acl = ERR_PTR(retval); - } else { - acl = reiserfs_posix_acl_from_disk(value, retval); - } - - kfree(value); - return acl; -} - -/* - * Inode operation set_posix_acl(). - * - * inode->i_mutex: down - * BKL held [before 2.5.x] - */ -static int -__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, - int type, struct posix_acl *acl) -{ - char *name; - void *value = NULL; - size_t size = 0; - int error; - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - default: - return -EINVAL; - } - - if (acl) { - value = reiserfs_posix_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - } - - error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0); - - /* - * Ensure that the inode gets dirtied if we're only using - * the mode bits and an old ACL didn't exist. We don't need - * to check if the inode is hashed here since we won't get - * called by reiserfs_inherit_default_acl(). - */ - if (error == -ENODATA) { - error = 0; - if (type == ACL_TYPE_ACCESS) { - inode_set_ctime_current(inode); - mark_inode_dirty(inode); - } - } - - kfree(value); - - if (!error) - set_cached_acl(inode, type, acl); - - return error; -} - -/* - * dir->i_mutex: locked, - * inode is new and not released into the wild yet - */ -int -reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, - struct inode *dir, struct dentry *dentry, - struct inode *inode) -{ - struct posix_acl *default_acl, *acl; - int err = 0; - - /* ACLs only get applied to files and directories */ - if (S_ISLNK(inode->i_mode)) - return 0; - - /* - * ACLs can only be used on "new" objects, so if it's an old object - * there is nothing to inherit from - */ - if (get_inode_sd_version(dir) == STAT_DATA_V1) - goto apply_umask; - - /* - * Don't apply ACLs to objects in the .reiserfs_priv tree.. 
This - * would be useless since permissions are ignored, and a pain because - * it introduces locking cycles - */ - if (IS_PRIVATE(inode)) - goto apply_umask; - - err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (err) - return err; - - if (default_acl) { - err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, - default_acl); - posix_acl_release(default_acl); - } - if (acl) { - if (!err) - err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, - acl); - posix_acl_release(acl); - } - - return err; - -apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current_umask(); - return err; -} - -/* This is used to cache the default acl before a new object is created. - * The biggest reason for this is to get an idea of how many blocks will - * actually be required for the create operation if we must inherit an ACL. - * An ACL write can add up to 3 object creations and an additional file write - * so we'd prefer not to reserve that many blocks in the journal if we can. - * It also has the advantage of not loading the ACL with a transaction open, - * this may seem silly, but if the owner of the directory is doing the - * creation, the ACL may not be loaded since the permissions wouldn't require - * it. - * We return the number of blocks required for the transaction. - */ -int reiserfs_cache_default_acl(struct inode *inode) -{ - struct posix_acl *acl; - int nblocks = 0; - - if (IS_PRIVATE(inode)) - return 0; - - acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); - - if (acl && !IS_ERR(acl)) { - int size = reiserfs_acl_size(acl->a_count); - - /* Other xattrs can be created during inode creation. We don't - * want to claim too many blocks, so we check to see if we - * need to create the tree to the xattrs, and then we - * just want two files. 
*/ - nblocks = reiserfs_xattr_jcreate_nblocks(inode); - nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - - REISERFS_I(inode)->i_flags |= i_has_xattr_dir; - - /* We need to account for writes + bitmaps for two files */ - nblocks += reiserfs_xattr_nblocks(inode, size) * 4; - posix_acl_release(acl); - } - - return nblocks; -} - -/* - * Called under i_mutex - */ -int reiserfs_acl_chmod(struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - - if (IS_PRIVATE(inode)) - return 0; - if (get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) - return 0; - - return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode); -} diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c deleted file mode 100644 index 078dd8cc312f..000000000000 --- a/fs/reiserfs/xattr_security.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "reiserfs.h" -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/xattr.h> -#include <linux/slab.h> -#include "xattr.h" -#include <linux/security.h> -#include <linux/uaccess.h> - -static int -security_get(const struct xattr_handler *handler, struct dentry *unused, - struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (IS_PRIVATE(inode)) - return -EPERM; - - return reiserfs_xattr_get(inode, xattr_full_name(handler, name), - buffer, size); -} - -static int -security_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, struct dentry *unused, - struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) -{ - if (IS_PRIVATE(inode)) - return -EPERM; - - return reiserfs_xattr_set(inode, - xattr_full_name(handler, name), - buffer, size, flags); -} - -static bool security_list(struct dentry *dentry) -{ - return !IS_PRIVATE(d_inode(dentry)); -} - -static int -reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) -{ - struct reiserfs_security_handle *sec = fs_info; - - sec->value = kmemdup(xattr_array->value, xattr_array->value_len, - GFP_KERNEL); - if (!sec->value) - return -ENOMEM; - - sec->name = xattr_array->name; - sec->length = xattr_array->value_len; - return 0; -} - -/* Initializes the security context for a new inode and returns the number - * of blocks needed for the transaction. If successful, reiserfs_security - * must be released using reiserfs_security_free when the caller is done. */ -int reiserfs_security_init(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct reiserfs_security_handle *sec) -{ - int blocks = 0; - int error; - - sec->name = NULL; - sec->value = NULL; - sec->length = 0; - - /* Don't add selinux attributes on xattrs - they'll never get used */ - if (IS_PRIVATE(dir)) - return 0; - - error = security_inode_init_security(inode, dir, qstr, - &reiserfs_initxattrs, sec); - if (error) { - sec->name = NULL; - sec->value = NULL; - sec->length = 0; - return error; - } - - if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { - blocks = reiserfs_xattr_jcreate_nblocks(inode) + - reiserfs_xattr_nblocks(inode, sec->length); - /* We don't want to count the directories twice if we have - * a default ACL. 
*/ - REISERFS_I(inode)->i_flags |= i_has_xattr_dir; - } - return blocks; -} - -int reiserfs_security_write(struct reiserfs_transaction_handle *th, - struct inode *inode, - struct reiserfs_security_handle *sec) -{ - char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX; - int error; - - if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX) - return -EINVAL; - - strlcat(xattr_name, sec->name, sizeof(xattr_name)); - - error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value, - sec->length, XATTR_CREATE); - if (error == -ENODATA || error == -EOPNOTSUPP) - error = 0; - - return error; -} - -void reiserfs_security_free(struct reiserfs_security_handle *sec) -{ - kfree(sec->value); - sec->name = NULL; - sec->value = NULL; -} - -const struct xattr_handler reiserfs_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = security_get, - .set = security_set, - .list = security_list, -}; diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c deleted file mode 100644 index 0c0c74d8db0e..000000000000 --- a/fs/reiserfs/xattr_trusted.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "reiserfs.h" -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/xattr.h> -#include "xattr.h" -#include <linux/uaccess.h> - -static int -trusted_get(const struct xattr_handler *handler, struct dentry *unused, - struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) - return -EPERM; - - return reiserfs_xattr_get(inode, xattr_full_name(handler, name), - buffer, size); -} - -static int -trusted_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, struct dentry *unused, - struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) -{ - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) - return -EPERM; - - return reiserfs_xattr_set(inode, - xattr_full_name(handler, name), - buffer, size, flags); -} - -static bool trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry)); -} - -const struct xattr_handler reiserfs_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .get = trusted_get, - .set = trusted_set, - .list = trusted_list, -}; diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c deleted file mode 100644 index 88195181e1d7..000000000000 --- a/fs/reiserfs/xattr_user.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "reiserfs.h" -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/xattr.h> -#include "xattr.h" -#include <linux/uaccess.h> - -static int -user_get(const struct xattr_handler *handler, struct dentry *unused, - struct inode *inode, const char *name, void *buffer, size_t size) -{ - if (!reiserfs_xattrs_user(inode->i_sb)) - return -EOPNOTSUPP; - return reiserfs_xattr_get(inode, xattr_full_name(handler, name), - buffer, size); -} - -static int -user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, - struct dentry *unused, - struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) -{ - if (!reiserfs_xattrs_user(inode->i_sb)) - return -EOPNOTSUPP; - return reiserfs_xattr_set(inode, - xattr_full_name(handler, name), - buffer, size, flags); -} - -static bool user_list(struct dentry *dentry) -{ - return reiserfs_xattrs_user(dentry->d_sb); -} - -const struct xattr_handler 
reiserfs_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .get = user_get, - .set = user_set, - .list = user_list, -}; diff --git a/fs/remap_range.c b/fs/remap_range.c index 4403d5c68fcb..26afbbbfb10c 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -536,20 +536,19 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } for (i = 0, info = same->info; i < count; i++, info++) { - struct fd dst_fd = fdget(info->dest_fd); - struct file *dst_file = fd_file(dst_fd); + CLASS(fd, dst_fd)(info->dest_fd); - if (!dst_file) { + if (fd_empty(dst_fd)) { info->status = -EBADF; goto next_loop; } if (info->reserved) { info->status = -EINVAL; - goto next_fdput; + goto next_loop; } - deduped = vfs_dedupe_file_range_one(file, off, dst_file, + deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd), info->dest_offset, len, REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) @@ -559,8 +558,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) else info->bytes_deduped = len; -next_fdput: - fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) break; diff --git a/fs/select.c b/fs/select.c index a77907faf2b4..7da531b1cf6b 100644 --- a/fs/select.c +++ b/fs/select.c @@ -462,15 +462,22 @@ get_max: EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) -static inline void wait_key_set(poll_table *wait, unsigned long in, +static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return EPOLLNVAL; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; + + return vfs_poll(fd_file(f), wait); } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) @@ -522,20 +529,12 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { - struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; - mask = EPOLLNVAL; - f = fdget(i); - if (fd_file(f)) { - wait_key_set(wait, in, out, bit, - busy_flag); - mask = vfs_poll(fd_file(f), wait); - - fdput(f); - } + mask = select_poll_one(i, wait, in, out, bit, + busy_flag); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; @@ -787,7 +786,7 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -856,15 +855,14 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, __poll_t busy_flag) { int fd = pollfd->fd; - __poll_t mask = 0, filter; - struct fd f; + __poll_t mask, filter; if (fd < 0) - goto out; - mask = EPOLLNVAL; - f = fdget(fd); - if (!fd_file(f)) - goto out; + return 0; + + CLASS(fd, f)(fd); + if (fd_empty(f)) + return EPOLLNVAL; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; @@ -872,13 +870,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, mask = vfs_poll(fd_file(f), pwait); if (mask & busy_flag) *can_busy_poll = true; - mask &= filter; /* Mask out unneeded events. */ - fdput(f); - -out: - /* ... and so does ->revents */ - pollfd->revents = mangle_poll(mask); - return mask; + return mask & filter; /* Mask out unneeded events. 
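The remap_range and select hunks above replace manual fdget()/fdput() pairing with CLASS(fd, ...) plus fd_empty(), so the reference is dropped automatically when the variable leaves scope and early returns can no longer leak it. A userspace sketch of the same idea using the compiler's cleanup attribute is below; it only illustrates scope-based cleanup in general, not the kernel's DEFINE_CLASS()/CLASS() machinery.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Close an fd automatically when the owning variable leaves scope. */
static void close_fd(int *fd)
{
	if (*fd >= 0)
		close(*fd);
}
#define AUTO_FD __attribute__((cleanup(close_fd))) int

static int first_byte(const char *path)
{
	AUTO_FD fd = open(path, O_RDONLY);
	unsigned char c;

	if (fd < 0)
		return -1;              /* nothing to clean up */
	if (read(fd, &c, 1) != 1)
		return -1;              /* fd closed on return */
	return c;                       /* fd closed on return */
}

int main(void)
{
	printf("%d\n", first_byte("/etc/hostname"));
	return 0;
}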
*/ } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, @@ -910,6 +902,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { + __poll_t mask; /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't @@ -917,8 +910,9 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt, &can_busy_loop, - busy_flag)) { + mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag); + pfd->revents = mangle_poll(mask); + if (mask) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ @@ -1361,7 +1355,7 @@ static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } diff --git a/fs/seq_file.c b/fs/seq_file.c index e676c8b0cf5d..8bbb1ad46335 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. - * @file: file in question * @inode: its inode + * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/signalfd.c b/fs/signalfd.c index 736bebf93591..d1a5f43ce466 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) fd_install(ufd, file); } else { - struct fd f = fdget(ufd); - if (!fd_file(f)) + CLASS(fd, f)(ufd); + if (fd_empty(f)) return -EBADF; ctx = fd_file(f)->private_data; - if (fd_file(f)->f_op != &signalfd_fops) { - fdput(f); + if (fd_file(f)->f_op != &signalfd_fops) return -EINVAL; - } spin_lock_irq(¤t->sighand->siglock); ctx->sigmask = *mask; spin_unlock_irq(¤t->sighand->siglock); wake_up(¤t->sighand->signalfd_wqh); - fdput(f); } return ufd; diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 2aff6d1395ce..9f05f94e265a 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET - select NETFS_SUPPORT select NLS select NLS_UCS2_UTILS select CRYPTO diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 0ff2491c311d..fe738623cf1b 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -17,6 +17,11 @@ static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); static void cfids_laundromat_worker(struct work_struct *work); +struct cached_dir_dentry { + struct list_head entry; + struct dentry *dentry; +}; + static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, bool lookup_only, @@ -59,6 +64,16 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, list_add(&cfid->entry, &cfids->entries); cfid->on_list = true; kref_get(&cfid->refcount); + /* + * Set @cfid->has_lease to true during construction so that the lease + * reference can be put in cached_dir_lease_break() due to a potential + * lease break right after the request is sent or while @cfid is still + * being cached, or if a reconnection is triggered during construction. + * Concurrent processes won't be to use it yet due to @cfid->time being + * zero. 
+ */ + cfid->has_lease = true; + spin_unlock(&cfids->cfid_list_lock); return cfid; } @@ -147,15 +162,17 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *npath; int retries = 0, cur_sleep = 1; - if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || - is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) + if (cifs_sb->root == NULL) + return -ENOENT; + + if (tcon == NULL) return -EOPNOTSUPP; ses = tcon->ses; cfids = tcon->cfids; - if (cifs_sb->root == NULL) - return -ENOENT; + if (cfids == NULL) + return -EOPNOTSUPP; replay_again: /* reinitialize for possible replay */ @@ -176,12 +193,12 @@ replay_again: return -ENOENT; } /* - * Return cached fid if it has a lease. Otherwise, it is either a new - * entry or laundromat worker removed it from @cfids->entries. Caller - * will put last reference if the latter. + * Return cached fid if it is valid (has a lease and has a time). + * Otherwise, it is either a new entry or laundromat worker removed it + * from @cfids->entries. Caller will put last reference if the latter. */ spin_lock(&cfids->cfid_list_lock); - if (cfid->has_lease) { + if (cfid->has_lease && cfid->time) { spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); @@ -212,6 +229,7 @@ replay_again: } } cfid->dentry = dentry; + cfid->tcon = tcon; /* * We do not hold the lock for the open because in case @@ -267,15 +285,6 @@ replay_again: smb2_set_related(&rqst[1]); - /* - * Set @cfid->has_lease to true before sending out compounded request so - * its lease reference can be put in cached_dir_lease_break() due to a - * potential lease break right after the request is sent or while @cfid - * is still being cached. Concurrent processes won't be to use it yet - * due to @cfid->time being zero. - */ - cfid->has_lease = true; - if (retries) { smb2_set_replay(server, &rqst[0]); smb2_set_replay(server, &rqst[1]); @@ -292,7 +301,6 @@ replay_again: } goto oshr_free; } - cfid->tcon = tcon; cfid->is_open = true; spin_lock(&cfids->cfid_list_lock); @@ -347,6 +355,7 @@ oshr_free: SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); +out: if (rc) { spin_lock(&cfids->cfid_list_lock); if (cfid->on_list) { @@ -358,23 +367,14 @@ oshr_free: /* * We are guaranteed to have two references at this * point. One for the caller and one for a potential - * lease. Release the Lease-ref so that the directory - * will be closed when the caller closes the cached - * handle. + * lease. Release one here, and the second below. 
*/ cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); kref_put(&cfid->refcount, smb2_close_cached_fid); - goto out; } spin_unlock(&cfids->cfid_list_lock); - } -out: - if (rc) { - if (cfid->is_open) - SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, - cfid->fid.volatile_fid); - free_cached_dir(cfid); + + kref_put(&cfid->refcount, smb2_close_cached_fid); } else { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); @@ -396,12 +396,12 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, struct cached_fids *cfids = tcon->cfids; if (cfids == NULL) - return -ENOENT; + return -EOPNOTSUPP; spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { if (dentry && cfid->dentry == dentry) { - cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + cifs_dbg(FYI, "found a cached file handle by dentry\n"); kref_get(&cfid->refcount); *ret_cfid = cfid; spin_unlock(&cfids->cfid_list_lock); @@ -477,7 +477,10 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) struct cifs_tcon *tcon; struct tcon_link *tlink; struct cached_fids *cfids; + struct cached_dir_dentry *tmp_list, *q; + LIST_HEAD(entry); + spin_lock(&cifs_sb->tlink_tree_lock); for (node = rb_first(root); node; node = rb_next(node)) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); tcon = tlink_tcon(tlink); @@ -486,11 +489,30 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) cfids = tcon->cfids; if (cfids == NULL) continue; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { - dput(cfid->dentry); + tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + spin_lock(&cfid->fid_lock); + tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; + spin_unlock(&cfid->fid_lock); + + list_add_tail(&tmp_list->entry, &entry); } + spin_unlock(&cfids->cfid_list_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); + + list_for_each_entry_safe(tmp_list, q, &entry, entry) { + list_del(&tmp_list->entry); + dput(tmp_list->dentry); + kfree(tmp_list); } + + /* Flush any pending work that will drop dentries */ + flush_workqueue(cfid_put_wq); } /* @@ -501,50 +523,71 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) { struct cached_fids *cfids = tcon->cfids; struct cached_fid *cfid, *q; - LIST_HEAD(entry); if (cfids == NULL) return; + /* + * Mark all the cfids as closed, and move them to the cfids->dying list. + * They'll be cleaned up later by cfids_invalidation_worker. Take + * a reference to each cfid during this process. + */ spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - list_move(&cfid->entry, &entry); + list_move(&cfid->entry, &cfids->dying); cfids->num_entries--; cfid->is_open = false; cfid->on_list = false; - /* To prevent race with smb2_cached_lease_break() */ - kref_get(&cfid->refcount); - } - spin_unlock(&cfids->cfid_list_lock); - - list_for_each_entry_safe(cfid, q, &entry, entry) { - list_del(&cfid->entry); - cancel_work_sync(&cfid->lease_break); if (cfid->has_lease) { /* - * We lease was never cancelled from the server so we - * need to drop the reference. + * The lease was never cancelled from the server, + * so steal that reference. 
*/ - spin_lock(&cfids->cfid_list_lock); cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); - } - /* Drop the extra reference opened above*/ - kref_put(&cfid->refcount, smb2_close_cached_fid); + } else + kref_get(&cfid->refcount); } + /* + * Queue dropping of the dentries once locks have been dropped + */ + if (!list_empty(&cfids->dying)) + queue_work(cfid_put_wq, &cfids->invalidation_work); + spin_unlock(&cfids->cfid_list_lock); } static void -smb2_cached_lease_break(struct work_struct *work) +cached_dir_offload_close(struct work_struct *work) { struct cached_fid *cfid = container_of(work, - struct cached_fid, lease_break); + struct cached_fid, close_work); + struct cifs_tcon *tcon = cfid->tcon; + + WARN_ON(cfid->on_list); - spin_lock(&cfid->cfids->cfid_list_lock); - cfid->has_lease = false; - spin_unlock(&cfid->cfids->cfid_list_lock); kref_put(&cfid->refcount, smb2_close_cached_fid); + cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cached_close); +} + +/* + * Release the cached directory's dentry, and then queue work to drop cached + * directory itself (closing on server if needed). + * + * Must be called with a reference to the cached_fid and a reference to the + * tcon. + */ +static void cached_dir_put_work(struct work_struct *work) +{ + struct cached_fid *cfid = container_of(work, struct cached_fid, + put_work); + struct dentry *dentry; + + spin_lock(&cfid->fid_lock); + dentry = cfid->dentry; + cfid->dentry = NULL; + spin_unlock(&cfid->fid_lock); + + dput(dentry); + queue_work(serverclose_wq, &cfid->close_work); } int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) @@ -561,6 +604,7 @@ int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) !memcmp(lease_key, cfid->fid.lease_key, SMB2_LEASE_KEY_SIZE)) { + cfid->has_lease = false; cfid->time = 0; /* * We found a lease remove it from the list @@ -570,8 +614,10 @@ int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) cfid->on_list = false; cfids->num_entries--; - queue_work(cifsiod_wq, - &cfid->lease_break); + ++tcon->tc_count; + trace_smb3_tcon_ref(tcon->debug_id, tcon->tc_count, + netfs_trace_tcon_ref_get_cached_lease_break); + queue_work(cfid_put_wq, &cfid->put_work); spin_unlock(&cfids->cfid_list_lock); return true; } @@ -593,7 +639,8 @@ static struct cached_fid *init_cached_dir(const char *path) return NULL; } - INIT_WORK(&cfid->lease_break, smb2_cached_lease_break); + INIT_WORK(&cfid->close_work, cached_dir_offload_close); + INIT_WORK(&cfid->put_work, cached_dir_put_work); INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); @@ -606,6 +653,9 @@ static void free_cached_dir(struct cached_fid *cfid) { struct cached_dirent *dirent, *q; + WARN_ON(work_pending(&cfid->close_work)); + WARN_ON(work_pending(&cfid->put_work)); + dput(cfid->dentry); cfid->dentry = NULL; @@ -623,10 +673,30 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(cfid); } +static void cfids_invalidation_worker(struct work_struct *work) +{ + struct cached_fids *cfids = container_of(work, struct cached_fids, + invalidation_work); + struct cached_fid *cfid, *q; + LIST_HEAD(entry); + + spin_lock(&cfids->cfid_list_lock); + /* move cfids->dying to the local list */ + list_cut_before(&entry, &cfids->dying, &cfids->dying); + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + /* Drop the ref-count acquired in invalidate_all_cached_dirs 
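Several of the cached_dir.c changes above stop taking an extra reference when the cfid still holds its lease reference and instead "steal" it: has_lease is cleared and the teardown path drops that existing count, rather than incrementing only to drop twice later. A small userspace sketch of that ownership-transfer idea is below; the object and field names are hypothetical and the kernel's kref and workqueue details are omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	bool has_lease;     /* while true, the lease owns one reference */
};

static void put_obj(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1) {
		printf("freeing object\n");
		free(o);
	}
}

/* Tear down: reuse (steal) the lease's reference if there is one,
 * otherwise take our own, so exactly one put_obj() follows either way. */
static void teardown(struct obj *o)
{
	if (o->has_lease)
		o->has_lease = false;           /* steal the lease's count */
	else
		atomic_fetch_add(&o->refcount, 1);
	put_obj(o);                             /* drop the count we now own */
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	atomic_init(&o->refcount, 1);           /* the lease's reference */
	o->has_lease = true;
	teardown(o);                            /* count goes 1 -> 0, frees */
	return 0;
}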
*/ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + static void cfids_laundromat_worker(struct work_struct *work) { struct cached_fids *cfids; struct cached_fid *cfid, *q; + struct dentry *dentry; LIST_HEAD(entry); cfids = container_of(work, struct cached_fids, laundromat_work.work); @@ -638,33 +708,42 @@ static void cfids_laundromat_worker(struct work_struct *work) cfid->on_list = false; list_move(&cfid->entry, &entry); cfids->num_entries--; - /* To prevent race with smb2_cached_lease_break() */ - kref_get(&cfid->refcount); + if (cfid->has_lease) { + /* + * Our lease has not yet been cancelled from the + * server. Steal that reference. + */ + cfid->has_lease = false; + } else + kref_get(&cfid->refcount); } } spin_unlock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &entry, entry) { list_del(&cfid->entry); - /* - * Cancel and wait for the work to finish in case we are racing - * with it. - */ - cancel_work_sync(&cfid->lease_break); - if (cfid->has_lease) { + + spin_lock(&cfid->fid_lock); + dentry = cfid->dentry; + cfid->dentry = NULL; + spin_unlock(&cfid->fid_lock); + + dput(dentry); + if (cfid->is_open) { + spin_lock(&cifs_tcp_ses_lock); + ++cfid->tcon->tc_count; + trace_smb3_tcon_ref(cfid->tcon->debug_id, cfid->tcon->tc_count, + netfs_trace_tcon_ref_get_cached_laundromat); + spin_unlock(&cifs_tcp_ses_lock); + queue_work(serverclose_wq, &cfid->close_work); + } else /* - * Our lease has not yet been cancelled from the server - * so we need to drop the reference. + * Drop the ref-count from above, either the lease-ref (if there + * was one) or the extra one acquired. */ - spin_lock(&cfids->cfid_list_lock); - cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); kref_put(&cfid->refcount, smb2_close_cached_fid); - } - /* Drop the extra reference opened above */ - kref_put(&cfid->refcount, smb2_close_cached_fid); } - queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); } @@ -677,9 +756,11 @@ struct cached_fids *init_cached_dirs(void) return NULL; spin_lock_init(&cfids->cfid_list_lock); INIT_LIST_HEAD(&cfids->entries); + INIT_LIST_HEAD(&cfids->dying); + INIT_WORK(&cfids->invalidation_work, cfids_invalidation_worker); INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker); - queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); return cfids; @@ -698,6 +779,7 @@ void free_cached_dirs(struct cached_fids *cfids) return; cancel_delayed_work_sync(&cfids->laundromat_work); + cancel_work_sync(&cfids->invalidation_work); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { @@ -705,6 +787,11 @@ void free_cached_dirs(struct cached_fids *cfids) cfid->is_open = false; list_move(&cfid->entry, &entry); } + list_for_each_entry_safe(cfid, q, &cfids->dying, entry) { + cfid->on_list = false; + cfid->is_open = false; + list_move(&cfid->entry, &entry); + } spin_unlock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &entry, entry) { diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 81ba0fd5cc16..1dfe79d947a6 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -44,7 +44,8 @@ struct cached_fid { spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; - struct work_struct lease_break; + struct work_struct put_work; + struct work_struct close_work; struct smb2_file_all_info file_all_info; 
struct cached_dirents dirents; }; @@ -53,10 +54,13 @@ struct cached_fid { struct cached_fids { /* Must be held when: * - accessing the cfids->entries list + * - accessing the cfids->dying list */ spinlock_t cfid_list_lock; int num_entries; struct list_head entries; + struct list_head dying; + struct work_struct invalidation_work; struct delayed_work laundromat_work; }; diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index af7849e5974f..28f568b5fc27 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -82,6 +82,9 @@ struct key_type cifs_spnego_key_type = { /* strlen of ";pid=0x" */ #define PID_KEY_LEN 7 +/* strlen of ";upcall_target=" */ +#define UPCALL_TARGET_KEY_LEN 15 + /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * cifs_get_spnego_key(struct cifs_ses *sesInfo, @@ -108,6 +111,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, if (sesInfo->user_name) desc_len += USER_KEY_LEN + strlen(sesInfo->user_name); + if (sesInfo->upcall_target == UPTARGET_MOUNT) + desc_len += UPCALL_TARGET_KEY_LEN + 5; // strlen("mount") + else + desc_len += UPCALL_TARGET_KEY_LEN + 3; // strlen("app") + spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); if (description == NULL) @@ -156,6 +164,14 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp = description + strlen(description); sprintf(dp, ";pid=0x%x", current->pid); + if (sesInfo->upcall_target == UPTARGET_MOUNT) { + dp = description + strlen(description); + sprintf(dp, ";upcall_target=mount"); + } else { + dp = description + strlen(description); + sprintf(dp, ";upcall_target=app"); + } + cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); spnego_key = request_key(&cifs_spnego_key_type, description, ""); diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index 79d99a913944..4cc6e0896fad 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -484,10 +484,21 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, /** * Remap spaces and periods found at the end of every * component of the path. The special cases of '.' and - * '..' do not need to be dealt with explicitly because - * they are addressed in namei.c:link_path_walk(). + * '..' are need to be handled because of symlinks. + * They are treated as non-end-of-string to avoid + * remapping and breaking symlinks pointing to . or .. **/ - if ((i == srclen - 1) || (source[i+1] == '\\')) + if ((i == 0 || source[i-1] == '\\') && + source[i] == '.' && + (i == srclen-1 || source[i+1] == '\\')) + end_of_string = false; /* "." case */ + else if (i >= 1 && + (i == 1 || source[i-2] == '\\') && + source[i-1] == '.' && + source[i] == '.' && + (i == srclen-1 || source[i+1] == '\\')) + end_of_string = false; /* ".." case */ + else if ((i == srclen - 1) || (source[i+1] == '\\')) end_of_string = true; else end_of_string = false; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 1d294d53f662..ba79aa2107cc 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -885,12 +885,17 @@ unsigned int setup_authusers_ACE(struct smb_ace *pntace) * Fill in the special SID based on the mode. 
See * https://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ -unsigned int setup_special_mode_ACE(struct smb_ace *pntace, __u64 nmode) +unsigned int setup_special_mode_ACE(struct smb_ace *pntace, + bool posix, + __u64 nmode) { int i; unsigned int ace_size = 28; - pntace->type = ACCESS_DENIED_ACE_TYPE; + if (posix) + pntace->type = ACCESS_ALLOWED_ACE_TYPE; + else + pntace->type = ACCESS_DENIED_ACE_TYPE; pntace->flags = 0x0; pntace->access_req = 0; pntace->sid.num_subauth = 3; @@ -933,7 +938,8 @@ static void populate_new_aces(char *nacl_base, struct smb_sid *pownersid, struct smb_sid *pgrpsid, __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, - bool modefromsid) + bool modefromsid, + bool posix) { __u64 nmode; u32 num_aces = 0; @@ -950,13 +956,15 @@ static void populate_new_aces(char *nacl_base, num_aces = *pnum_aces; nsize = *pnsize; - if (modefromsid) { - pnntace = (struct smb_ace *) (nacl_base + nsize); - nsize += setup_special_mode_ACE(pnntace, nmode); - num_aces++; + if (modefromsid || posix) { pnntace = (struct smb_ace *) (nacl_base + nsize); - nsize += setup_authusers_ACE(pnntace); + nsize += setup_special_mode_ACE(pnntace, posix, nmode); num_aces++; + if (modefromsid) { + pnntace = (struct smb_ace *) (nacl_base + nsize); + nsize += setup_authusers_ACE(pnntace); + num_aces++; + } goto set_size; } @@ -1076,7 +1084,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, struct smb_sid *pownersid, struct smb_sid *pgrpsid, - __u64 *pnmode, bool mode_from_sid) + __u64 *pnmode, bool mode_from_sid, bool posix) { int i; u16 size = 0; @@ -1094,11 +1102,11 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, nsize = sizeof(struct smb_acl); /* If pdacl is NULL, we don't have a src. Simply populate new ACL. 
*/ - if (!pdacl) { + if (!pdacl || posix) { populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, - mode_from_sid); + mode_from_sid, posix); goto finalize_dacl; } @@ -1115,7 +1123,7 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, - mode_from_sid); + mode_from_sid, posix); new_aces_set = true; } @@ -1144,7 +1152,7 @@ next_ace: populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, - mode_from_sid); + mode_from_sid, posix); new_aces_set = true; } @@ -1251,7 +1259,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, - bool mode_from_sid, bool id_from_sid, int *aclflag) + bool mode_from_sid, bool id_from_sid, bool posix, int *aclflag) { int rc = 0; __u32 dacloffset; @@ -1288,7 +1296,7 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, ndacl_ptr->num_aces = cpu_to_le32(0); rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, - pnmode, mode_from_sid); + pnmode, mode_from_sid, posix); sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); /* copy the non-dacl portion of secdesc */ @@ -1584,13 +1592,16 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, struct smb_ntsd *pntsd = NULL; /* acl obtained from server */ struct smb_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct tcon_link *tlink; struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; const u32 info = 0; + bool posix; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + posix = tlink_tcon(tlink)->posix_extensions; ops = tlink_tcon(tlink)->ses->server->ops; @@ -1622,12 +1633,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, id_from_sid = false; /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */ - nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ - if (mode_from_sid) - nsecdesclen += 2 * sizeof(struct smb_ace); + if (posix) + nsecdesclen = 1 * sizeof(struct smb_ace); + else if (mode_from_sid) + nsecdesclen = secdesclen + (2 * sizeof(struct smb_ace)); else /* cifsacl */ - nsecdesclen += 5 * sizeof(struct smb_ace); + nsecdesclen = secdesclen + (5 * sizeof(struct smb_ace)); } else { /* chown */ /* When ownership changes, changes new owner sid length could be different */ nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2); @@ -1657,7 +1669,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, } rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid, - mode_from_sid, id_from_sid, &aclflag); + mode_from_sid, id_from_sid, posix, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 000e1ef3beea..b800c9f585d8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -157,6 +157,7 @@ struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; struct workqueue_struct *deferredclose_wq; struct workqueue_struct *serverclose_wq; +struct workqueue_struct *cfid_put_wq; __u32 cifs_lock_secret; /* @@ -397,7 
+398,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL); if (!cifs_inode) return NULL; - cifs_inode->cifsAttrs = 0x20; /* default */ + cifs_inode->cifsAttrs = ATTR_ARCHIVE; /* default */ cifs_inode->time = 0; /* * Until the file is open and we have gotten oplock info back from the @@ -546,6 +547,30 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root) return 0; } +static void +cifs_show_upcall_target(struct seq_file *s, struct cifs_sb_info *cifs_sb) +{ + if (cifs_sb->ctx->upcall_target == UPTARGET_UNSPECIFIED) { + seq_puts(s, ",upcall_target=app"); + return; + } + + seq_puts(s, ",upcall_target="); + + switch (cifs_sb->ctx->upcall_target) { + case UPTARGET_APP: + seq_puts(s, "app"); + break; + case UPTARGET_MOUNT: + seq_puts(s, "mount"); + break; + default: + /* shouldn't ever happen */ + seq_puts(s, "unknown"); + break; + } +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. * Not all settable options are displayed but most of the important @@ -562,6 +587,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_show_option(s, "vers", tcon->ses->server->vals->version_string); cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); + cifs_show_upcall_target(s, cifs_sb); if (tcon->no_lease) seq_puts(s, ",nolease"); @@ -1780,7 +1806,7 @@ static int cifs_init_netfs(void) nomem_subreqpool: kmem_cache_destroy(cifs_io_subrequest_cachep); nomem_subreq: - mempool_destroy(&cifs_io_request_pool); + mempool_exit(&cifs_io_request_pool); nomem_reqpool: kmem_cache_destroy(cifs_io_request_cachep); nomem_req: @@ -1895,9 +1921,16 @@ init_cifs(void) goto out_destroy_deferredclose_wq; } + cfid_put_wq = alloc_workqueue("cfid_put_wq", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cfid_put_wq) { + rc = -ENOMEM; + goto out_destroy_serverclose_wq; + } + rc = cifs_init_inodecache(); if (rc) - goto out_destroy_serverclose_wq; + goto out_destroy_cfid_put_wq; rc = cifs_init_netfs(); if (rc) @@ -1965,6 +1998,8 @@ out_destroy_netfs: cifs_destroy_netfs(); out_destroy_inodecache: cifs_destroy_inodecache(); +out_destroy_cfid_put_wq: + destroy_workqueue(cfid_put_wq); out_destroy_serverclose_wq: destroy_workqueue(serverclose_wq); out_destroy_deferredclose_wq: @@ -2008,6 +2043,7 @@ exit_cifs(void) destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); destroy_workqueue(serverclose_wq); + destroy_workqueue(cfid_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 71b720dbb2ce..a762dbbbd959 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -146,6 +146,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 51 -#define CIFS_VERSION "2.51" +#define SMB3_PRODUCT_BUILD 52 +#define CIFS_VERSION "2.52" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5041b1ffc244..6e63abe461fd 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -153,6 +153,12 @@ enum securityEnum { Kerberos, /* Kerberos via SPNEGO */ }; +enum upcall_target_enum { + UPTARGET_UNSPECIFIED, /* not specified, defaults to app */ + UPTARGET_MOUNT, /* upcall to the mount namespace */ + UPTARGET_APP, /* upcall to the application namespace which did the mount */ +}; + enum cifs_reparse_type { CIFS_REPARSE_TYPE_NFS, CIFS_REPARSE_TYPE_WSL, @@ 
-588,6 +594,7 @@ struct smb_version_operations { /* Check for STATUS_NETWORK_NAME_DELETED */ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, + const char *full_path, struct kvec *rsp_iov, struct cifs_open_info_data *data); int (*create_reparse_symlink)(const unsigned int xid, @@ -1084,6 +1091,7 @@ struct cifs_ses { struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ enum securityEnum sectype; /* what security flavor was specified? */ + enum upcall_target_enum upcall_target; /* what upcall target was specified? */ bool sign; /* is signing required? */ bool domainAuto:1; bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */ @@ -1983,7 +1991,7 @@ require use of the stronger protocol */ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc + * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search @@ -2071,6 +2079,7 @@ extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern struct workqueue_struct *deferredclose_wq; extern struct workqueue_struct *serverclose_wq; +extern struct workqueue_struct *cfid_put_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_sm_req_poolp; @@ -2223,7 +2232,7 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, struct kvec *iov = &rqst[i].rq_iov[j]; addr = (unsigned long)iov->iov_base + skip; - if (unlikely(is_vmalloc_addr((void *)addr))) { + if (is_vmalloc_or_module_addr((void *)addr)) { len = iov->iov_len - skip; nents += DIV_ROUND_UP(offset_in_page(addr) + len, PAGE_SIZE); @@ -2250,7 +2259,7 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, unsigned int off = offset_in_page(addr); addr &= PAGE_MASK; - if (unlikely(is_vmalloc_addr((void *)addr))) { + if (is_vmalloc_or_module_addr((void *)addr)) { do { unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 68c716e6261b..d26f9bbb5382 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -244,7 +244,9 @@ extern int cifs_set_acl(struct mnt_idmap *idmap, extern int set_cifs_acl(struct smb_ntsd *pntsd, __u32 len, struct inode *ino, const char *path, int flag); extern unsigned int setup_authusers_ACE(struct smb_ace *pace); -extern unsigned int setup_special_mode_ACE(struct smb_ace *pace, __u64 nmode); +extern unsigned int setup_special_mode_ACE(struct smb_ace *pace, + bool posix, + __u64 nmode); extern unsigned int setup_special_user_owner_ACE(struct smb_ace *pace); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); @@ -252,10 +254,6 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read); -extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, - struct page *page, - unsigned int page_offset, - unsigned int to_read); int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read); @@ -316,8 +314,7 @@ extern void 
cifs_move_llist(struct list_head *source, struct list_head *dest); extern void cifs_free_llist(struct list_head *llist); extern void cifs_del_lock_waiters(struct cifsLockInfo *lock); -extern int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, - const struct nls_table *nlsc); +int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon); extern int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, @@ -553,13 +550,6 @@ extern int generate_smb311signingkey(struct cifs_ses *ses, struct TCP_Server_Info *server); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY -extern int CIFSSMBCopy(unsigned int xid, - struct cifs_tcon *source_tcon, - const char *fromName, - const __u16 target_tid, - const char *toName, const int flags, - const struct nls_table *nls_codepage, - int remap_special_chars); extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, @@ -623,11 +613,7 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_ses *ses); -bool is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); @@ -640,9 +626,6 @@ cifs_chan_set_in_reconnect(struct cifs_ses *ses, void cifs_chan_clear_in_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server); void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server); @@ -675,6 +658,7 @@ char *extract_hostname(const char *unc); char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, + const char *full_path, bool unicode, struct cifs_open_info_data *data); int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, @@ -683,6 +667,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, int cifs_sfu_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +umode_t wire_mode_to_posix(u32 wire, bool is_dir); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index c6f15dbe860a..7f1cacc89dbb 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -70,10 +70,9 @@ static struct { static int cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) { - int rc; - struct cifs_ses *ses; struct TCP_Server_Info *server; - struct nls_table *nls_codepage = NULL; + struct cifs_ses *ses; + int rc; /* * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for @@ -131,8 +130,6 @@ again: } spin_unlock(&server->srv_lock); - nls_codepage = ses->local_nls; - /* * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session @@ -155,8 +152,17 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) - rc = cifs_setup_session(0, ses, server, nls_codepage); + if (!rc) { + rc = 
cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect if an alternate + * password is available. + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + } /* do we need to reconnect tcon? */ if (rc || !tcon->need_reconnect) { @@ -166,7 +172,7 @@ again: skip_sess_setup: cifs_mark_open_files_invalid(tcon); - rc = cifs_tree_connect(0, tcon, nls_codepage); + rc = cifs_tree_connect(0, tcon); mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); @@ -1261,14 +1267,6 @@ openRetry: return rc; } -static void cifs_readv_worker(struct work_struct *work) -{ - struct cifs_io_subrequest *rdata = - container_of(work, struct cifs_io_subrequest, subreq.work); - - netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); -} - static void cifs_readv_callback(struct mid_q_entry *mid) { @@ -1322,20 +1320,24 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } + if (rdata->got_bytes) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } rdata->credits.value = 0; + rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; - INIT_WORK(&rdata->subreq.work, cifs_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); add_credits(server, &credits, 0); @@ -1673,10 +1675,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -2340,69 +2345,6 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, } int -CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon, - const char *fromName, const __u16 target_tid, const char *toName, - const int flags, const struct nls_table *nls_codepage, int remap) -{ - int rc = 0; - COPY_REQ *pSMB = NULL; - COPY_RSP *pSMBr = NULL; - int bytes_returned; - int name_len, name_len2; - __u16 count; - - cifs_dbg(FYI, "In CIFSSMBCopy\n"); -copyRetry: - rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - pSMB->BufferFormat = 0x04; - pSMB->Tid2 = target_tid; - - pSMB->Flags = cpu_to_le16(flags & COPY_TREE); - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, - fromName, PATH_MAX, nls_codepage, - remap); - name_len++; /* trailing null */ - name_len *= 2; - pSMB->OldFileName[name_len] = 0x04; /* pad */ - /* protocol requires ASCII signature byte on Unicode string */ - pSMB->OldFileName[name_len + 1] = 0x00; - name_len2 = - cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], - toName, PATH_MAX, nls_codepage, remap); - name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; - name_len2 
*= 2; /* convert to bytes */ - } else { - name_len = copy_path_name(pSMB->OldFileName, fromName); - pSMB->OldFileName[name_len] = 0x04; /* 2nd buffer format */ - name_len2 = copy_path_name(pSMB->OldFileName+name_len+1, toName); - name_len2++; /* signature byte */ - } - - count = 1 /* 1st signature byte */ + name_len + name_len2; - inc_rfc1001_len(pSMB, count); - pSMB->ByteCount = cpu_to_le16(count); - - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) { - cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n", - rc, le16_to_cpu(pSMBr->CopyCount)); - } - cifs_buf_release(pSMB); - - if (rc == -EAGAIN) - goto copyRetry; - - return rc; -} - -int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, const struct nls_table *nls_codepage, int remap) @@ -4383,8 +4325,8 @@ getDFSRetry: * CIFSGetDFSRefer() may be called from cifs_reconnect_tcon() and thus * causing an infinite recursion. */ - rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, - (void **)&pSMB, (void **)&pSMBr); + rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, + (void **)&pSMB, (void **)&pSMBr); if (rc) return rc; @@ -5406,7 +5348,7 @@ SetTimesRetry: param_offset = offsetof(struct smb_com_transaction2_spi_req, InformationLevel) - 4; offset = param_offset + params; - data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + data_offset = (char *)pSMB + offsetof(typeof(*pSMB), hdr.Protocol) + offset; pSMB->ParameterOffset = cpu_to_le16(param_offset); pSMB->DataOffset = cpu_to_le16(offset); pSMB->SetupCount = 1; diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 63b5a55b7a57..766b4de13da7 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -166,7 +166,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t last, index = start / PAGE_SIZE; size_t len, off, foff; - ssize_t ret = 0; void *p; int s = 0; @@ -193,9 +192,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) memcpy(&sample[s], p, len2); kunmap_local(p); - if (ret < 0) - return ret; - s += len2; if (len2 < SZ_2K || s >= max - SZ_2K) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index adf8758847f6..eaa6be4456d0 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -795,18 +795,6 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) } int -cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, - unsigned int page_offset, unsigned int to_read) -{ - struct msghdr smb_msg = {}; - struct bio_vec bv; - - bvec_set_page(&bv, page, to_read, page_offset); - iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); - return cifs_readv_from_socket(server, &smb_msg); -} - -int cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, unsigned int to_read) { @@ -999,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); + if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; + + /* Release netns reference for the socket. */ + put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1049,7 +1041,10 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + /* Release netns reference for this server. 
*/ + put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); + kfree(server->hostname); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -1647,8 +1642,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) /* srv_count can never go negative */ WARN_ON(server->srv_count < 0); - put_net(cifs_net_ns(server)); - list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -1678,8 +1671,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) kfree_sensitive(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; - kfree(server->hostname); - server->hostname = NULL; task = xchg(&server->tsk, NULL); if (task) @@ -1726,6 +1717,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; + + /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1857,6 +1850,7 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); + /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1865,8 +1859,10 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) + if (tcp_ses->ssocket) { sock_release(tcp_ses->ssocket); + put_net(cifs_net_ns(tcp_ses)); + } kfree(tcp_ses); } return ERR_PTR(rc); @@ -1910,11 +1906,35 @@ static int match_session(struct cifs_ses *ses, CIFS_MAX_USERNAME_LEN)) return 0; if ((ctx->username && strlen(ctx->username) != 0) && - ses->password != NULL && - strncmp(ses->password, - ctx->password ? ctx->password : "", - CIFS_MAX_PASSWORD_LEN)) - return 0; + ses->password != NULL) { + + /* New mount can only share sessions with an existing mount if: + * 1. Both password and password2 match, or + * 2. password2 of the old mount matches password of the new mount + * and password of the old mount matches password2 of the new + * mount + */ + if (ses->password2 != NULL && ctx->password2 != NULL) { + if (!((strncmp(ses->password, ctx->password ? + ctx->password : "", CIFS_MAX_PASSWORD_LEN) == 0 && + strncmp(ses->password2, ctx->password2, + CIFS_MAX_PASSWORD_LEN) == 0) || + (strncmp(ses->password, ctx->password2, + CIFS_MAX_PASSWORD_LEN) == 0 && + strncmp(ses->password2, ctx->password ? + ctx->password : "", CIFS_MAX_PASSWORD_LEN) == 0))) + return 0; + + } else if ((ses->password2 == NULL && ctx->password2 != NULL) || + (ses->password2 != NULL && ctx->password2 == NULL)) { + return 0; + + } else { + if (strncmp(ses->password, ctx->password ? 
+ ctx->password : "", CIFS_MAX_PASSWORD_LEN)) + return 0; + } + } } if (strcmp(ctx->local_nls->charset, ses->local_nls->charset)) @@ -2257,6 +2277,7 @@ struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { int rc = 0; + int retries = 0; unsigned int xid; struct cifs_ses *ses; struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; @@ -2275,6 +2296,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) cifs_dbg(FYI, "Session needs reconnect\n"); mutex_lock(&ses->session_mutex); + +retry_old_session: rc = cifs_negotiate_protocol(xid, ses, server); if (rc) { mutex_unlock(&ses->session_mutex); @@ -2287,6 +2310,13 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) rc = cifs_setup_session(xid, ses, server, ctx->local_nls); if (rc) { + if (((rc == -EACCES) || (rc == -EKEYEXPIRED) || + (rc == -EKEYREVOKED)) && !retries && ses->password2) { + retries++; + cifs_dbg(FYI, "Session reconnect failed, retrying with alternate password\n"); + swap(ses->password, ses->password2); + goto retry_old_session; + } mutex_unlock(&ses->session_mutex); /* problem -- put our reference */ cifs_put_smb_ses(ses); @@ -2352,6 +2382,26 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses->sectype = ctx->sectype; ses->sign = ctx->sign; + + /* + *Explicitly marking upcall_target mount option for easier handling + * by cifs_spnego.c and eventually cifs.upcall.c + */ + + switch (ctx->upcall_target) { + case UPTARGET_UNSPECIFIED: /* default to app */ + case UPTARGET_APP: + ses->upcall_target = UPTARGET_APP; + break; + case UPTARGET_MOUNT: + ses->upcall_target = UPTARGET_MOUNT; + break; + default: + // should never happen + ses->upcall_target = UPTARGET_APP; + break; + } + ses->local_nls = load_nls(ctx->local_nls->charset); /* add server as first channel */ @@ -2362,6 +2412,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) ses->chans_need_reconnect = 1; spin_unlock(&ses->chan_lock); +retry_new_session: mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses, server); if (!rc) @@ -2374,8 +2425,16 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) sizeof(ses->smb3signingkey)); spin_unlock(&ses->chan_lock); - if (rc) - goto get_ses_fail; + if (rc) { + if (((rc == -EACCES) || (rc == -EKEYEXPIRED) || + (rc == -EKEYREVOKED)) && !retries && ses->password2) { + retries++; + cifs_dbg(FYI, "Session setup failed, retrying with alternate password\n"); + swap(ses->password, ses->password2); + goto retry_new_session; + } else + goto get_ses_fail; + } /* * success, put it on the list and add it as first channel @@ -2482,9 +2541,6 @@ cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace) list_del_init(&tcon->tcon_list); tcon->status = TID_EXITING; -#ifdef CONFIG_CIFS_DFS_UPCALL - list_replace_init(&tcon->dfs_ses_list, &ses_list); -#endif spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); @@ -2492,6 +2548,7 @@ cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace) cancel_delayed_work_sync(&tcon->query_interfaces); #ifdef CONFIG_CIFS_DFS_UPCALL cancel_delayed_work_sync(&tcon->dfs_cache_work); + list_replace_init(&tcon->dfs_ses_list, &ses_list); #endif if (tcon->use_witness) { @@ -2564,7 +2621,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (ses->server->dialect >= SMB20_PROT_ID && (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)) - nohandlecache = 
ctx->nohandlecache; + nohandlecache = ctx->nohandlecache || !dir_cache_timeout; else nohandlecache = true; tcon = tcon_info_alloc(!nohandlecache, netfs_trace_tcon_ref_new); @@ -3082,13 +3139,22 @@ generic_ip_connect(struct TCP_Server_Info *server) if (server->ssocket) { socket = server->ssocket; } else { - rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + struct net *net = cifs_net_ns(server); + + rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } + /* + * Grab netns reference for the socket. + * + * It'll be released here, on error, or in clean_demultiplex_info() upon server + * teardown. + */ + get_net(net); + /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); socket = server->ssocket; @@ -3101,8 +3167,10 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) + if (rc < 0) { + put_net(cifs_net_ns(server)); return rc; + } /* * Eventually check for other socket options to change from @@ -3139,6 +3207,7 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); + put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3147,6 +3216,9 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); + if (rc < 0) + put_net(cifs_net_ns(server)); + return rc; } @@ -4328,10 +4400,10 @@ cifs_prune_tlinks(struct work_struct *work) } #ifndef CONFIG_CIFS_DFS_UPCALL -int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc) +int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon) { - int rc; const struct smb_version_operations *ops = tcon->ses->server->ops; + int rc; /* only send once per connect */ spin_lock(&tcon->tc_lock); @@ -4354,7 +4426,8 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); - rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, + tcon, tcon->ses->local_nls); if (rc) { spin_lock(&tcon->tc_lock); if (tcon->status == TID_IN_TCON) diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 3f6077c68d68..4647df9e1e3b 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -321,49 +321,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) return rc; } -/* Update dfs referral path of superblock */ -static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, - const char *target) -{ - int rc = 0; - size_t len = strlen(target); - char *refpath, *npath; - - if (unlikely(len < 2 || *target != '\\')) - return -EINVAL; - - if (target[1] == '\\') { - len += 1; - refpath = kmalloc(len, GFP_KERNEL); - if (!refpath) - return -ENOMEM; - - scnprintf(refpath, len, "%s", target); - } else { - len += sizeof("\\"); - refpath = kmalloc(len, GFP_KERNEL); - if (!refpath) - return -ENOMEM; - - scnprintf(refpath, len, "\\%s", target); - } - - npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb)); - kfree(refpath); - - if (IS_ERR(npath)) { - rc = PTR_ERR(npath); - } else { - mutex_lock(&server->refpath_lock); - spin_lock(&server->srv_lock); - 
kfree(server->leaf_fullpath); - server->leaf_fullpath = npath; - spin_unlock(&server->srv_lock); - mutex_unlock(&server->refpath_lock); - } - return rc; -} - static int target_share_matches_server(struct TCP_Server_Info *server, char *share, bool *target_match) { @@ -388,77 +345,22 @@ static int target_share_matches_server(struct TCP_Server_Info *server, char *sha return rc; } -static void __tree_connect_ipc(const unsigned int xid, char *tree, - struct cifs_sb_info *cifs_sb, - struct cifs_ses *ses) -{ - struct TCP_Server_Info *server = ses->server; - struct cifs_tcon *tcon = ses->tcon_ipc; - int rc; - - spin_lock(&ses->ses_lock); - spin_lock(&ses->chan_lock); - if (cifs_chan_needs_reconnect(ses, server) || - ses->ses_status != SES_GOOD) { - spin_unlock(&ses->chan_lock); - spin_unlock(&ses->ses_lock); - cifs_server_dbg(FYI, "%s: skipping ipc reconnect due to disconnected ses\n", - __func__); - return; - } - spin_unlock(&ses->chan_lock); - spin_unlock(&ses->ses_lock); - - cifs_server_lock(server); - scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); - cifs_server_unlock(server); - - rc = server->ops->tree_connect(xid, ses, tree, tcon, - cifs_sb->local_nls); - cifs_server_dbg(FYI, "%s: tree_reconnect %s: %d\n", __func__, tree, rc); - spin_lock(&tcon->tc_lock); - if (rc) { - tcon->status = TID_NEED_TCON; - } else { - tcon->status = TID_GOOD; - tcon->need_reconnect = false; - } - spin_unlock(&tcon->tc_lock); -} - -static void tree_connect_ipc(const unsigned int xid, char *tree, - struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) -{ - struct cifs_ses *ses = tcon->ses; - - __tree_connect_ipc(xid, tree, cifs_sb, ses); - __tree_connect_ipc(xid, tree, cifs_sb, CIFS_DFS_ROOT_SES(ses)); -} - -static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, char *tree, bool islink, - struct dfs_cache_tgt_list *tl) +static int tree_connect_dfs_target(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + char *tree, bool islink, + struct dfs_cache_tgt_list *tl) { - int rc; + const struct smb_version_operations *ops = tcon->ses->server->ops; struct TCP_Server_Info *server = tcon->ses->server; - const struct smb_version_operations *ops = server->ops; - struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses); - char *share = NULL, *prefix = NULL; struct dfs_cache_tgt_iterator *tit; + char *share = NULL, *prefix = NULL; bool target_match; - - tit = dfs_cache_get_tgt_iterator(tl); - if (!tit) { - rc = -ENOENT; - goto out; - } + int rc = -ENOENT; /* Try to tree connect to all dfs targets */ - for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) { - const char *target = dfs_cache_get_tgt_name(tit); - DFS_CACHE_TGT_LIST(ntl); - + for (tit = dfs_cache_get_tgt_iterator(tl); + tit; tit = dfs_cache_get_next_tgt(tl, tit)) { kfree(share); kfree(prefix); share = prefix = NULL; @@ -479,74 +381,21 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t } dfs_cache_noreq_update_tgthint(server->leaf_fullpath + 1, tit); - tree_connect_ipc(xid, tree, cifs_sb, tcon); - scnprintf(tree, MAX_TREE_SIZE, "\\%s", share); - if (!islink) { - rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls); - break; - } - - /* - * If no dfs referrals were returned from link target, then just do a TREE_CONNECT - * to it. Otherwise, cache the dfs referral and then mark current tcp ses for - * reconnect so either the demultiplex thread or the echo worker will reconnect to - * newly resolved target. 
- */ - if (dfs_cache_find(xid, root_ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target, - NULL, &ntl)) { - rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls); - if (rc) - continue; - + rc = ops->tree_connect(xid, tcon->ses, tree, + tcon, tcon->ses->local_nls); + if (islink && !rc && cifs_sb) rc = cifs_update_super_prepath(cifs_sb, prefix); - } else { - /* Target is another dfs share */ - rc = update_server_fullpath(server, cifs_sb, target); - dfs_cache_free_tgts(tl); - - if (!rc) { - rc = -EREMOTE; - list_replace_init(&ntl.tl_list, &tl->tl_list); - } else - dfs_cache_free_tgts(&ntl); - } break; } -out: kfree(share); kfree(prefix); - - return rc; -} - -static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, char *tree, bool islink, - struct dfs_cache_tgt_list *tl) -{ - int rc; - int num_links = 0; - struct TCP_Server_Info *server = tcon->ses->server; - char *old_fullpath = server->leaf_fullpath; - - do { - rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl); - if (!rc || rc != -EREMOTE) - break; - } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS); - /* - * If we couldn't tree connect to any targets from last referral path, then - * retry it from newly resolved dfs referral. - */ - if (rc && server->leaf_fullpath != old_fullpath) - cifs_signal_cifsd_for_reconnect(server, true); - dfs_cache_free_tgts(tl); return rc; } -int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc) +int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon) { int rc; struct TCP_Server_Info *server = tcon->ses->server; @@ -588,7 +437,8 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru cifs_server_lock(server); scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); cifs_server_unlock(server); - rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tree, + tcon, tcon->ses->local_nls); goto out; } @@ -596,14 +446,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru if (!IS_ERR(sb)) cifs_sb = CIFS_SB(sb); - /* - * Tree connect to last share in @tcon->tree_name whether dfs super or - * cached dfs referral was not found. - */ - if (!cifs_sb || !server->leaf_fullpath || + /* Tree connect to last share in @tcon->tree_name if no DFS referral */ + if (!server->leaf_fullpath || dfs_cache_noreq_find(server->leaf_fullpath + 1, &ref, &tl)) { - rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, - cifs_sb ? cifs_sb->local_nls : nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, + tcon, tcon->ses->local_nls); goto out; } diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 110f03df012a..541608b0267e 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -24,8 +24,8 @@ #include "dfs_cache.h" -#define CACHE_HTABLE_SIZE 32 -#define CACHE_MAX_ENTRIES 64 +#define CACHE_HTABLE_SIZE 512 +#define CACHE_MAX_ENTRIES 1024 #define CACHE_MIN_TTL 120 /* 2 minutes */ #define CACHE_DEFAULT_TTL 300 /* 5 minutes */ @@ -173,8 +173,8 @@ static int dfscache_proc_show(struct seq_file *m, void *v) "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - DFS_INTERLINK(ce->hdr_flags) ? 
"yes" : "no", - ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); + str_yes_no(DFS_INTERLINK(ce->hdr_flags)), + ce->path_consumed, str_yes_no(cache_entry_expired(ce))); list_for_each_entry(t, &ce->tlist, list) { seq_printf(m, " %s%s\n", @@ -242,9 +242,9 @@ static inline void dump_ce(const struct cache_entry *ce) ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", + str_yes_no(DFS_INTERLINK(ce->hdr_flags)), ce->path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); + str_yes_no(cache_entry_expired(ce))); dump_tgts(ce); } diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a58a3333ecc3..79de2f2f9c41 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -227,7 +227,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) return; failed: - netfs_read_subreq_terminated(subreq, rc, false); + subreq->error = rc; + netfs_read_subreq_terminated(subreq); } /* @@ -990,7 +991,11 @@ int cifs_open(struct inode *inode, struct file *file) } /* Get the cached handle as SMB2 close is deferred */ - rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { + rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + } else { + rc = cifs_get_readable_path(tcon, full_path, &cfile); + } if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 28c4e576d460..49123f458d0c 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -67,6 +67,12 @@ static const match_table_t cifs_secflavor_tokens = { { Opt_sec_err, NULL } }; +static const match_table_t cifs_upcall_target = { + { Opt_upcall_target_mount, "mount" }, + { Opt_upcall_target_application, "app" }, + { Opt_upcall_target_err, NULL } +}; + const struct fs_parameter_spec smb3_fs_parameters[] = { /* Mount options that take no arguments */ fsparam_flag_no("user_xattr", Opt_user_xattr), @@ -178,6 +184,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("sec", Opt_sec), fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), + fsparam_string("upcall_target", Opt_upcalltarget), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), @@ -248,6 +255,29 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c return 0; } +static int +cifs_parse_upcall_target(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) +{ + substring_t args[MAX_OPT_ARGS]; + + ctx->upcall_target = UPTARGET_UNSPECIFIED; + + switch (match_token(value, cifs_upcall_target, args)) { + case Opt_upcall_target_mount: + ctx->upcall_target = UPTARGET_MOUNT; + break; + case Opt_upcall_target_application: + ctx->upcall_target = UPTARGET_APP; + break; + + default: + cifs_errorf(fc, "bad upcall target: %s\n", value); + return 1; + } + + return 0; +} + static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_loose, "loose" }, { Opt_cache_strict, "strict" }, @@ -890,12 +920,37 @@ do { \ cifs_sb->ctx->field = NULL; \ } while (0) +int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) +{ + if (ses->password && + cifs_sb->ctx->password && + strcmp(ses->password, cifs_sb->ctx->password)) { + kfree_sensitive(cifs_sb->ctx->password); + cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL); + if (!cifs_sb->ctx->password) + return -ENOMEM; + } + if 
(ses->password2 && + cifs_sb->ctx->password2 && + strcmp(ses->password2, cifs_sb->ctx->password2)) { + kfree_sensitive(cifs_sb->ctx->password2); + cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL); + if (!cifs_sb->ctx->password2) { + kfree_sensitive(cifs_sb->ctx->password); + cifs_sb->ctx->password = NULL; + return -ENOMEM; + } + } + return 0; +} + static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; + char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; int rc; @@ -915,14 +970,63 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); + if (need_recon == false) STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); else { + if (ctx->password) { + new_password = kstrdup(ctx->password, GFP_KERNEL); + if (!new_password) + return -ENOMEM; + } else + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + } + + /* + * if a new password2 has been specified, then reset it's value + * inside the ses struct + */ + if (ctx->password2) { + new_password2 = kstrdup(ctx->password2, GFP_KERNEL); + if (!new_password2) { + kfree_sensitive(new_password); + return -ENOMEM; + } + } else + STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2); + + /* + * we may update the passwords in the ses struct below. Make sure we do + * not race with smb2_reconnect + */ + mutex_lock(&ses->session_mutex); + + /* + * smb2_reconnect may swap password and password2 in case session setup + * failed. First get ctx passwords in sync with ses passwords. It should + * be okay to do this even if this function were to return an error at a + * later stage + */ + rc = smb3_sync_session_ctx_passwords(cifs_sb, ses); + if (rc) { + mutex_unlock(&ses->session_mutex); + return rc; + } + + /* + * now that allocations for passwords are done, commit them + */ + if (new_password) { kfree_sensitive(ses->password); - ses->password = kstrdup(ctx->password, GFP_KERNEL); + ses->password = new_password; + } + if (new_password2) { kfree_sensitive(ses->password2); - ses->password2 = kstrdup(ctx->password2, GFP_KERNEL); + ses->password2 = new_password2; } + + mutex_unlock(&ses->session_mutex); + STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); @@ -1443,6 +1547,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; + case Opt_upcalltarget: + if (cifs_parse_upcall_target(fc, param->string, ctx) != 0) + goto cifs_parse_mount_err; + break; case Opt_cache: if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; @@ -1620,6 +1728,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } /* case Opt_ignore: - is ignored as expected ... 
*/ + if (ctx->multiuser && ctx->upcall_target == UPTARGET_MOUNT) { + cifs_errorf(fc, "multiuser mount option not supported with upcalltarget set as 'mount'\n"); + goto cifs_parse_mount_err; + } + return 0; cifs_parse_mount_err: diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 890d6d9d4a59..ac6baa774ad3 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -61,6 +61,12 @@ enum cifs_sec_param { Opt_sec_err }; +enum cifs_upcall_target_param { + Opt_upcall_target_mount, + Opt_upcall_target_application, + Opt_upcall_target_err +}; + enum cifs_param { /* Mount options that take no arguments */ Opt_user_xattr, @@ -114,6 +120,8 @@ enum cifs_param { Opt_multichannel, Opt_compress, Opt_witness, + Opt_is_upcall_target_mount, + Opt_is_upcall_target_application, /* Mount options which take numeric value */ Opt_backupuid, @@ -157,6 +165,7 @@ enum cifs_param { Opt_sec, Opt_cache, Opt_reparse, + Opt_upcalltarget, /* Mount options to be ignored */ Opt_ignore, @@ -198,6 +207,7 @@ struct smb3_fs_context { umode_t file_mode; umode_t dir_mode; enum securityEnum sectype; /* sectype requested via mnt opts */ + enum upcall_target_enum upcall_target; /* where to upcall for mount */ bool sign; /* was signing requested via mnt opts? */ bool ignore_signature:1; bool retry:1; @@ -299,6 +309,7 @@ static inline struct smb3_fs_context *smb3_fc2context(const struct fs_context *f } extern int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx); +extern int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); /* diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index eff3f57235ee..f146e06c97eb 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -598,6 +598,17 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); + } else if (bytes_read == 16) { + /* + * Windows NFS server before Windows Server 2012 + * stores major and minor number in SFU-modified + * style, just as 32-bit numbers. Recognize it. + */ + __u32 mjr; /* major */ + __u32 mnr; /* minor */ + mjr = le32_to_cpu(*(__le32 *)(pbuf+8)); + mnr = le32_to_cpu(*(__le32 *)(pbuf+12)); + fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxCHR\0", pbuf, 8) == 0) { cifs_dbg(FYI, "Char device\n"); @@ -610,6 +621,17 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); + } else if (bytes_read == 16) { + /* + * Windows NFS server before Windows Server 2012 + * stores major and minor number in SFU-modified + * style, just as 32-bit numbers. Recognize it. 
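[Editorial aside, not part of the patch] The cifs_sfu_type() hunks above accept a second on-the-wire layout for Intx block/char device nodes: besides the existing wider payload carrying two little-endian 64-bit fields (major at offset 8, minor at offset 16; the 24-byte length used in the sketch below is inferred, the hunk only shows the field offsets), a 16-byte payload written by pre-2012 Windows NFS servers carries two little-endian 32-bit fields (major at offset 8, minor at offset 12). A minimal sketch of that dispatch, with a made-up helper name and assuming the usual kernel headers (linux/kdev_t.h, asm/byteorder.h), could look like:

/* Illustrative only -- mirrors the offsets/widths shown in the hunk above. */
static int sfu_payload_to_rdev(const char *pbuf, size_t bytes_read, dev_t *rdev)
{
	if (bytes_read == 24) {
		/* two __le64 fields: major at +8, minor at +16 */
		__u64 mjr = le64_to_cpu(*(__le64 *)(pbuf + 8));
		__u64 mnr = le64_to_cpu(*(__le64 *)(pbuf + 16));

		*rdev = MKDEV(mjr, mnr);
		return 0;
	}
	if (bytes_read == 16) {
		/* pre-2012 Windows NFS servers: two __le32 fields at +8 and +12 */
		__u32 mjr = le32_to_cpu(*(__le32 *)(pbuf + 8));
		__u32 mnr = le32_to_cpu(*(__le32 *)(pbuf + 12));

		*rdev = MKDEV(mjr, mnr);
		return 0;
	}
	return -EINVAL;	/* payload too short to carry a device number */
}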
+ */ + __u32 mjr; /* major */ + __u32 mnr; /* minor */ + mjr = le32_to_cpu(*(__le32 *)(pbuf+8)); + mnr = le32_to_cpu(*(__le32 *)(pbuf+12)); + fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("LnxSOCK", pbuf, 8) == 0) { cifs_dbg(FYI, "Socket\n"); @@ -724,6 +746,88 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, #endif } +#define POSIX_TYPE_FILE 0 +#define POSIX_TYPE_DIR 1 +#define POSIX_TYPE_SYMLINK 2 +#define POSIX_TYPE_CHARDEV 3 +#define POSIX_TYPE_BLKDEV 4 +#define POSIX_TYPE_FIFO 5 +#define POSIX_TYPE_SOCKET 6 + +#define POSIX_X_OTH 0000001 +#define POSIX_W_OTH 0000002 +#define POSIX_R_OTH 0000004 +#define POSIX_X_GRP 0000010 +#define POSIX_W_GRP 0000020 +#define POSIX_R_GRP 0000040 +#define POSIX_X_USR 0000100 +#define POSIX_W_USR 0000200 +#define POSIX_R_USR 0000400 +#define POSIX_STICKY 0001000 +#define POSIX_SET_GID 0002000 +#define POSIX_SET_UID 0004000 + +#define POSIX_OTH_MASK 0000007 +#define POSIX_GRP_MASK 0000070 +#define POSIX_USR_MASK 0000700 +#define POSIX_PERM_MASK 0000777 +#define POSIX_FILETYPE_MASK 0070000 + +#define POSIX_FILETYPE_SHIFT 12 + +static u32 wire_perms_to_posix(u32 wire) +{ + u32 mode = 0; + + mode |= (wire & POSIX_X_OTH) ? S_IXOTH : 0; + mode |= (wire & POSIX_W_OTH) ? S_IWOTH : 0; + mode |= (wire & POSIX_R_OTH) ? S_IROTH : 0; + mode |= (wire & POSIX_X_GRP) ? S_IXGRP : 0; + mode |= (wire & POSIX_W_GRP) ? S_IWGRP : 0; + mode |= (wire & POSIX_R_GRP) ? S_IRGRP : 0; + mode |= (wire & POSIX_X_USR) ? S_IXUSR : 0; + mode |= (wire & POSIX_W_USR) ? S_IWUSR : 0; + mode |= (wire & POSIX_R_USR) ? S_IRUSR : 0; + mode |= (wire & POSIX_STICKY) ? S_ISVTX : 0; + mode |= (wire & POSIX_SET_GID) ? S_ISGID : 0; + mode |= (wire & POSIX_SET_UID) ? S_ISUID : 0; + + return mode; +} + +static u32 posix_filetypes[] = { + S_IFREG, + S_IFDIR, + S_IFLNK, + S_IFCHR, + S_IFBLK, + S_IFIFO, + S_IFSOCK +}; + +static u32 wire_filetype_to_posix(u32 wire_type) +{ + if (wire_type >= ARRAY_SIZE(posix_filetypes)) { + pr_warn("Unexpected type %u", wire_type); + return 0; + } + return posix_filetypes[wire_type]; +} + +umode_t wire_mode_to_posix(u32 wire, bool is_dir) +{ + u32 wire_type; + u32 mode; + + wire_type = (wire & POSIX_FILETYPE_MASK) >> POSIX_FILETYPE_SHIFT; + /* older servers do not set POSIX file type in the mode field in the response */ + if ((wire_type == 0) && is_dir) + mode = wire_perms_to_posix(wire) | S_IFDIR; + else + mode = (wire_perms_to_posix(wire) | wire_filetype_to_posix(wire_type)); + return (umode_t)mode; +} + /* Fill a cifs_fattr struct with info from POSIX info struct */ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, @@ -760,20 +864,14 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->HardLinks); - fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode); + fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode), + fattr->cf_cifsattrs & ATTR_DIRECTORY); if (cifs_open_data_reparse(data) && cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) goto out_reparse; - fattr->cf_mode &= ~S_IFMT; - if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { - fattr->cf_mode |= S_IFDIR; - fattr->cf_dtype = DT_DIR; - } else { /* file */ - fattr->cf_mode |= S_IFREG; - fattr->cf_dtype = DT_REG; - } + fattr->cf_dtype = S_DT(fattr->cf_mode); out_reparse: if (S_ISLNK(fattr->cf_mode)) { @@ -1115,6 +1213,7 @@ static int reparse_info_to_fattr(struct 
cifs_open_info_data *data, rc = 0; } else if (iov && server->ops->parse_reparse_point) { rc = server->ops->parse_reparse_point(cifs_sb, + full_path, iov, data); } break; @@ -1848,6 +1947,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } + netfs_wait_for_outstanding_io(inode); cifs_close_deferred_file_under_dentry(tcon, full_path); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -2365,8 +2465,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, } cifs_close_deferred_file_under_dentry(tcon, from_name); - if (d_inode(target_dentry) != NULL) + if (d_inode(target_dentry) != NULL) { + netfs_wait_for_outstanding_io(d_inode(target_dentry)); cifs_close_deferred_file_under_dentry(tcon, to_name); + } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); @@ -2473,13 +2575,10 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - spin_lock(&cfid->fid_lock); if (cfid->time && cifs_i->time > cfid->time) { - spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); return false; } - spin_unlock(&cfid->fid_lock); close_cached_dir(cfid); } /* @@ -3062,6 +3161,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) int rc = -EACCES; __u32 dosattr = 0; __u64 mode = NO_CHANGE_64; + bool posix = cifs_sb_master_tcon(cifs_sb)->posix_extensions; xid = get_xid(); @@ -3152,7 +3252,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) mode = attrs->ia_mode; rc = 0; if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)) { + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) || + posix) { rc = id_mode_to_cifs_acl(inode, full_path, &mode, INVALID_UID, INVALID_GID); if (rc) { diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 2ce193609d8b..56439da4f119 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -72,7 +72,6 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, unsigned long srcfd) { int rc; - struct fd src_file; struct inode *src_inode; cifs_dbg(FYI, "ioctl copychunk range\n"); @@ -89,8 +88,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, return rc; } - src_file = fdget(srcfd); - if (!fd_file(src_file)) { + CLASS(fd, src_file)(srcfd); + if (fd_empty(src_file)) { rc = -EBADF; goto out_drop_write; } @@ -98,20 +97,18 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) { rc = -EBADF; cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); - goto out_fput; + goto out_drop_write; } src_inode = file_inode(fd_file(src_file)); rc = -EINVAL; if (S_ISDIR(src_inode->i_mode)) - goto out_fput; + goto out_drop_write; rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0, src_inode->i_size, 0); if (rc > 0) rc = 0; -out_fput: - fdput(src_file); out_drop_write: mnt_drop_write_file(dst_file); return rc; diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index 0f788031b740..e3f9213131c4 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -196,11 +196,28 @@ static struct vfsmount *cifs_do_automount(struct path *path) struct smb3_fs_context tmp; char *full_path; struct vfsmount *mnt; + struct cifs_sb_info *mntpt_sb; + struct cifs_ses *ses; if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - cur_ctx = 
CIFS_SB(mntpt->d_sb)->ctx; + mntpt_sb = CIFS_SB(mntpt->d_sb); + ses = cifs_sb_master_tcon(mntpt_sb)->ses; + cur_ctx = mntpt_sb->ctx; + + /* + * At this point, the root session should be in the mntpt sb. We should + * bring the sb context passwords in sync with the root session's + * passwords. This would help prevent unnecessary retries and password + * swaps for automounts. + */ + mutex_lock(&ses->session_mutex); + rc = smb3_sync_session_ctx_passwords(mntpt_sb, ses); + mutex_unlock(&ses->session_mutex); + + if (rc) + return ERR_PTR(rc); fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index b3a8f9c6fcff..273358d20a46 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -71,6 +71,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + bool posix = cifs_sb_master_tcon(cifs_sb)->posix_extensions; + bool reparse_need_reval = false; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); int rc; @@ -85,7 +87,21 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, * this spares us an invalidation. */ retry: - if ((fattr->cf_cifsattrs & ATTR_REPARSE) || + if (posix) { + switch (fattr->cf_mode & S_IFMT) { + case S_IFLNK: + case S_IFBLK: + case S_IFCHR: + reparse_need_reval = true; + break; + default: + break; + } + } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { + reparse_need_reval = true; + } + + if (reparse_need_reval || (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)) return; @@ -241,31 +257,29 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, fattr->cf_nlink = le32_to_cpu(info->HardLinks); fattr->cf_cifsattrs = le32_to_cpu(info->DosAttributes); - /* - * Since we set the inode type below we need to mask off - * to avoid strange results if bits set above. - * XXX: why not make server&client use the type bits? - */ - fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT; + if (fattr->cf_cifsattrs & ATTR_REPARSE) + fattr->cf_cifstag = le32_to_cpu(info->ReparseTag); + + /* The Mode field in the response can now include the file type as well */ + fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode), + fattr->cf_cifsattrs & ATTR_DIRECTORY); + fattr->cf_dtype = S_DT(le32_to_cpu(info->Mode)); + + switch (fattr->cf_mode & S_IFMT) { + case S_IFLNK: + case S_IFBLK: + case S_IFCHR: + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + break; + default: + break; + } cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o\n", le32_to_cpu(info->DeviceId), le32_to_cpu(info->ReparseTag), le32_to_cpu(info->Mode)); - if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { - fattr->cf_mode |= S_IFDIR; - fattr->cf_dtype = DT_DIR; - } else { - /* - * mark anything that is not a dir as regular - * file. 
special files should have the REPARSE - * attribute and will be marked as needing revaluation - */ - fattr->cf_mode |= S_IFREG; - fattr->cf_dtype = DT_REG; - } - sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP); } diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index c848b5e88d32..d88b41133e00 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -14,6 +14,12 @@ #include "fs_context.h" #include "reparse.h" +static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, + const unsigned int xid, + const char *full_path, + const char *symname, + bool *directory); + int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname) @@ -24,10 +30,14 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, struct inode *new; struct kvec iov; __le16 *path; + bool directory; char *sym, sep = CIFS_DIR_SEP(cifs_sb); u16 len, plen; int rc = 0; + if (strlen(symname) > REPARSE_SYM_PATH_MAX) + return -ENAMETOOLONG; + sym = kstrdup(symname, GFP_KERNEL); if (!sym) return -ENOMEM; @@ -45,7 +55,19 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, goto out; } - plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX); + /* + * SMB distinguish between symlink to directory and symlink to file. + * They cannot be exchanged (symlink of file type which points to + * directory cannot be resolved and vice-versa). Try to detect if + * the symlink target could be a directory or not. When detection + * fails then treat symlink as a file (non-directory) symlink. + */ + directory = false; + rc = detect_directory_symlink_target(cifs_sb, xid, full_path, symname, &directory); + if (rc < 0) + goto out; + + plen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX); len = sizeof(*buf) + plen * 2; buf = kzalloc(len, GFP_KERNEL); if (!buf) { @@ -69,7 +91,8 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, iov.iov_base = buf; iov.iov_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov, NULL); + tcon, full_path, directory, + &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); else @@ -81,6 +104,144 @@ out: return rc; } +static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, + const unsigned int xid, + const char *full_path, + const char *symname, + bool *directory) +{ + char sep = CIFS_DIR_SEP(cifs_sb); + struct cifs_open_parms oparms; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + const char *basename; + struct cifs_fid fid; + char *resolved_path; + int full_path_len; + int basename_len; + int symname_len; + char *path_sep; + __u32 oplock; + int open_rc; + + /* + * First do some simple check. If the original Linux symlink target ends + * with slash, or last path component is dot or dot-dot then it is for + * sure symlink to the directory. + */ + basename = kbasename(symname); + basename_len = strlen(basename); + if (basename_len == 0 || /* symname ends with slash */ + (basename_len == 1 && basename[0] == '.') || /* last component is "." */ + (basename_len == 2 && basename[0] == '.' && basename[1] == '.')) { /* or ".." */ + *directory = true; + return 0; + } + + /* + * For absolute symlinks it is not possible to determinate + * if it should point to directory or file. 
+ */ + if (symname[0] == '/') { + cifs_dbg(FYI, + "%s: cannot determinate if the symlink target path '%s' " + "is directory or not, creating '%s' as file symlink\n", + __func__, symname, full_path); + return 0; + } + + /* + * If it was not detected as directory yet and the symlink is relative + * then try to resolve the path on the SMB server, check if the path + * exists and determinate if it is a directory or not. + */ + + full_path_len = strlen(full_path); + symname_len = strlen(symname); + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + resolved_path = kzalloc(full_path_len + symname_len + 1, GFP_KERNEL); + if (!resolved_path) { + cifs_put_tlink(tlink); + return -ENOMEM; + } + + /* + * Compose the resolved SMB symlink path from the SMB full path + * and Linux target symlink path. + */ + memcpy(resolved_path, full_path, full_path_len+1); + path_sep = strrchr(resolved_path, sep); + if (path_sep) + path_sep++; + else + path_sep = resolved_path; + memcpy(path_sep, symname, symname_len+1); + if (sep == '\\') + convert_delimiter(path_sep, sep); + + tcon = tlink_tcon(tlink); + oparms = CIFS_OPARMS(cifs_sb, tcon, resolved_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE); + oparms.fid = &fid; + + /* Try to open as a directory (NOT_FILE) */ + oplock = 0; + oparms.create_options = cifs_create_options(cifs_sb, + CREATE_NOT_FILE | OPEN_REPARSE_POINT); + open_rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); + if (open_rc == 0) { + /* Successful open means that the target path is definitely a directory. */ + *directory = true; + tcon->ses->server->ops->close(xid, tcon, &fid); + } else if (open_rc == -ENOTDIR) { + /* -ENOTDIR means that the target path is definitely a file. */ + *directory = false; + } else if (open_rc == -ENOENT) { + /* -ENOENT means that the target path does not exist. */ + cifs_dbg(FYI, + "%s: symlink target path '%s' does not exist, " + "creating '%s' as file symlink\n", + __func__, symname, full_path); + } else { + /* Try to open as a file (NOT_DIR) */ + oplock = 0; + oparms.create_options = cifs_create_options(cifs_sb, + CREATE_NOT_DIR | OPEN_REPARSE_POINT); + open_rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); + if (open_rc == 0) { + /* Successful open means that the target path is definitely a file. */ + *directory = false; + tcon->ses->server->ops->close(xid, tcon, &fid); + } else if (open_rc == -EISDIR) { + /* -EISDIR means that the target path is definitely a directory. */ + *directory = true; + } else { + /* + * This code branch is called when we do not have a permission to + * open the resolved_path or some other client/process denied + * opening the resolved_path. + * + * TODO: Try to use ops->query_dir_first on the parent directory + * of resolved_path, search for basename of resolved_path and + * check if the ATTR_DIRECTORY is set in fi.Attributes. In some + * case this could work also when opening of the path is denied. 
+ */ + cifs_dbg(FYI, + "%s: cannot determinate if the symlink target path '%s' " + "is directory or not, creating '%s' as file symlink\n", + __func__, symname, full_path); + } + } + + kfree(resolved_path); + cifs_put_tlink(tlink); + return 0; +} + static int nfs_set_reparse_buf(struct reparse_posix_data *buf, mode_t mode, dev_t dev, struct kvec *iov) @@ -137,7 +298,7 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, }; new = smb2_get_reparse_inode(&data, inode->i_sb, xid, - tcon, full_path, &iov, NULL); + tcon, full_path, false, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); else @@ -217,8 +378,8 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode, memset(iov, 0, sizeof(*iov)); - /* Exclude $LXDEV xattr for sockets and fifos */ - if (S_ISSOCK(_mode) || S_ISFIFO(_mode)) + /* Exclude $LXDEV xattr for non-device files */ + if (!S_ISBLK(_mode) && !S_ISCHR(_mode)) num_xattrs = ARRAY_SIZE(xattrs) - 1; else num_xattrs = ARRAY_SIZE(xattrs); @@ -283,7 +444,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, data.wsl.eas_len = len; new = smb2_get_reparse_inode(&data, inode->i_sb, - xid, tcon, full_path, + xid, tcon, full_path, false, &reparse_iov, &xattr_iov); if (!IS_ERR(new)) d_instantiate(dentry, new); @@ -374,9 +535,95 @@ static int parse_reparse_posix(struct reparse_posix_data *buf, return 0; } +int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, + bool unicode, bool relative, + const char *full_path, + struct cifs_sb_info *cifs_sb) +{ + char sep = CIFS_DIR_SEP(cifs_sb); + char *linux_target = NULL; + char *smb_target = NULL; + int levels; + int rc; + int i; + + /* Check that length it valid for unicode/non-unicode mode */ + if (!len || (unicode && (len % 2))) { + cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); + rc = -EIO; + goto out; + } + + /* + * Check that buffer does not contain UTF-16 null codepoint in unicode + * mode or null byte in non-unicode mode because Linux cannot process + * symlink with null byte. + */ + if ((unicode && UniStrnlen((wchar_t *)buf, len/2) != len/2) || + (!unicode && strnlen(buf, len) != len)) { + cifs_dbg(VFS, "srv returned null byte in native symlink target location\n"); + rc = -EIO; + goto out; + } + + smb_target = cifs_strndup_from_utf16(buf, len, unicode, cifs_sb->local_nls); + if (!smb_target) { + rc = -ENOMEM; + goto out; + } + + if (smb_target[0] == sep && relative) { + /* + * This is a relative SMB symlink from the top of the share, + * which is the top level directory of the Linux mount point. + * Linux does not support such relative symlinks, so convert + * it to the relative symlink from the current directory. + * full_path is the SMB path to the symlink (from which is + * extracted current directory) and smb_target is the SMB path + * where symlink points, therefore full_path must always be on + * the SMB share. 
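(Illustration, not in the patch itself: assuming sep is '\\', a symlink at full_path "\dir\sub\link" whose smb_target is "\target\file" makes the loop below count two separators after the leading one, i.e. levels == 2; linux_target is then assembled as "..\..\target\file" and convert_delimiter() rewrites it to "../../target/file", which walks up from the symlink's own directory to the share root and back down to the target.)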
+ */ + int smb_target_len = strlen(smb_target)+1; + levels = 0; + for (i = 1; full_path[i]; i++) { /* i=1 to skip leading sep */ + if (full_path[i] == sep) + levels++; + } + linux_target = kmalloc(levels*3 + smb_target_len, GFP_KERNEL); + if (!linux_target) { + rc = -ENOMEM; + goto out; + } + for (i = 0; i < levels; i++) { + linux_target[i*3 + 0] = '.'; + linux_target[i*3 + 1] = '.'; + linux_target[i*3 + 2] = sep; + } + memcpy(linux_target + levels*3, smb_target+1, smb_target_len); /* +1 to skip leading sep */ + } else { + linux_target = smb_target; + smb_target = NULL; + } + + if (sep == '\\') + convert_delimiter(linux_target, '/'); + + rc = 0; + *target = linux_target; + + cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, *target); + +out: + if (rc != 0) + kfree(linux_target); + kfree(smb_target); + return rc; +} + static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, u32 plen, bool unicode, struct cifs_sb_info *cifs_sb, + const char *full_path, struct cifs_open_info_data *data) { unsigned int len; @@ -391,20 +638,64 @@ static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, return -EIO; } - data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs, - len, unicode, + return smb2_parse_native_symlink(&data->symlink_target, + sym->PathBuffer + offs, + len, + unicode, + le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, + full_path, + cifs_sb); +} + +static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) +{ + int len = le16_to_cpu(buf->ReparseDataLength); + int symname_utf8_len; + __le16 *symname_utf16; + int symname_utf16_len; + + if (len <= sizeof(buf->Flags)) { + cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n"); + return -EIO; + } + + /* PathBuffer is in UTF-8 but without trailing null-term byte */ + symname_utf8_len = len - sizeof(buf->Flags); + /* + * Check that buffer does not contain null byte + * because Linux cannot process symlink with null byte. 
+ */ + if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) { + cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n"); + return -EIO; + } + symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL); + if (!symname_utf16) + return -ENOMEM; + symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, + UTF16_LITTLE_ENDIAN, + (wchar_t *) symname_utf16, symname_utf8_len * 2); + if (symname_utf16_len < 0) { + kfree(symname_utf16); + return symname_utf16_len; + } + symname_utf16_len *= 2; /* utf8s_to_utf16s() returns number of u16 items, not byte length */ + + data->symlink_target = cifs_strndup_from_utf16((u8 *)symname_utf16, + symname_utf16_len, true, cifs_sb->local_nls); + kfree(symname_utf16); if (!data->symlink_target) return -ENOMEM; - convert_delimiter(data->symlink_target, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target); - return 0; } int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, + const char *full_path, bool unicode, struct cifs_open_info_data *data) { struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -419,12 +710,20 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, unicode, cifs_sb, data); + plen, unicode, cifs_sb, full_path, data); case IO_REPARSE_TAG_LX_SYMLINK: + return parse_reparse_wsl_symlink( + (struct reparse_wsl_symlink_data_buffer *)buf, + cifs_sb, data); case IO_REPARSE_TAG_AF_UNIX: case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: + if (le16_to_cpu(buf->ReparseDataLength) != 0) { + cifs_dbg(VFS, "srv returned malformed buffer for reparse point: 0x%08x\n", + le32_to_cpu(buf->ReparseTag)); + return -EIO; + } break; default: cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", @@ -435,6 +734,7 @@ int parse_reparse_point(struct reparse_data_buffer *buf, } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, + const char *full_path, struct kvec *rsp_iov, struct cifs_open_info_data *data) { @@ -444,7 +744,7 @@ int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, true, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); } static void wsl_to_fattr(struct cifs_open_info_data *data, @@ -503,44 +803,60 @@ out: fattr->cf_dtype = S_DT(fattr->cf_mode); } -bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, - struct cifs_fattr *fattr, - struct cifs_open_info_data *data) +static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data) { struct reparse_posix_data *buf = data->reparse.posix; - u32 tag = data->reparse.tag; - if (tag == IO_REPARSE_TAG_NFS && buf) { - if (le16_to_cpu(buf->ReparseDataLength) < sizeof(buf->InodeType)) + + if (buf == NULL) + return true; + + if (le16_to_cpu(buf->ReparseDataLength) < sizeof(buf->InodeType)) { + WARN_ON_ONCE(1); + return false; + } + + switch (le64_to_cpu(buf->InodeType)) { + case NFS_SPECFILE_CHR: + if (le16_to_cpu(buf->ReparseDataLength) != sizeof(buf->InodeType) + 8) { + WARN_ON_ONCE(1); return false; - switch (le64_to_cpu(buf->InodeType)) { - case NFS_SPECFILE_CHR: - if (le16_to_cpu(buf->ReparseDataLength) != sizeof(buf->InodeType) + 8) - return false; - fattr->cf_mode |= S_IFCHR; - fattr->cf_rdev = 
reparse_mkdev(buf->DataBuffer); - break; - case NFS_SPECFILE_BLK: - if (le16_to_cpu(buf->ReparseDataLength) != sizeof(buf->InodeType) + 8) - return false; - fattr->cf_mode |= S_IFBLK; - fattr->cf_rdev = reparse_mkdev(buf->DataBuffer); - break; - case NFS_SPECFILE_FIFO: - fattr->cf_mode |= S_IFIFO; - break; - case NFS_SPECFILE_SOCK: - fattr->cf_mode |= S_IFSOCK; - break; - case NFS_SPECFILE_LNK: - fattr->cf_mode |= S_IFLNK; - break; - default: + } + fattr->cf_mode |= S_IFCHR; + fattr->cf_rdev = reparse_mkdev(buf->DataBuffer); + break; + case NFS_SPECFILE_BLK: + if (le16_to_cpu(buf->ReparseDataLength) != sizeof(buf->InodeType) + 8) { WARN_ON_ONCE(1); return false; } - goto out; + fattr->cf_mode |= S_IFBLK; + fattr->cf_rdev = reparse_mkdev(buf->DataBuffer); + break; + case NFS_SPECFILE_FIFO: + fattr->cf_mode |= S_IFIFO; + break; + case NFS_SPECFILE_SOCK: + fattr->cf_mode |= S_IFSOCK; + break; + case NFS_SPECFILE_LNK: + fattr->cf_mode |= S_IFLNK; + break; + default: + WARN_ON_ONCE(1); + return false; } + return true; +} + +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data) +{ + u32 tag = data->reparse.tag; + bool ok; switch (tag) { case IO_REPARSE_TAG_INTERNAL: @@ -560,15 +876,19 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, case IO_REPARSE_TAG_LX_BLK: wsl_to_fattr(data, cifs_sb, tag, fattr); break; + case IO_REPARSE_TAG_NFS: + ok = posix_reparse_to_fattr(cifs_sb, fattr, data); + if (!ok) + return false; + break; case 0: /* SMB1 symlink */ case IO_REPARSE_TAG_SYMLINK: - case IO_REPARSE_TAG_NFS: fattr->cf_mode |= S_IFLNK; break; default: return false; } -out: + fattr->cf_dtype = S_DT(fattr->cf_mode); return true; } diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 158e7b7aae64..ff05b0e75c92 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -12,6 +12,8 @@ #include "fs_context.h" #include "cifsglob.h" +#define REPARSE_SYM_PATH_MAX 4060 + /* * Used only by cifs.ko to ignore reparse points from files when client or * server doesn't support FSCTL_GET_REPARSE_POINT. 
@@ -115,7 +117,9 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, int smb2_mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov, +int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, + const char *full_path, + struct kvec *rsp_iov, struct cifs_open_info_data *data); #endif /* _CIFS_REPARSE_H */ diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 3216f786908f..91d4d409cb1d 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -27,31 +27,6 @@ static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); -bool -is_server_using_iface(struct TCP_Server_Info *server, - struct cifs_server_iface *iface) -{ - struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; - struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; - struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; - struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; - - if (server->dstaddr.ss_family != iface->sockaddr.ss_family) - return false; - if (server->dstaddr.ss_family == AF_INET) { - if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) - return false; - } else if (server->dstaddr.ss_family == AF_INET6) { - if (memcmp(&s6->sin6_addr, &i6->sin6_addr, - sizeof(i6->sin6_addr)) != 0) - return false; - } else { - /* unknown family.. */ - return false; - } - return true; -} - bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) { int i; @@ -115,18 +90,6 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses, ses->chans[chan_index].in_reconnect = false; } -bool -cifs_chan_in_reconnect(struct cifs_ses *ses, - struct TCP_Server_Info *server) -{ - unsigned int chan_index = cifs_ses_get_chan_index(ses, server); - - if (chan_index == CIFS_INVAL_CHAN_INDEX) - return true; /* err on the safer side */ - - return CIFS_CHAN_IN_RECONNECT(ses, chan_index); -} - void cifs_chan_set_need_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) @@ -359,10 +322,7 @@ done: spin_unlock(&ses->chan_lock); } -/* - * update the iface for the channel if necessary. - * Must be called with chan_lock held. - */ +/* update the iface for the channel if necessary. */ void cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) { @@ -487,26 +447,6 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) spin_unlock(&ses->chan_lock); } -/* - * If server is a channel of ses, return the corresponding enclosing - * cifs_chan otherwise return NULL. - */ -struct cifs_chan * -cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) -{ - int i; - - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server == server) { - spin_unlock(&ses->chan_lock); - return &ses->chans[i]; - } - } - spin_unlock(&ses->chan_lock); - return NULL; -} - static int cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) @@ -523,11 +463,11 @@ cifs_ses_add_channel(struct cifs_ses *ses, if (iface->sockaddr.ss_family == AF_INET) cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", - ses, iface->speed, iface->rdma_capable ? "yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv4->sin_addr); else cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", - ses, iface->speed, iface->rdma_capable ? 
"yes" : "no", + ses, iface->speed, str_yes_no(iface->rdma_capable), &ipv6->sin6_addr); /* diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 9a6ece66c4d3..db3695eddcf9 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -994,17 +994,17 @@ static int cifs_query_symlink(const unsigned int xid, } static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, + const char *full_path, struct kvec *rsp_iov, struct cifs_open_info_data *data) { struct reparse_data_buffer *buf; TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; - bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE); u32 plen = le16_to_cpu(io->ByteCount); buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, unicode, data); + return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data); } static bool diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index e301349b0078..e836bc2193dd 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -63,12 +63,12 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) return sym; } -int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path) +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, + const char *full_path, char **path) { struct smb2_symlink_err_rsp *sym; unsigned int sub_offs, sub_len; unsigned int print_offs, print_len; - char *s; if (!cifs_sb || !iov || !iov->iov_base || !iov->iov_len || !path) return -EINVAL; @@ -86,15 +86,13 @@ int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec iov->iov_len < SMB2_SYMLINK_STRUCT_SIZE + print_offs + print_len) return -EINVAL; - s = cifs_strndup_from_utf16((char *)sym->PathBuffer + sub_offs, sub_len, true, - cifs_sb->local_nls); - if (!s) - return -ENOMEM; - convert_delimiter(s, '/'); - cifs_dbg(FYI, "%s: symlink target: %s\n", __func__, s); - - *path = s; - return 0; + return smb2_parse_native_symlink(path, + (char *)sym->PathBuffer + sub_offs, + sub_len, + true, + le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE, + full_path, + cifs_sb); } int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf) @@ -126,6 +124,7 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 goto out; if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) { rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov, + oparms->path, &data->symlink_target); if (!rc) { memset(smb2_data, 0, sizeof(*smb2_data)); diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 4e9e225520a6..a55f0044d30b 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -828,6 +828,7 @@ finished: static int parse_create_response(struct cifs_open_info_data *data, struct cifs_sb_info *cifs_sb, + const char *full_path, const struct kvec *iov) { struct smb2_create_rsp *rsp = iov->iov_base; @@ -841,6 +842,7 @@ static int parse_create_response(struct cifs_open_info_data *data, break; case STATUS_STOPPED_ON_SYMLINK: rc = smb2_parse_symlink_response(cifs_sb, iov, + full_path, &data->symlink_target); if (rc) return rc; @@ -930,18 +932,19 @@ int smb2_query_path_info(const unsigned int xid, switch (rc) { case 0: - rc = parse_create_response(data, cifs_sb, &out_iov[0]); + rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]); break; case -EOPNOTSUPP: /* * BB TODO: When support for special files added to Samba * re-verify 
this path. */ - rc = parse_create_response(data, cifs_sb, &out_iov[0]); + rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]); if (rc || !data->reparse_point) goto out; - cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + if (!tcon->posix_extensions) + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; /* * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create * response. @@ -1198,6 +1201,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, + bool directory, struct kvec *reparse_iov, struct kvec *xattr_iov) { @@ -1217,7 +1221,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_DIR | OPEN_REPARSE_POINT, + (directory ? CREATE_NOT_FILE : CREATE_NOT_DIR) | OPEN_REPARSE_POINT, ACL_NO_MODE); if (xattr_iov) oparms.ea_cctx = xattr_iov; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6b385fce3f2a..7121d9e0f404 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1158,7 +1158,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; unsigned int size[1]; void *data[1]; - struct smb2_file_full_ea_info *ea = NULL; + struct smb2_file_full_ea_info *ea; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; int retries = 0, cur_sleep = 1; @@ -1179,6 +1179,7 @@ replay_again: if (!utf16_path) return -ENOMEM; + ea = NULL; resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; vars = kzalloc(sizeof(*vars), GFP_KERNEL); if (!vars) { @@ -2605,7 +2606,7 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; unsigned long len = smb_rqst_len(server, rqst); - int i, num_padding; + int num_padding; shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { @@ -2614,44 +2615,13 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - - /* No padding needed */ - if (!(len & 7)) - goto finished; - - num_padding = 8 - (len & 7); - if (!smb3_encryption_required(tcon)) { - /* - * If we do not have encryption then we can just add an extra - * iov for the padding. - */ + if (!IS_ALIGNED(len, 8)) { + num_padding = 8 - (len & 7); rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; len += num_padding; - } else { - /* - * We can not add a small padding iov for the encryption case - * because the encryption framework can not handle the padding - * iovs. - * We have to flatten this into a single buffer and add - * the padding to it. 
- */ - for (i = 1; i < rqst->rq_nvec; i++) { - memcpy(rqst->rq_iov[0].iov_base + - rqst->rq_iov[0].iov_len, - rqst->rq_iov[i].iov_base, - rqst->rq_iov[i].iov_len); - rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; - } - memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, - 0, num_padding); - rqst->rq_iov[0].iov_len += num_padding; - len += num_padding; - rqst->rq_nvec = 1; } - - finished: shdr->NextCommand = cpu_to_le32(len); } @@ -2962,7 +2932,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, struct fsctl_get_dfs_referral_req *dfs_req = NULL; struct get_dfs_referral_rsp *dfs_rsp = NULL; u32 dfs_req_size = 0, dfs_rsp_size = 0; - int retry_count = 0; + int retry_once = 0; cifs_dbg(FYI, "%s: path: %s\n", __func__, search_name); @@ -3011,21 +2981,25 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, /* Path to resolve in an UTF-16 null-terminated string */ memcpy(dfs_req->RequestFileName, utf16_path, utf16_path_len); - do { + for (;;) { rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, (char **)&dfs_rsp, &dfs_rsp_size); - if (!is_retryable_error(rc)) + if (fatal_signal_pending(current)) { + rc = -EINTR; + break; + } + if (!is_retryable_error(rc) || retry_once++) break; usleep_range(512, 2048); - } while (++retry_count < 5); + } if (!rc && !dfs_rsp) rc = -EIO; if (rc) { if (!is_retryable_error(rc) && rc != -ENOENT && rc != -EOPNOTSUPP) - cifs_tcon_dbg(VFS, "%s: ioctl error: rc=%d\n", __func__, rc); + cifs_tcon_dbg(FYI, "%s: ioctl error: rc=%d\n", __func__, rc); goto out; } @@ -4079,7 +4053,7 @@ map_oplock_to_lease(u8 oplock) if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) return SMB2_LEASE_WRITE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_II) - return SMB2_LEASE_READ_CACHING_LE; + return SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) return SMB2_LEASE_HANDLE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_WRITE_CACHING_LE; @@ -4414,7 +4388,7 @@ static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size) p = kmalloc(sizeof(*p), GFP_NOFS); if (!p) goto nomem; - folioq_init(p); + folioq_init(p, 0); if (tail) { tail->next = p; p->prev = tail; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index b2f16a7b696d..9f54596a6866 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -216,10 +216,9 @@ static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server, bool from_reconnect) { - int rc = 0; - struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; int xid; + int rc = 0; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -229,11 +228,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, if (tcon == NULL) return 0; - /* - * Need to also skip SMB2_IOCTL because it is used for checking nested dfs links in - * cifs_tree_connect(). 
- */ - if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) + if (smb2_command == SMB2_TREE_CONNECT) return 0; spin_lock(&tcon->tc_lock); @@ -334,8 +329,6 @@ again: } spin_unlock(&server->srv_lock); - nls_codepage = ses->local_nls; - /* * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session @@ -372,7 +365,7 @@ again: } } - rc = cifs_setup_session(0, ses, server, nls_codepage); + rc = cifs_setup_session(0, ses, server, ses->local_nls); if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { /* * Try alternate password for next reconnect (key rotation @@ -406,7 +399,7 @@ skip_sess_setup: if (tcon->use_persistent) tcon->need_reopen_files = true; - rc = cifs_tree_connect(0, tcon, nls_codepage); + rc = cifs_tree_connect(0, tcon); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { @@ -494,6 +487,7 @@ out: case SMB2_CHANGE_NOTIFY: case SMB2_QUERY_INFO: case SMB2_SET_INFO: + case SMB2_IOCTL: rc = -EAGAIN; } failed: @@ -1231,7 +1225,9 @@ SMB2_negotiate(const unsigned int xid, * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context * Set the cipher type manually. */ - if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + if ((server->dialect == SMB30_PROT_ID || + server->dialect == SMB302_PROT_ID) && + (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, @@ -2683,7 +2679,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) ptr += sizeof(struct smb3_acl); /* create one ACE to hold the mode embedded in reserved special SID */ - acelen = setup_special_mode_ACE((struct smb_ace *)ptr, (__u64)mode); + acelen = setup_special_mode_ACE((struct smb_ace *)ptr, false, (__u64)mode); ptr += acelen; acl_size = acelen + sizeof(struct smb3_acl); ace_count = 1; @@ -4504,14 +4500,6 @@ smb2_new_read_req(void **buf, unsigned int *total_len, return rc; } -static void smb2_readv_worker(struct work_struct *work) -{ - struct cifs_io_subrequest *rdata = - container_of(work, struct cifs_io_subrequest, subreq.work); - - netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); -} - static void smb2_readv_callback(struct mid_q_entry *mid) { @@ -4619,15 +4607,17 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + if (rdata->got_bytes) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_response_clear); rdata->credits.value = 0; + rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); - INIT_WORK(&rdata->subreq.work, smb2_readv_worker); - queue_work(cifsiod_wq, &rdata->subreq.work); + netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, server->credits, server->in_flight, @@ -4844,10 +4834,14 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + cifs_stats_bytes_written(tcon, written); + + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case 
MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5016,7 +5010,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", @@ -5160,6 +5154,7 @@ replay_again: cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); + cifs_stats_bytes_written(io_parms->tcon, *nbytes); trace_smb3_write_done(0, 0, xid, req->PersistentFileId, io_parms->tcon->tid, @@ -6208,7 +6203,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, req->StructureSize = cpu_to_le16(36); total_len += 12; - memcpy(req->LeaseKey, lease_key, 16); + memcpy(req->LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); req->LeaseState = lease_state; flags |= CIFS_NO_RSP_BUF; diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index c7e1b149877a..09349fa8da03 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -37,8 +37,6 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); -extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, - __u64 ses_id); extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid); extern int smb2_calc_signature(struct smb_rqst *rqst, @@ -61,6 +59,7 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, + bool directory, struct kvec *reparse_iov, struct kvec *xattr_iov); int smb2_query_reparse_point(const unsigned int xid, @@ -112,7 +111,14 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_read); -int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov, char **path); +int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, + bool unicode, bool relative, + const char *full_path, + struct cifs_sb_info *cifs_sb); +int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, + const struct kvec *iov, + const char *full_path, + char **path); int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf); extern int smb2_unlock_range(struct cifsFileInfo *cfile, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index b486b14bb330..475b36c27f65 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -74,7 +74,7 @@ err: static -int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) +int smb3_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { struct cifs_chan *chan; struct TCP_Server_Info *pserver; @@ -168,16 +168,41 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) return NULL; } -struct cifs_ses * -smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id) +static int smb2_get_sign_key(struct TCP_Server_Info *server, + __u64 ses_id, u8 *key) { struct cifs_ses *ses; + int rc = -ENOENT; + + if (SERVER_IS_CHAN(server)) + server = server->primary_server; spin_lock(&cifs_tcp_ses_lock); - ses = smb2_find_smb_ses_unlocked(server, ses_id); - spin_unlock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { 
+ if (ses->Suid != ses_id) + continue; - return ses; + rc = 0; + spin_lock(&ses->ses_lock); + switch (ses->ses_status) { + case SES_EXITING: /* SMB2_LOGOFF */ + case SES_GOOD: + if (likely(ses->auth_key.response)) { + memcpy(key, ses->auth_key.response, + SMB2_NTLMV2_SESSKEY_SIZE); + } else { + rc = -EIO; + } + break; + default: + rc = -EAGAIN; + break; + } + spin_unlock(&ses->ses_lock); + break; + } + spin_unlock(&cifs_tcp_ses_lock); + return rc; } static struct cifs_tcon * @@ -236,14 +261,16 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct cifs_ses *ses; struct shash_desc *shash = NULL; struct smb_rqst drqst; + __u64 sid = le64_to_cpu(shdr->SessionId); + u8 key[SMB2_NTLMV2_SESSKEY_SIZE]; - ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); - if (unlikely(!ses)) { - cifs_server_dbg(FYI, "%s: Could not find session\n", __func__); - return -ENOENT; + rc = smb2_get_sign_key(server, sid, key); + if (unlikely(rc)) { + cifs_server_dbg(FYI, "%s: [sesid=0x%llx] couldn't find signing key: %d\n", + __func__, sid, rc); + return rc; } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); @@ -260,8 +287,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, shash = server->secmech.hmacsha256; } - rc = crypto_shash_setkey(shash->tfm, ses->auth_key.response, - SMB2_NTLMV2_SESSKEY_SIZE); + rc = crypto_shash_setkey(shash->tfm, key, sizeof(key)); if (rc) { cifs_server_dbg(VFS, "%s: Could not update with response\n", @@ -303,8 +329,6 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) cifs_free_hash(&shash); - if (ses) - cifs_put_smb_ses(ses); return rc; } @@ -570,7 +594,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; - rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); + rc = smb3_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (unlikely(rc)) { cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__); return rc; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 0b52d22a91a0..12cbd3428a6d 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -44,6 +44,8 @@ EM(netfs_trace_tcon_ref_free_ipc, "FRE Ipc ") \ EM(netfs_trace_tcon_ref_free_ipc_fail, "FRE Ipc-F ") \ EM(netfs_trace_tcon_ref_free_reconnect_server, "FRE Reconn") \ + EM(netfs_trace_tcon_ref_get_cached_laundromat, "GET Ch-Lau") \ + EM(netfs_trace_tcon_ref_get_cached_lease_break, "GET Ch-Lea") \ EM(netfs_trace_tcon_ref_get_cancelled_close, "GET Cn-Cls") \ EM(netfs_trace_tcon_ref_get_dfs_refer, "GET DfsRef") \ EM(netfs_trace_tcon_ref_get_find, "GET Find ") \ @@ -52,6 +54,7 @@ EM(netfs_trace_tcon_ref_new, "NEW ") \ EM(netfs_trace_tcon_ref_new_ipc, "NEW Ipc ") \ EM(netfs_trace_tcon_ref_new_reconnect_server, "NEW Reconn") \ + EM(netfs_trace_tcon_ref_put_cached_close, "PUT Ch-Cls") \ EM(netfs_trace_tcon_ref_put_cancelled_close, "PUT Cn-Cls") \ EM(netfs_trace_tcon_ref_put_cancelled_close_fid, "PUT Cn-Fid") \ EM(netfs_trace_tcon_ref_put_cancelled_mid, "PUT Cn-Mid") \ diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 91812150186c..0dc80959ce48 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -418,19 +418,16 @@ out: return rc; } -struct send_req_vars { - struct smb2_transform_hdr tr_hdr; - struct smb_rqst 
rqst[MAX_COMPOUND]; - struct kvec iov; -}; - static int smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst, int flags) { - struct send_req_vars *vars; - struct smb_rqst *cur_rqst; - struct kvec *iov; + struct smb2_transform_hdr tr_hdr; + struct smb_rqst new_rqst[MAX_COMPOUND] = {}; + struct kvec iov = { + .iov_base = &tr_hdr, + .iov_len = sizeof(tr_hdr), + }; int rc; if (flags & CIFS_COMPRESS_REQ) @@ -447,26 +444,15 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - vars = kzalloc(sizeof(*vars), GFP_NOFS); - if (!vars) - return -ENOMEM; - cur_rqst = vars->rqst; - iov = &vars->iov; - - iov->iov_base = &vars->tr_hdr; - iov->iov_len = sizeof(vars->tr_hdr); - cur_rqst[0].rq_iov = iov; - cur_rqst[0].rq_nvec = 1; + new_rqst[0].rq_iov = &iov; + new_rqst[0].rq_nvec = 1; rc = server->ops->init_transform_rq(server, num_rqst + 1, - &cur_rqst[0], rqst); - if (rc) - goto out; - - rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]); - smb3_free_compound_rqst(num_rqst, &cur_rqst[1]); -out: - kfree(vars); + new_rqst, rqst); + if (!rc) { + rc = __smb_send_rqst(server, num_rqst + 1, new_rqst); + smb3_free_compound_rqst(num_rqst, &new_rqst[1]); + } return rc; } diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 9f272cc8f566..3c7c706c797d 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1552,6 +1552,15 @@ struct reparse_symlink_data_buffer { /* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ +/* For IO_REPARSE_TAG_LX_SYMLINK */ +struct reparse_wsl_symlink_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le32 Flags; + __u8 PathBuffer[]; /* Variable Length UTF-8 string without nul-term */ +} __packed; + struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index b931a99ab9c8..5c4c5121fece 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -104,7 +104,7 @@ int build_spnego_ntlmssp_neg_blob(unsigned char **pbuffer, u16 *buflen, oid_len + ntlmssp_len) * 2 + neg_result_len + oid_len + ntlmssp_len; - buf = kmalloc(total_len, GFP_KERNEL); + buf = kmalloc(total_len, KSMBD_DEFAULT_GFP); if (!buf) return -ENOMEM; @@ -140,7 +140,7 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, int total_len = 4 + compute_asn_hdr_len_bytes(neg_result_len) * 2 + neg_result_len; - buf = kmalloc(total_len, GFP_KERNEL); + buf = kmalloc(total_len, KSMBD_DEFAULT_GFP); if (!buf) return -ENOMEM; @@ -217,7 +217,7 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, if (!vlen) return -EINVAL; - conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); + conn->mechToken = kmemdup_nul(value, vlen, KSMBD_DEFAULT_GFP); if (!conn->mechToken) return -ENOMEM; diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 09b20039636e..2a5b4a96bf99 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -151,7 +151,7 @@ static int calc_ntlmv2_hash(struct ksmbd_conn *conn, struct ksmbd_session *sess, /* convert user_name to unicode */ len = strlen(user_name(sess->user)); - uniname = kzalloc(2 + UNICODE_LEN(len), GFP_KERNEL); + uniname = kzalloc(2 + UNICODE_LEN(len), KSMBD_DEFAULT_GFP); if (!uniname) { ret = -ENOMEM; goto out; @@ -175,7 +175,7 @@ static int calc_ntlmv2_hash(struct ksmbd_conn *conn, struct ksmbd_session *sess, /* Convert domain name or conn name to unicode and uppercase */ len = strlen(dname); - domain = kzalloc(2 + 
UNICODE_LEN(len), GFP_KERNEL); + domain = kzalloc(2 + UNICODE_LEN(len), KSMBD_DEFAULT_GFP); if (!domain) { ret = -ENOMEM; goto out; @@ -254,7 +254,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, } len = CIFS_CRYPTO_KEY_SIZE + blen; - construct = kzalloc(len, GFP_KERNEL); + construct = kzalloc(len, KSMBD_DEFAULT_GFP); if (!construct) { rc = -ENOMEM; goto out; @@ -361,7 +361,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (sess_key_len > CIFS_KEY_SIZE) return -EINVAL; - ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), KSMBD_DEFAULT_GFP); if (!ctx_arc4) return -ENOMEM; @@ -451,7 +451,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, chgblob->NegotiateFlags = cpu_to_le32(flags); len = strlen(ksmbd_netbios_name()); - name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL); + name = kmalloc(2 + UNICODE_LEN(len), KSMBD_DEFAULT_GFP); if (!name) return -ENOMEM; @@ -512,6 +512,7 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len) { struct ksmbd_spnego_authen_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; int retval; @@ -540,7 +541,10 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, goto out; } - user = ksmbd_alloc_user(&resp->login_response); + if (resp->login_response.status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(resp->login_response.account); + + user = ksmbd_alloc_user(&resp->login_response, resp_ext); if (!user) { ksmbd_debug(AUTH, "login failure\n"); retval = -ENOMEM; @@ -1012,6 +1016,8 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, ses_enc_key = enc ? 
sess->smb3encryptionkey : sess->smb3decryptionkey; + if (enc) + ksmbd_user_session_get(sess); memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); return 0; @@ -1039,7 +1045,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, if (!nvec) return NULL; - nr_entries = kcalloc(nvec, sizeof(int), GFP_KERNEL); + nr_entries = kcalloc(nvec, sizeof(int), KSMBD_DEFAULT_GFP); if (!nr_entries) return NULL; @@ -1059,7 +1065,8 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, /* Add two entries for transform header and signature */ total_entries += 2; - sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL); + sg = kmalloc_array(total_entries, sizeof(struct scatterlist), + KSMBD_DEFAULT_GFP); if (!sg) { kfree(nr_entries); return NULL; @@ -1159,7 +1166,7 @@ int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, goto free_ctx; } - req = aead_request_alloc(tfm, GFP_KERNEL); + req = aead_request_alloc(tfm, KSMBD_DEFAULT_GFP); if (!req) { rc = -ENOMEM; goto free_ctx; @@ -1178,7 +1185,7 @@ int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov, } iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); + iv = kzalloc(iv_len, KSMBD_DEFAULT_GFP); if (!iv) { rc = -ENOMEM; goto free_sg; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index aa2a37a7ce84..f8a40f65db6a 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -52,7 +52,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) { struct ksmbd_conn *conn; - conn = kzalloc(sizeof(struct ksmbd_conn), GFP_KERNEL); + conn = kzalloc(sizeof(struct ksmbd_conn), KSMBD_DEFAULT_GFP); if (!conn) return NULL; @@ -119,8 +119,8 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) requests_queue = &conn->requests; + atomic_inc(&conn->req_running); if (requests_queue) { - atomic_inc(&conn->req_running); spin_lock(&conn->request_lock); list_add_tail(&work->request_entry, requests_queue); spin_unlock(&conn->request_lock); @@ -131,11 +131,14 @@ void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + atomic_dec(&conn->req_running); + if (waitqueue_active(&conn->req_running_q)) + wake_up(&conn->req_running_q); + if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) return; - atomic_dec(&conn->req_running); spin_lock(&conn->request_lock); list_del_init(&work->request_entry); spin_unlock(&conn->request_lock); @@ -307,7 +310,7 @@ int ksmbd_conn_handler_loop(void *p) { struct ksmbd_conn *conn = (struct ksmbd_conn *)p; struct ksmbd_transport *t = conn->transport; - unsigned int pdu_size, max_allowed_pdu_size; + unsigned int pdu_size, max_allowed_pdu_size, max_req; char hdr_buf[4] = {0,}; int size; @@ -317,6 +320,7 @@ int ksmbd_conn_handler_loop(void *p) if (t->ops->prepare && t->ops->prepare(t)) goto out; + max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); while (ksmbd_conn_alive(conn)) { @@ -326,6 +330,13 @@ int ksmbd_conn_handler_loop(void *p) kvfree(conn->request_buf); conn->request_buf = NULL; +recheck: + if (atomic_read(&conn->req_running) + 1 > max_req) { + wait_event_interruptible(conn->req_running_q, + atomic_read(&conn->req_running) < max_req); + goto recheck; + } + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf), -1); if (size != sizeof(hdr_buf)) break; @@ -358,7 +369,7 @@ int ksmbd_conn_handler_loop(void *p) /* 4 for rfc1002 length field */ /* 1 for implied bcc[0] 
*/ size = pdu_size + 4 + 1; - conn->request_buf = kvmalloc(size, GFP_KERNEL); + conn->request_buf = kvmalloc(size, KSMBD_DEFAULT_GFP); if (!conn->request_buf) break; @@ -403,6 +414,7 @@ int ksmbd_conn_handler_loop(void *p) out: ksmbd_conn_set_releasing(conn); /* Wait till all reference dropped to the Server object*/ + ksmbd_debug(CONN, "Wait for all pending requests(%d)\n", atomic_read(&conn->r_count)); wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); if (IS_ENABLED(CONFIG_UNICODE)) @@ -461,7 +473,7 @@ again: up_read(&conn_list_lock); if (!list_empty(&conn_list)) { - schedule_timeout_interruptible(HZ / 10); /* 100ms */ + msleep(100); goto again; } } diff --git a/fs/smb/server/crypto_ctx.c b/fs/smb/server/crypto_ctx.c index 81488d04199d..ce733dc9a4a3 100644 --- a/fs/smb/server/crypto_ctx.c +++ b/fs/smb/server/crypto_ctx.c @@ -89,7 +89,7 @@ static struct shash_desc *alloc_shash_desc(int id) return NULL; shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm), - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!shash) crypto_free_shash(tfm); else @@ -133,7 +133,7 @@ static struct ksmbd_crypto_ctx *ksmbd_find_crypto_ctx(void) ctx_list.avail_ctx++; spin_unlock(&ctx_list.ctx_lock); - ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), KSMBD_DEFAULT_GFP); if (!ctx) { spin_lock(&ctx_list.ctx_lock); ctx_list.avail_ctx--; @@ -258,7 +258,7 @@ int ksmbd_crypto_create(void) init_waitqueue_head(&ctx_list.ctx_wait); ctx_list.avail_ctx = 1; - ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), KSMBD_DEFAULT_GFP); if (!ctx) return -ENOMEM; list_add(&ctx->list, &ctx_list.idle_ctx); diff --git a/fs/smb/server/glob.h b/fs/smb/server/glob.h index d528b20b37a8..4ea187af2348 100644 --- a/fs/smb/server/glob.h +++ b/fs/smb/server/glob.h @@ -44,4 +44,6 @@ extern int ksmbd_debug_types; #define UNICODE_LEN(x) ((x) * 2) +#define KSMBD_DEFAULT_GFP (GFP_KERNEL | __GFP_RETRY_MAYFAIL) + #endif /* __KSMBD_GLOB_H */ diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 38e6fd2da3b8..3d01d9d15293 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -51,6 +51,9 @@ * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response) * This event is to make kerberos authentication to be processed in * userspace. + * + * - KSMBD_EVENT_LOGIN_REQUEST_EXT/RESPONSE_EXT(ksmbd_login_request_ext/response_ext) + * This event is to get user account extension info to user IPC daemon. */ #define KSMBD_GENL_NAME "SMBD_GENL" @@ -146,6 +149,16 @@ struct ksmbd_login_response { }; /* + * IPC user login response extension. + */ +struct ksmbd_login_response_ext { + __u32 handle; + __s32 ngroups; /* supplementary group count */ + __s8 reserved[128]; /* Reserved room */ + __s8 ____payload[]; +}; + +/* * IPC request to fetch net share config. */ struct ksmbd_share_config_request { @@ -306,6 +319,9 @@ enum ksmbd_event { KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, + KSMBD_EVENT_LOGIN_REQUEST_EXT, + KSMBD_EVENT_LOGIN_RESPONSE_EXT, + __KSMBD_EVENT_MAX, KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1 }; @@ -336,6 +352,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) #define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) +#define KSMBD_USER_FLAG_EXTENSION BIT(6) /* * Share config flags. 
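To make the new login-extension IPC concrete, a minimal user-space sketch follows. It is illustrative only: build_login_ext() is a hypothetical helper, not code from ksmbd-tools, and the layout is inferred from struct ksmbd_login_response_ext in the ksmbd_netlink.h hunk above; the daemon allocates the fixed header plus ngroups * sizeof(gid_t) bytes and packs the supplementary gids back-to-back in ____payload.

#include <linux/types.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>

/* Relies on struct ksmbd_login_response_ext as declared in the ksmbd_netlink.h hunk above. */
static struct ksmbd_login_response_ext *
build_login_ext(__u32 handle, const gid_t *gids, __s32 ngroups)
{
	size_t payload = (size_t)ngroups * sizeof(gid_t);
	struct ksmbd_login_response_ext *ext;

	/* ____payload is a flexible array member: allocate the header plus the gid array */
	ext = calloc(1, sizeof(*ext) + payload);
	if (!ext)
		return NULL;
	ext->handle = handle;
	ext->ngroups = ngroups;		/* the kernel rejects values above NGROUPS_MAX */
	memcpy(ext->____payload, gids, payload);
	return ext;
}

On the kernel side, ksmbd_alloc_user() in the user_config.c hunk below checks ngroups against NGROUPS_MAX and kmemdup()s exactly ngroups * sizeof(gid_t) bytes from ____payload into user->sgid.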
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index d7c676c151e2..4af2e6007c29 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -18,7 +18,7 @@ static struct workqueue_struct *ksmbd_wq; struct ksmbd_work *ksmbd_alloc_work_struct(void) { - struct ksmbd_work *work = kmem_cache_zalloc(work_cache, GFP_KERNEL); + struct ksmbd_work *work = kmem_cache_zalloc(work_cache, KSMBD_DEFAULT_GFP); if (work) { work->compound_fid = KSMBD_NO_FID; @@ -30,7 +30,7 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void) INIT_LIST_HEAD(&work->aux_read_list); work->iov_alloc_cnt = 4; work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec), - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!work->iov) { kmem_cache_free(work_cache, work); work = NULL; @@ -114,7 +114,7 @@ static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len, if (aux_size) { need_iov_cnt++; - ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL); + ar = kmalloc(sizeof(struct aux_read), KSMBD_DEFAULT_GFP); if (!ar) return -ENOMEM; } @@ -125,7 +125,7 @@ static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len, work->iov_alloc_cnt += 4; new = krealloc(work->iov, sizeof(struct kvec) * work->iov_alloc_cnt, - GFP_KERNEL | __GFP_ZERO); + KSMBD_DEFAULT_GFP | __GFP_ZERO); if (!new) { kfree(ar); work->iov_alloc_cnt -= 4; @@ -169,7 +169,7 @@ int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, int allocate_interim_rsp_buf(struct ksmbd_work *work) { - work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, KSMBD_DEFAULT_GFP); if (!work->response_buf) return -ENOMEM; work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; diff --git a/fs/smb/server/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c index a18e27e9e0cd..0e2ae994ab52 100644 --- a/fs/smb/server/mgmt/ksmbd_ida.c +++ b/fs/smb/server/mgmt/ksmbd_ida.c @@ -4,31 +4,32 @@ */ #include "ksmbd_ida.h" +#include "../glob.h" int ksmbd_acquire_smb2_tid(struct ida *ida) { - return ida_alloc_range(ida, 1, 0xFFFFFFFE, GFP_KERNEL); + return ida_alloc_range(ida, 1, 0xFFFFFFFE, KSMBD_DEFAULT_GFP); } int ksmbd_acquire_smb2_uid(struct ida *ida) { int id; - id = ida_alloc_min(ida, 1, GFP_KERNEL); + id = ida_alloc_min(ida, 1, KSMBD_DEFAULT_GFP); if (id == 0xFFFE) - id = ida_alloc_min(ida, 1, GFP_KERNEL); + id = ida_alloc_min(ida, 1, KSMBD_DEFAULT_GFP); return id; } int ksmbd_acquire_async_msg_id(struct ida *ida) { - return ida_alloc_min(ida, 1, GFP_KERNEL); + return ida_alloc_min(ida, 1, KSMBD_DEFAULT_GFP); } int ksmbd_acquire_id(struct ida *ida) { - return ida_alloc(ida, GFP_KERNEL); + return ida_alloc(ida, KSMBD_DEFAULT_GFP); } void ksmbd_release_id(struct ida *ida, int id) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index d8d03070ae44..d3d5f99bdd34 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -102,11 +102,11 @@ static int parse_veto_list(struct ksmbd_share_config *share, if (!sz) break; - p = kzalloc(sizeof(struct ksmbd_veto_pattern), GFP_KERNEL); + p = kzalloc(sizeof(struct ksmbd_veto_pattern), KSMBD_DEFAULT_GFP); if (!p) return -ENOMEM; - p->pattern = kstrdup(veto_list, GFP_KERNEL); + p->pattern = kstrdup(veto_list, KSMBD_DEFAULT_GFP); if (!p->pattern) { kfree(p); return -ENOMEM; @@ -150,14 +150,14 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, goto out; } - share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL); + share = 
kzalloc(sizeof(struct ksmbd_share_config), KSMBD_DEFAULT_GFP); if (!share) goto out; share->flags = resp->flags; atomic_set(&share->refcount, 1); INIT_LIST_HEAD(&share->veto_list); - share->name = kstrdup(name, GFP_KERNEL); + share->name = kstrdup(name, KSMBD_DEFAULT_GFP); if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { int path_len = PATH_MAX; @@ -166,7 +166,7 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, path_len = resp->payload_sz - resp->veto_list_sz; share->path = kstrndup(ksmbd_share_config_path(resp), path_len, - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (share->path) { share->path_sz = strlen(share->path); while (share->path_sz > 1 && diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index 94a52a75014a..ecfc57508671 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -31,7 +31,8 @@ ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) if (!sc) return status; - tree_conn = kzalloc(sizeof(struct ksmbd_tree_connect), GFP_KERNEL); + tree_conn = kzalloc(sizeof(struct ksmbd_tree_connect), + KSMBD_DEFAULT_GFP); if (!tree_conn) { status.ret = -ENOMEM; goto out_error; @@ -80,7 +81,7 @@ ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) init_waitqueue_head(&tree_conn->refcount_q); ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn, - GFP_KERNEL)); + KSMBD_DEFAULT_GFP)); if (ret) { status.ret = -ENOMEM; goto out_error; diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c index 279d00feff21..56c9a38ca878 100644 --- a/fs/smb/server/mgmt/user_config.c +++ b/fs/smb/server/mgmt/user_config.c @@ -12,6 +12,7 @@ struct ksmbd_user *ksmbd_login_user(const char *account) { struct ksmbd_login_response *resp; + struct ksmbd_login_response_ext *resp_ext = NULL; struct ksmbd_user *user = NULL; resp = ksmbd_ipc_login_request(account); @@ -21,41 +22,69 @@ struct ksmbd_user *ksmbd_login_user(const char *account) if (!(resp->status & KSMBD_USER_FLAG_OK)) goto out; - user = ksmbd_alloc_user(resp); + if (resp->status & KSMBD_USER_FLAG_EXTENSION) + resp_ext = ksmbd_ipc_login_request_ext(account); + + user = ksmbd_alloc_user(resp, resp_ext); out: kvfree(resp); return user; } -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext) { - struct ksmbd_user *user = NULL; + struct ksmbd_user *user; - user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL); + user = kmalloc(sizeof(struct ksmbd_user), KSMBD_DEFAULT_GFP); if (!user) return NULL; - user->name = kstrdup(resp->account, GFP_KERNEL); + user->name = kstrdup(resp->account, KSMBD_DEFAULT_GFP); user->flags = resp->status; user->gid = resp->gid; user->uid = resp->uid; user->passkey_sz = resp->hash_sz; - user->passkey = kmalloc(resp->hash_sz, GFP_KERNEL); + user->passkey = kmalloc(resp->hash_sz, KSMBD_DEFAULT_GFP); if (user->passkey) memcpy(user->passkey, resp->hash, resp->hash_sz); - if (!user->name || !user->passkey) { - kfree(user->name); - kfree(user->passkey); - kfree(user); - user = NULL; + user->ngroups = 0; + user->sgid = NULL; + + if (!user->name || !user->passkey) + goto err_free; + + if (resp_ext) { + if (resp_ext->ngroups > NGROUPS_MAX) { + pr_err("ngroups(%u) from login response exceeds max groups(%d)\n", + resp_ext->ngroups, NGROUPS_MAX); + goto err_free; + } + + user->sgid = kmemdup(resp_ext->____payload, + resp_ext->ngroups * 
sizeof(gid_t), + KSMBD_DEFAULT_GFP); + if (!user->sgid) + goto err_free; + + user->ngroups = resp_ext->ngroups; + ksmbd_debug(SMB, "supplementary groups : %d\n", user->ngroups); } + return user; + +err_free: + kfree(user->name); + kfree(user->passkey); + kfree(user); + return NULL; } void ksmbd_free_user(struct ksmbd_user *user) { ksmbd_ipc_logout_request(user->name, user->flags); + kfree(user->sgid); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h index e068a19fd904..8c227b8d4954 100644 --- a/fs/smb/server/mgmt/user_config.h +++ b/fs/smb/server/mgmt/user_config.h @@ -18,6 +18,8 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + int ngroups; + gid_t *sgid; }; static inline bool user_guest(struct ksmbd_user *user) @@ -60,7 +62,8 @@ static inline unsigned int user_gid(struct ksmbd_user *user) } struct ksmbd_user *ksmbd_login_user(const char *account); -struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp, + struct ksmbd_login_response_ext *resp_ext); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 99416ce9f501..71c6939dfbf1 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -90,7 +90,7 @@ static int __rpc_method(char *rpc_name) int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { - struct ksmbd_session_rpc *entry; + struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; int method; @@ -98,7 +98,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!method) return -EINVAL; - entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL); + entry = kzalloc(sizeof(struct ksmbd_session_rpc), KSMBD_DEFAULT_GFP); if (!entry) return -ENOMEM; @@ -106,16 +106,19 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) goto free_entry; - xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); + old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP); + if (xa_is_err(old)) + goto free_id; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto free_id; + goto erase_xa; kvfree(resp); return entry->id; -free_id: +erase_xa: xa_erase(&sess->rpc_handle_list, entry->id); +free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); @@ -175,11 +178,13 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; + down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (sess->state != SMB2_SESSION_VALID || - time_after(jiffies, - sess->last_active + SMB2_SESSION_TIMEOUT)) { + if (atomic_read(&sess->refcnt) == 0 && + (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT))) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); ksmbd_session_destroy(sess); @@ -187,6 +192,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -195,7 +201,7 @@ int ksmbd_session_register(struct ksmbd_conn *conn, sess->dialect = conn->dialect; 
memcpy(sess->ClientGUID, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE); ksmbd_expire_session(conn); - return xa_err(xa_store(&conn->sessions, sess->id, sess, GFP_KERNEL)); + return xa_err(xa_store(&conn->sessions, sess->id, sess, KSMBD_DEFAULT_GFP)); } static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess) @@ -228,7 +234,6 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } - up_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { @@ -248,6 +253,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, @@ -257,8 +263,10 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, down_read(&conn->session_lock); sess = xa_load(&conn->sessions, id); - if (sess) + if (sess) { sess->last_active = jiffies; + ksmbd_user_session_get(sess); + } up_read(&conn->session_lock); return sess; } @@ -270,7 +278,7 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); if (sess) - sess->last_active = jiffies; + ksmbd_user_session_get(sess); up_read(&sessions_table_lock); return sess; @@ -289,12 +297,28 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, return sess; } +void ksmbd_user_session_get(struct ksmbd_session *sess) +{ + atomic_inc(&sess->refcnt); +} + +void ksmbd_user_session_put(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (atomic_read(&sess->refcnt) <= 0) + WARN_ON(1); + else + atomic_dec(&sess->refcnt); +} + struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, u64 sess_id) { struct preauth_session *sess; - sess = kmalloc(sizeof(struct preauth_session), GFP_KERNEL); + sess = kmalloc(sizeof(struct preauth_session), KSMBD_DEFAULT_GFP); if (!sess) return NULL; @@ -378,7 +402,7 @@ static struct ksmbd_session *__session_create(int protocol) if (protocol != CIFDS_SESSION_FLAG_SMB2) return NULL; - sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL); + sess = kzalloc(sizeof(struct ksmbd_session), KSMBD_DEFAULT_GFP); if (!sess) return NULL; @@ -393,6 +417,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); + atomic_set(&sess->refcnt, 1); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index dc9fded2cd43..c1c4b20bd5c6 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -61,6 +61,8 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; rwlock_t tree_conns_lock; + + atomic_t refcnt; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +void ksmbd_user_session_get(struct ksmbd_session *sess); +void ksmbd_user_session_put(struct ksmbd_session *sess); #endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c index 1a5faa6f6e7b..cb2a11ffb23f 100644 --- a/fs/smb/server/misc.c +++ b/fs/smb/server/misc.c @@ -165,7 +165,7 @@ char 
*convert_to_nt_pathname(struct ksmbd_share_config *share, char *pathname, *ab_pathname, *nt_pathname; int share_path_len = share->path_sz; - pathname = kmalloc(PATH_MAX, GFP_KERNEL); + pathname = kmalloc(PATH_MAX, KSMBD_DEFAULT_GFP); if (!pathname) return ERR_PTR(-EACCES); @@ -180,7 +180,8 @@ char *convert_to_nt_pathname(struct ksmbd_share_config *share, goto free_pathname; } - nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL); + nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, + KSMBD_DEFAULT_GFP); if (!nt_pathname) { nt_pathname = ERR_PTR(-ENOMEM); goto free_pathname; @@ -232,7 +233,7 @@ char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name) char *cf_name; int cf_len; - cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL); + cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, KSMBD_DEFAULT_GFP); if (!cf_name) return ERR_PTR(-ENOMEM); @@ -294,7 +295,7 @@ char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name) path_len = share->path_sz; name_len = strlen(name); - new_name = kmalloc(path_len + name_len + 2, GFP_KERNEL); + new_name = kmalloc(path_len + name_len + 2, KSMBD_DEFAULT_GFP); if (!new_name) return new_name; @@ -320,7 +321,7 @@ char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info, if (!sz) return NULL; - conv = kmalloc(sz, GFP_KERNEL); + conv = kmalloc(sz, KSMBD_DEFAULT_GFP); if (!conv) return NULL; diff --git a/fs/smb/server/ndr.c b/fs/smb/server/ndr.c index 3507d8f89074..58d71560f626 100644 --- a/fs/smb/server/ndr.c +++ b/fs/smb/server/ndr.c @@ -18,7 +18,7 @@ static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz) { char *data; - data = krealloc(n->data, n->offset + sz + 1024, GFP_KERNEL); + data = krealloc(n->data, n->offset + sz + 1024, KSMBD_DEFAULT_GFP); if (!data) return -ENOMEM; @@ -174,7 +174,7 @@ int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) n->offset = 0; n->length = 1024; - n->data = kzalloc(n->length, GFP_KERNEL); + n->data = kzalloc(n->length, KSMBD_DEFAULT_GFP); if (!n->data) return -ENOMEM; @@ -350,7 +350,7 @@ int ndr_encode_posix_acl(struct ndr *n, n->offset = 0; n->length = 1024; - n->data = kzalloc(n->length, GFP_KERNEL); + n->data = kzalloc(n->length, KSMBD_DEFAULT_GFP); if (!n->data) return -ENOMEM; @@ -401,7 +401,7 @@ int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) n->offset = 0; n->length = 2048; - n->data = kzalloc(n->length, GFP_KERNEL); + n->data = kzalloc(n->length, KSMBD_DEFAULT_GFP); if (!n->data) return -ENOMEM; @@ -505,7 +505,7 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) return ret; acl->sd_size = n->length - n->offset; - acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL); + acl->sd_buf = kzalloc(acl->sd_size, KSMBD_DEFAULT_GFP); if (!acl->sd_buf) return -ENOMEM; diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 4142c7ad5fa9..3a3fe4afbdf0 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -34,7 +34,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, struct ksmbd_session *sess = work->sess; struct oplock_info *opinfo; - opinfo = kzalloc(sizeof(struct oplock_info), GFP_KERNEL); + opinfo = kzalloc(sizeof(struct oplock_info), KSMBD_DEFAULT_GFP); if (!opinfo) return NULL; @@ -94,7 +94,7 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) { struct lease *lease; - lease = kmalloc(sizeof(struct lease), GFP_KERNEL); + lease = kmalloc(sizeof(struct lease), KSMBD_DEFAULT_GFP); if (!lease) return -ENOMEM; @@ -709,7 +709,7 @@ static 
int smb2_oplock_break_noti(struct oplock_info *opinfo) if (!work) return -ENOMEM; - br_info = kmalloc(sizeof(struct oplock_break_info), GFP_KERNEL); + br_info = kmalloc(sizeof(struct oplock_break_info), KSMBD_DEFAULT_GFP); if (!br_info) { ksmbd_free_work_struct(work); return -ENOMEM; @@ -812,7 +812,7 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) if (!work) return -ENOMEM; - br_info = kmalloc(sizeof(struct lease_break_info), GFP_KERNEL); + br_info = kmalloc(sizeof(struct lease_break_info), KSMBD_DEFAULT_GFP); if (!br_info) { ksmbd_free_work_struct(work); return -ENOMEM; @@ -1057,7 +1057,7 @@ static int add_lease_global_list(struct oplock_info *opinfo) } read_unlock(&lease_list_lock); - lb = kmalloc(sizeof(struct lease_table), GFP_KERNEL); + lb = kmalloc(sizeof(struct lease_table), KSMBD_DEFAULT_GFP); if (!lb) return -ENOMEM; @@ -1499,7 +1499,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (IS_ERR_OR_NULL(cc)) return NULL; - lreq = kzalloc(sizeof(struct lease_ctx_info), GFP_KERNEL); + lreq = kzalloc(sizeof(struct lease_ctx_info), KSMBD_DEFAULT_GFP); if (!lreq) return NULL; diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 231d2d224656..601e7fcbcf1e 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -47,7 +47,7 @@ static int ___server_conf_set(int idx, char *val) return -EINVAL; kfree(server_conf.conf[idx]); - server_conf.conf[idx] = kstrdup(val, GFP_KERNEL); + server_conf.conf[idx] = kstrdup(val, KSMBD_DEFAULT_GFP); if (!server_conf.conf[idx]) return -ENOMEM; return 0; @@ -247,6 +247,8 @@ send: if (rc < 0) conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); } + if (work->sess) + ksmbd_user_session_put(work->sess); ksmbd_conn_write(work); } @@ -273,8 +275,12 @@ static void handle_ksmbd_work(struct work_struct *wk) * disconnection. waitqueue_active is safe because it * uses atomic operation for condition. 
*/ + atomic_inc(&conn->refcnt); if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) wake_up(&conn->r_count_q); + + if (atomic_dec_and_test(&conn->refcnt)) + kfree(conn); } /** @@ -289,6 +295,10 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) struct ksmbd_work *work; int err; + err = ksmbd_init_smb_server(conn); + if (err) + return 0; + work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -299,12 +309,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) work->request_buf = conn->request_buf; conn->request_buf = NULL; - err = ksmbd_init_smb_server(work); - if (err) { - ksmbd_free_work_struct(work); - return 0; - } - ksmbd_conn_enqueue_request(work); atomic_inc(&conn->r_count); /* update activity on connection */ @@ -357,6 +361,7 @@ static int server_conf_init(void) server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5; #endif + server_conf.max_inflight_req = SMB2_MAX_CREDITS; return 0; } @@ -409,7 +414,7 @@ static int __queue_ctrl_work(int type) { struct server_ctrl_struct *ctrl; - ctrl = kmalloc(sizeof(struct server_ctrl_struct), GFP_KERNEL); + ctrl = kmalloc(sizeof(struct server_ctrl_struct), KSMBD_DEFAULT_GFP); if (!ctrl) return -ENOMEM; diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 4fc529335271..94187628ff08 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -42,6 +42,7 @@ struct ksmbd_server_config { struct smb_sid domain_sid; unsigned int auth_mechs; unsigned int max_connections; + unsigned int max_inflight_req; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 797b0f24097b..772deec5b90f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -67,8 +67,10 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) return false; sess = ksmbd_session_lookup_all(conn, id); - if (sess) + if (sess) { + ksmbd_user_session_put(sess); return true; + } pr_err("Invalid user session id: %llu\n", id); return false; } @@ -551,7 +553,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) if (le32_to_cpu(hdr->NextCommand) > 0) sz = large_sz; - work->response_buf = kvzalloc(sz, GFP_KERNEL); + work->response_buf = kvzalloc(sz, KSMBD_DEFAULT_GFP); if (!work->response_buf) return -ENOMEM; @@ -693,6 +695,9 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) struct smb2_hdr *rsp_hdr; struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); + if (!in_work) + return; + if (allocate_interim_rsp_buf(in_work)) { pr_err("smb_allocate_rsp_buf failed!\n"); ksmbd_free_work_struct(in_work); @@ -1095,6 +1100,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) return rc; } + ksmbd_conn_lock(conn); smb2_buf_len = get_rfc1002_len(work->request_buf); smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects); if (smb2_neg_size > smb2_buf_len) { @@ -1145,7 +1151,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) case SMB311_PROT_ID: conn->preauth_info = kzalloc(sizeof(struct preauth_integrity_info), - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!conn->preauth_info) { rc = -ENOMEM; rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -1245,6 +1251,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + ksmbd_conn_unlock(conn); if (rc) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; @@ -1264,7 +1271,7 @@ static int alloc_preauth_hash(struct ksmbd_session *sess, return 0; sess->Preauth_HashValue = 
kmemdup(conn->preauth_info->Preauth_HashValue, - PREAUTH_HASHVALUE_SIZE, GFP_KERNEL); + PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP); if (!sess->Preauth_HashValue) return -ENOMEM; @@ -1350,7 +1357,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, sz = sizeof(struct challenge_message); sz += (strlen(ksmbd_netbios_name()) * 2 + 1 + 4) * 6; - neg_blob = kzalloc(sz, GFP_KERNEL); + neg_blob = kzalloc(sz, KSMBD_DEFAULT_GFP); if (!neg_blob) return -ENOMEM; @@ -1541,12 +1548,12 @@ binding_session: if (conn->dialect >= SMB30_PROT_ID) { chann = lookup_chann_list(sess, conn); if (!chann) { - chann = kmalloc(sizeof(struct channel), GFP_KERNEL); + chann = kmalloc(sizeof(struct channel), KSMBD_DEFAULT_GFP); if (!chann) return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); } } @@ -1622,12 +1629,12 @@ static int krb5_authenticate(struct ksmbd_work *work, if (conn->dialect >= SMB30_PROT_ID) { chann = lookup_chann_list(sess, conn); if (!chann) { - chann = kmalloc(sizeof(struct channel), GFP_KERNEL); + chann = kmalloc(sizeof(struct channel), KSMBD_DEFAULT_GFP); if (!chann) return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); } } @@ -1664,7 +1671,7 @@ int smb2_sess_setup(struct ksmbd_work *work) unsigned int negblob_len, negblob_off; int rc = 0; - ksmbd_debug(SMB, "Received request for session setup\n"); + ksmbd_debug(SMB, "Received smb2 session setup request\n"); WORK_BUFFERS(work, req, rsp); @@ -1699,29 +1706,35 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->dialect != sess->dialect) { rc = -EINVAL; + ksmbd_user_session_put(sess); goto out_err; } if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { rc = -EINVAL; + ksmbd_user_session_put(sess); goto out_err; } if (strncmp(conn->ClientGUID, sess->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { rc = -ENOENT; + ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_IN_PROGRESS) { rc = -EACCES; + ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_EXPIRED) { rc = -EFAULT; + ksmbd_user_session_put(sess); goto out_err; } + ksmbd_user_session_put(sess); if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; @@ -1729,7 +1742,8 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } - if (ksmbd_session_lookup(conn, sess_id)) { + sess = ksmbd_session_lookup(conn, sess_id); + if (!sess) { rc = -EACCES; goto out_err; } @@ -1936,6 +1950,8 @@ int smb2_tree_connect(struct ksmbd_work *work) struct ksmbd_share_config *share = NULL; int rc = -EINVAL; + ksmbd_debug(SMB, "Received smb2 tree connect request\n"); + WORK_BUFFERS(work, req, rsp); treename = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->PathOffset), @@ -2132,9 +2148,9 @@ int smb2_tree_disconnect(struct ksmbd_work *work) struct ksmbd_tree_connect *tcon = work->tcon; int err; - WORK_BUFFERS(work, req, rsp); + ksmbd_debug(SMB, "Received smb2 tree disconnect request\n"); - ksmbd_debug(SMB, "request\n"); + WORK_BUFFERS(work, req, rsp); if (!tcon) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); @@ -2191,15 +2207,15 @@ err_out: int smb2_session_logoff(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; struct smb2_logoff_req *req; struct smb2_logoff_rsp *rsp; - struct ksmbd_session *sess; u64 sess_id; int err; WORK_BUFFERS(work, req, rsp); - 
ksmbd_debug(SMB, "request\n"); + ksmbd_debug(SMB, "Received smb2 session logoff request\n"); ksmbd_conn_lock(conn); if (!ksmbd_conn_good(conn)) { @@ -2215,11 +2231,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_close_session_fds(work); ksmbd_conn_wait_idle(conn); - /* - * Re-lookup session to validate if session is deleted - * while waiting request complete - */ - sess = ksmbd_session_lookup_all(conn, sess_id); if (ksmbd_tree_conn_session_logoff(sess)) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; @@ -2228,7 +2239,9 @@ int smb2_session_logoff(struct ksmbd_work *work) } ksmbd_destroy_file_table(&sess->file_table); + down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; + up_write(&conn->session_lock); ksmbd_free_user(sess->user); sess->user = NULL; @@ -2340,7 +2353,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, le16_to_cpu(eabuf->EaValueLength)) return -EINVAL; - attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL); + attr_name = kmalloc(XATTR_NAME_MAX + 1, KSMBD_DEFAULT_GFP); if (!attr_name) return -ENOMEM; @@ -2843,6 +2856,8 @@ int smb2_open(struct ksmbd_work *work) __le32 daccess, maximal_access = 0; int iov_len = 0; + ksmbd_debug(SMB, "Received smb2 create request\n"); + WORK_BUFFERS(work, req, rsp); if (req->hdr.NextCommand && !work->next_smb2_rcv_hdr_off && @@ -2891,7 +2906,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out2; } } else { - name = kstrdup("", GFP_KERNEL); + name = kstrdup("", KSMBD_DEFAULT_GFP); if (!name) { rc = -ENOMEM; goto err_out2; @@ -3332,7 +3347,7 @@ int smb2_open(struct ksmbd_work *work) sizeof(struct smb_sid) * 3 + sizeof(struct smb_acl) + sizeof(struct smb_ace) * ace_num * 2, - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!pntsd) { posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); @@ -3979,6 +3994,26 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777); + switch (ksmbd_kstat->kstat->mode & S_IFMT) { + case S_IFDIR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + posix_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); posix_info->DosAttributes = S_ISDIR(ksmbd_kstat->kstat->mode) ? 
@@ -4218,6 +4253,7 @@ static bool __query_dir(struct dir_context *ctx, const char *name, int namlen, /* dot and dotdot entries are already reserved */ if (!strcmp(".", name) || !strcmp("..", name)) return true; + d_info->num_scan++; if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) return true; if (!match_pattern(name, namlen, priv->search_pattern)) @@ -4290,6 +4326,8 @@ int smb2_query_dir(struct ksmbd_work *work) int buffer_sz; struct smb2_query_dir_private query_dir_private = {NULL, }; + ksmbd_debug(SMB, "Received smb2 query directory request\n"); + WORK_BUFFERS(work, req, rsp); if (ksmbd_override_fsids(work)) { @@ -4378,9 +4416,18 @@ int smb2_query_dir(struct ksmbd_work *work) query_dir_private.info_level = req->FileInformationClass; dir_fp->readdir_data.private = &query_dir_private; set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); - +again: + d_info.num_scan = 0; rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); /* + * num_entry can be 0 if the directory iteration stops before reaching + * the end of the directory and no file is matched with the search + * pattern. + */ + if (rc >= 0 && !d_info.num_entry && d_info.num_scan && + d_info.out_buf_len > 0) + goto again; + /* * req->OutputBufferLength is too small to contain even one entry. * In this case, it immediately returns OutputBufferLength 0 to client. */ @@ -4940,7 +4987,7 @@ static int get_file_stream_info(struct ksmbd_work *work, /* plus : size */ streamlen += 1; - stream_buf = kmalloc(streamlen + 1, GFP_KERNEL); + stream_buf = kmalloc(streamlen + 1, KSMBD_DEFAULT_GFP); if (!stream_buf) break; @@ -5157,6 +5204,26 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); + switch (stat.mode & S_IFMT) { + case S_IFDIR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_DIR << POSIX_FILETYPE_SHIFT); + break; + case S_IFLNK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SYMLINK << POSIX_FILETYPE_SHIFT); + break; + case S_IFCHR: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_CHARDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFBLK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_BLKDEV << POSIX_FILETYPE_SHIFT); + break; + case S_IFIFO: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_FIFO << POSIX_FILETYPE_SHIFT); + break; + case S_IFSOCK: + file_info->Mode |= cpu_to_le32(POSIX_TYPE_SOCKET << POSIX_FILETYPE_SHIFT); + } + file_info->DeviceId = cpu_to_le32(stat.rdev); /* @@ -5596,9 +5663,9 @@ int smb2_query_info(struct ksmbd_work *work) struct smb2_query_info_rsp *rsp; int rc = 0; - WORK_BUFFERS(work, req, rsp); + ksmbd_debug(SMB, "Received request smb2 query info request\n"); - ksmbd_debug(SMB, "GOT query info request\n"); + WORK_BUFFERS(work, req, rsp); if (ksmbd_override_fsids(work)) { rc = -ENOMEM; @@ -5703,6 +5770,8 @@ int smb2_close(struct ksmbd_work *work) u64 time; int err = 0; + ksmbd_debug(SMB, "Received smb2 close request\n"); + WORK_BUFFERS(work, req, rsp); if (test_share_config_flag(work->tcon->share_conf, @@ -5819,6 +5888,8 @@ int smb2_echo(struct ksmbd_work *work) { struct smb2_echo_rsp *rsp = smb2_get_msg(work->response_buf); + ksmbd_debug(SMB, "Received smb2 echo request\n"); + if (work->next_smb2_rcv_hdr_off) rsp = ksmbd_resp_buf_next(work); @@ -5915,7 +5986,7 @@ static int smb2_create_link(struct ksmbd_work *work, return -EINVAL; ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n"); - pathname = kmalloc(PATH_MAX, GFP_KERNEL); + pathname = 
kmalloc(PATH_MAX, KSMBD_DEFAULT_GFP); if (!pathname) return -ENOMEM; @@ -6000,15 +6071,13 @@ static int set_file_basic_info(struct ksmbd_file *fp, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - attrs.ia_valid |= ATTR_CTIME; if (file_info->ChangeTime) - attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - else - attrs.ia_ctime = inode_get_ctime(inode); + inode_set_ctime_to_ts(inode, + ksmbd_NTtimeToUnix(file_info->ChangeTime)); if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); - attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME); } if (file_info->Attributes) { @@ -6050,8 +6119,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); - inode_set_ctime_to_ts(inode, attrs.ia_ctime); - attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); } @@ -6359,7 +6426,7 @@ int smb2_set_info(struct ksmbd_work *work) int rc = 0; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; - ksmbd_debug(SMB, "Received set info request\n"); + ksmbd_debug(SMB, "Received smb2 set info request\n"); if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); @@ -6479,7 +6546,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) } aux_payload_buf = - kvmalloc(rpc_resp->payload_sz, GFP_KERNEL); + kvmalloc(rpc_resp->payload_sz, KSMBD_DEFAULT_GFP); if (!aux_payload_buf) { err = -ENOMEM; goto out; @@ -6585,6 +6652,8 @@ int smb2_read(struct ksmbd_work *work) unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; void *aux_payload_buf; + ksmbd_debug(SMB, "Received smb2 read request\n"); + if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { ksmbd_debug(SMB, "IPC pipe read request\n"); @@ -6645,6 +6714,10 @@ int smb2_read(struct ksmbd_work *work) } offset = le64_to_cpu(req->Offset); + if (offset < 0) { + err = -EINVAL; + goto out; + } length = le32_to_cpu(req->Length); mincount = le32_to_cpu(req->MinimumCount); @@ -6658,7 +6731,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", fp->filp, offset, length); - aux_payload_buf = kvzalloc(length, GFP_KERNEL); + aux_payload_buf = kvzalloc(ALIGN(length, 8), KSMBD_DEFAULT_GFP); if (!aux_payload_buf) { err = -ENOMEM; goto out; @@ -6810,7 +6883,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, int ret; ssize_t nbytes; - data_buf = kvzalloc(length, GFP_KERNEL); + data_buf = kvzalloc(length, KSMBD_DEFAULT_GFP); if (!data_buf) return -ENOMEM; @@ -6850,6 +6923,8 @@ int smb2_write(struct ksmbd_work *work) int err = 0; unsigned int max_write_size = work->conn->vals->max_write_size; + ksmbd_debug(SMB, "Received smb2 write request\n"); + WORK_BUFFERS(work, req, rsp); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -6858,6 +6933,8 @@ int smb2_write(struct ksmbd_work *work) } offset = le64_to_cpu(req->Offset); + if (offset < 0) + return -EINVAL; length = le32_to_cpu(req->Length); if (req->Channel == SMB2_CHANNEL_RDMA_V1 || @@ -6988,7 +7065,7 @@ int smb2_flush(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", req->VolatileFileId); + ksmbd_debug(SMB, "Received smb2 flush request(fid : %llu)\n", req->VolatileFileId); err = ksmbd_vfs_fsync(work, req->VolatileFileId, req->PersistentFileId); if (err) @@ -7139,7 +7216,7 @@ static struct ksmbd_lock *smb2_lock_init(struct file_lock *flock, { struct ksmbd_lock 
*lock; - lock = kzalloc(sizeof(struct ksmbd_lock), GFP_KERNEL); + lock = kzalloc(sizeof(struct ksmbd_lock), KSMBD_DEFAULT_GFP); if (!lock) return NULL; @@ -7200,7 +7277,7 @@ int smb2_lock(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - ksmbd_debug(SMB, "Received lock request\n"); + ksmbd_debug(SMB, "Received smb2 lock request\n"); fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", req->VolatileFileId); @@ -7407,7 +7484,7 @@ skip: "would have to wait for getting lock\n"); list_add(&smb_lock->llist, &rollback_list); - argv = kmalloc(sizeof(void *), GFP_KERNEL); + argv = kmalloc(sizeof(void *), KSMBD_DEFAULT_GFP); if (!argv) { err = -ENOMEM; goto out; @@ -7967,6 +8044,8 @@ int smb2_ioctl(struct ksmbd_work *work) int ret = 0; char *buffer; + ksmbd_debug(SMB, "Received smb2 ioctl request\n"); + if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); rsp = ksmbd_resp_buf_next(work); @@ -8593,6 +8672,8 @@ int smb2_oplock_break(struct ksmbd_work *work) struct smb2_oplock_break *req; struct smb2_oplock_break *rsp; + ksmbd_debug(SMB, "Received smb2 oplock break acknowledgment request\n"); + WORK_BUFFERS(work, req, rsp); switch (le16_to_cpu(req->StructureSize)) { @@ -8623,6 +8704,8 @@ int smb2_notify(struct ksmbd_work *work) struct smb2_change_notify_req *req; struct smb2_change_notify_rsp *rsp; + ksmbd_debug(SMB, "Received smb2 notify\n"); + WORK_BUFFERS(work, req, rsp); if (work->next_smb2_rcv_hdr_off && req->hdr.NextCommand) { @@ -8901,7 +8984,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work) int rc = -ENOMEM; void *tr_buf; - tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); + tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, KSMBD_DEFAULT_GFP); if (!tr_buf) return rc; @@ -8950,6 +9033,7 @@ int smb3_decrypt_req(struct ksmbd_work *work) le64_to_cpu(tr_hdr->SessionId)); return -ECONNABORTED; } + ksmbd_user_session_put(sess); iov[0].iov_base = buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 649dacf7e8c4..17a0b18a8406 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -502,4 +502,14 @@ static inline void *smb2_get_msg(void *buf) return buf + 4; } +#define POSIX_TYPE_FILE 0 +#define POSIX_TYPE_DIR 1 +#define POSIX_TYPE_SYMLINK 2 +#define POSIX_TYPE_CHARDEV 3 +#define POSIX_TYPE_BLKDEV 4 +#define POSIX_TYPE_FIFO 5 +#define POSIX_TYPE_SOCKET 6 + +#define POSIX_FILETYPE_SHIFT 12 + #endif /* _SMB2PDU_H */ diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 5b8d75e78ffb..425c756bcfb8 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -18,8 +18,8 @@ #include "mgmt/share_config.h" /*for shortname implementation */ -static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; -#define MANGLE_BASE (sizeof(basechars) / sizeof(char) - 1) +static const char *basechars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; +#define MANGLE_BASE (strlen(basechars) - 1) #define MAGIC_CHAR '~' #define PERIOD '.' 
#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) @@ -358,7 +358,7 @@ static int smb1_check_user_session(struct ksmbd_work *work) static int smb1_allocate_rsp_buf(struct ksmbd_work *work) { work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, - GFP_KERNEL); + KSMBD_DEFAULT_GFP); work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; if (!work->response_buf) { @@ -388,6 +388,10 @@ static struct smb_version_ops smb1_server_ops = { .set_rsp_status = set_smb1_rsp_status, }; +static struct smb_version_values smb1_server_values = { + .max_credits = SMB2_MAX_CREDITS, +}; + static int smb1_negotiate(struct ksmbd_work *work) { return ksmbd_smb_negotiate_common(work, SMB_COM_NEGOTIATE); @@ -399,18 +403,18 @@ static struct smb_version_cmds smb1_server_cmds[1] = { static int init_smb1_server(struct ksmbd_conn *conn) { + conn->vals = &smb1_server_values; conn->ops = &smb1_server_ops; conn->cmds = smb1_server_cmds; conn->max_cmds = ARRAY_SIZE(smb1_server_cmds); return 0; } -int ksmbd_init_smb_server(struct ksmbd_work *work) +int ksmbd_init_smb_server(struct ksmbd_conn *conn) { - struct ksmbd_conn *conn = work->conn; __le32 proto; - proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol; + proto = *(__le32 *)((struct smb_hdr *)conn->request_buf)->Protocol; if (conn->need_neg == false) { if (proto == SMB1_PROTO_NUMBER) return -EINVAL; @@ -572,7 +576,7 @@ static int smb_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Unsupported SMB1 protocol\n"); - if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp, + if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp + 4, sizeof(struct smb_negotiate_rsp) - 4)) return -ENOMEM; @@ -736,13 +740,15 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; + struct ksmbd_user *user = sess->user; struct cred *cred; struct group_info *gi; unsigned int uid; unsigned int gid; + int i; - uid = user_uid(sess->user); - gid = user_gid(sess->user); + uid = user_uid(user); + gid = user_gid(user); if (share->force_uid != KSMBD_SHARE_INVALID_UID) uid = share->force_uid; if (share->force_gid != KSMBD_SHARE_INVALID_GID) @@ -755,11 +761,18 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, gid); - gi = groups_alloc(0); + gi = groups_alloc(user->ngroups); if (!gi) { abort_creds(cred); return -ENOMEM; } + + for (i = 0; i < user->ngroups; i++) + gi->gid[i] = make_kgid(&init_user_ns, user->sgid[i]); + + if (user->ngroups) + groups_sort(gi); + set_groups(cred, gi); put_group_info(gi); @@ -768,10 +781,6 @@ int __ksmbd_override_fsids(struct ksmbd_work *work, WARN_ON(work->saved_cred); work->saved_cred = override_creds(cred); - if (!work->saved_cred) { - abort_creds(cred); - return -EINVAL; - } return 0; } @@ -783,13 +792,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; - WARN_ON(!work->saved_cred); - cred = current_cred(); - revert_creds(work->saved_cred); - put_cred(cred); + cred = revert_creds(work->saved_cred); work->saved_cred = NULL; + put_cred(cred); } __le32 smb_map_generic_desired_access(__le32 daccess) diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index cc1d6dfe29d5..a3d8a905b07e 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn); int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); -int 
ksmbd_init_smb_server(struct ksmbd_work *work); +int ksmbd_init_smb_server(struct ksmbd_conn *conn); struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 1c9775f1efa5..d39d3e553366 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -345,10 +345,10 @@ int init_acl_state(struct posix_acl_state *state, int cnt) */ alloc = sizeof(struct posix_ace_state_array) + cnt * sizeof(struct posix_user_ace_state); - state->users = kzalloc(alloc, GFP_KERNEL); + state->users = kzalloc(alloc, KSMBD_DEFAULT_GFP); if (!state->users) return -ENOMEM; - state->groups = kzalloc(alloc, GFP_KERNEL); + state->groups = kzalloc(alloc, KSMBD_DEFAULT_GFP); if (!state->groups) { kfree(state->users); return -ENOMEM; @@ -410,7 +410,7 @@ static void parse_dacl(struct mnt_idmap *idmap, return; } - ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), KSMBD_DEFAULT_GFP); if (!ppace) { free_acl_state(&default_acl_state); free_acl_state(&acl_state); @@ -553,7 +553,7 @@ static void parse_dacl(struct mnt_idmap *idmap, if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { fattr->cf_acls = posix_acl_alloc(acl_state.users->n + - acl_state.groups->n + 4, GFP_KERNEL); + acl_state.groups->n + 4, KSMBD_DEFAULT_GFP); if (fattr->cf_acls) { cf_pace = fattr->cf_acls->a_entries; posix_state_to_acl(&acl_state, cf_pace); @@ -567,7 +567,7 @@ static void parse_dacl(struct mnt_idmap *idmap, if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { fattr->cf_dacls = posix_acl_alloc(default_acl_state.users->n + - default_acl_state.groups->n + 4, GFP_KERNEL); + default_acl_state.groups->n + 4, KSMBD_DEFAULT_GFP); if (fattr->cf_dacls) { cf_pdace = fattr->cf_dacls->a_entries; posix_state_to_acl(&default_acl_state, cf_pdace); @@ -595,7 +595,7 @@ static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, for (i = 0; i < fattr->cf_acls->a_count; i++, pace++) { int flags = 0; - sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + sid = kmalloc(sizeof(struct smb_sid), KSMBD_DEFAULT_GFP); if (!sid) break; @@ -662,7 +662,7 @@ posix_default_acl: pace = fattr->cf_dacls->a_entries; for (i = 0; i < fattr->cf_dacls->a_count; i++, pace++) { - sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + sid = kmalloc(sizeof(struct smb_sid), KSMBD_DEFAULT_GFP); if (!sid) break; @@ -906,7 +906,7 @@ int build_sec_desc(struct mnt_idmap *idmap, gid_t gid; unsigned int sid_type = SIDOWNER; - nowner_sid_ptr = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + nowner_sid_ptr = kmalloc(sizeof(struct smb_sid), KSMBD_DEFAULT_GFP); if (!nowner_sid_ptr) return -ENOMEM; @@ -915,7 +915,7 @@ int build_sec_desc(struct mnt_idmap *idmap, sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, nowner_sid_ptr); - ngroup_sid_ptr = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + ngroup_sid_ptr = kmalloc(sizeof(struct smb_sid), KSMBD_DEFAULT_GFP); if (!ngroup_sid_ptr) { kfree(nowner_sid_ptr); return -ENOMEM; @@ -1032,7 +1032,8 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, goto free_parent_pntsd; } - aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, GFP_KERNEL); + aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, + KSMBD_DEFAULT_GFP); if (!aces_base) { rc = -ENOMEM; goto free_parent_pntsd; @@ -1126,7 +1127,7 @@ pass: pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size + pgroup_sid_size + sizeof(struct smb_acl) + nt_size; - pntsd = kzalloc(pntsd_alloc_size, GFP_KERNEL); + pntsd = kzalloc(pntsd_alloc_size, 
KSMBD_DEFAULT_GFP); if (!pntsd) { rc = -ENOMEM; goto free_aces_base; diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 8752ac82c557..befaf42b84cc 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -120,6 +120,12 @@ static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { }, + [KSMBD_EVENT_LOGIN_REQUEST_EXT] = { + .len = sizeof(struct ksmbd_login_request), + }, + [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = { + .len = sizeof(struct ksmbd_login_response_ext), + }, }; static struct genl_ops ksmbd_genl_ops[] = { @@ -187,6 +193,14 @@ static struct genl_ops ksmbd_genl_ops[] = { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, .doit = handle_generic_event, }, + { + .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT, + .doit = handle_generic_event, + }, }; static struct genl_family ksmbd_genl_family = { @@ -198,7 +212,7 @@ static struct genl_family ksmbd_genl_family = { .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), - .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1, + .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1, }; static void ksmbd_nl_init_fixup(void) @@ -230,7 +244,7 @@ static struct ksmbd_ipc_msg *ipc_msg_alloc(size_t sz) struct ksmbd_ipc_msg *msg; size_t msg_sz = sz + sizeof(struct ksmbd_ipc_msg); - msg = kvzalloc(msg_sz, GFP_KERNEL); + msg = kvzalloc(msg_sz, KSMBD_DEFAULT_GFP); if (msg) msg->sz = sz; return msg; @@ -269,7 +283,7 @@ static int handle_response(int type, void *payload, size_t sz) entry->type + 1, type); } - entry->response = kvzalloc(sz, GFP_KERNEL); + entry->response = kvzalloc(sz, KSMBD_DEFAULT_GFP); if (!entry->response) { ret = -ENOMEM; break; @@ -305,8 +319,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); - if (req->smb2_max_credits) + if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); + server_conf.max_inflight_req = + req->smb2_max_credits; + } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); @@ -430,7 +447,7 @@ static int ipc_msg_send(struct ksmbd_ipc_msg *msg) if (!ksmbd_tools_pid) return ret; - skb = genlmsg_new(msg->sz, GFP_KERNEL); + skb = genlmsg_new(msg->sz, KSMBD_DEFAULT_GFP); if (!skb) return -ENOMEM; @@ -459,16 +476,24 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) { unsigned int msg_sz = entry->msg_sz; - if (entry->type == KSMBD_EVENT_RPC_REQUEST) { + switch (entry->type) { + case KSMBD_EVENT_RPC_REQUEST: + { struct ksmbd_rpc_command *resp = entry->response; msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; - } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) { + break; + } + case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST: + { struct ksmbd_spnego_authen_response *resp = entry->response; msg_sz = sizeof(struct ksmbd_spnego_authen_response) + resp->session_key_len + resp->spnego_blob_len; - } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) { + break; + } + case KSMBD_EVENT_SHARE_CONFIG_REQUEST: + { struct ksmbd_share_config_response *resp = entry->response; if (resp->payload_sz) { @@ -478,6 +503,17 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry) msg_sz = sizeof(struct ksmbd_share_config_response) + resp->payload_sz; } + break; + } + case KSMBD_EVENT_LOGIN_REQUEST_EXT: + { + struct 
ksmbd_login_response_ext *resp = entry->response; + + if (resp->ngroups) { + msg_sz = sizeof(struct ksmbd_login_response_ext) + + resp->ngroups * sizeof(gid_t); + } + } } return entry->msg_sz != msg_sz ? -EINVAL : 0; @@ -560,6 +596,29 @@ struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) return resp; } +struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_login_request *req; + struct ksmbd_login_response_ext *resp; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT; + req = (struct ksmbd_login_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + struct ksmbd_spnego_authen_response * ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) { diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h index 5e5b90a0c187..d9b6737f8cd0 100644 --- a/fs/smb/server/transport_ipc.h +++ b/fs/smb/server/transport_ipc.h @@ -12,6 +12,8 @@ struct ksmbd_login_response * ksmbd_ipc_login_request(const char *account); +struct ksmbd_login_response_ext * +ksmbd_ipc_login_request_ext(const char *account); struct ksmbd_session; struct ksmbd_share_config; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 17c76713c6d0..c3785a5434f9 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -362,7 +362,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) struct smb_direct_transport *t; struct ksmbd_conn *conn; - t = kzalloc(sizeof(*t), GFP_KERNEL); + t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); if (!t) return NULL; @@ -462,7 +462,7 @@ static struct smb_direct_sendmsg { struct smb_direct_sendmsg *msg; - msg = mempool_alloc(t->sendmsg_mempool, GFP_KERNEL); + msg = mempool_alloc(t->sendmsg_mempool, KSMBD_DEFAULT_GFP); if (!msg) return ERR_PTR(-ENOMEM); msg->transport = t; @@ -1406,7 +1406,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, desc_buf = buf; for (i = 0; i < desc_num; i++) { msg = kzalloc(struct_size(msg, sg_list, SG_CHUNK_SIZE), - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!msg) { ret = -ENOMEM; goto out; @@ -1852,7 +1852,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t) INIT_LIST_HEAD(&t->recvmsg_queue); for (i = 0; i < t->recv_credit_max; i++) { - recvmsg = mempool_alloc(t->recvmsg_mempool, GFP_KERNEL); + recvmsg = mempool_alloc(t->recvmsg_mempool, KSMBD_DEFAULT_GFP); if (!recvmsg) goto err; recvmsg->transport = t; @@ -2144,7 +2144,7 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev) if (!rdma_frwr_is_supported(&ib_dev->attrs)) return 0; - smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); + smb_dev = kzalloc(sizeof(*smb_dev), KSMBD_DEFAULT_GFP); if (!smb_dev) return -ENOMEM; smb_dev->ib_dev = ib_dev; @@ -2283,12 +2283,14 @@ out: ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; + rdma_capable = rdma_frwr_is_supported(&ibdev->attrs); ib_device_put(ibdev); } } + ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", + netdev->name, rdma_capable ? 
"true" : "false"); + return rdma_capable; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index aaed9e293b2e..0d9007285e30 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -76,7 +76,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) struct tcp_transport *t; struct ksmbd_conn *conn; - t = kzalloc(sizeof(*t), GFP_KERNEL); + t = kzalloc(sizeof(*t), KSMBD_DEFAULT_GFP); if (!t) return NULL; t->sock = client_sk; @@ -151,7 +151,7 @@ static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs return t->iov; /* not big enough -- allocate a new one and release the old */ - new_iov = kmalloc_array(nr_segs, sizeof(*new_iov), GFP_KERNEL); + new_iov = kmalloc_array(nr_segs, sizeof(*new_iov), KSMBD_DEFAULT_GFP); if (new_iov) { kfree(t->iov); t->iov = new_iov; @@ -521,6 +521,8 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, found = 1; if (iface->state != IFACE_STATE_DOWN) break; + ksmbd_debug(CONN, "netdev-up event: netdev(%s) is going up\n", + iface->name); ret = create_socket(iface); if (ret) return NOTIFY_OK; @@ -528,9 +530,11 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, } } if (!found && bind_additional_ifaces) { - iface = alloc_iface(kstrdup(netdev->name, GFP_KERNEL)); + iface = alloc_iface(kstrdup(netdev->name, KSMBD_DEFAULT_GFP)); if (!iface) return NOTIFY_OK; + ksmbd_debug(CONN, "netdev-up event: netdev(%s) is going up\n", + iface->name); ret = create_socket(iface); if (ret) break; @@ -540,6 +544,8 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, list_for_each_entry(iface, &iface_list, entry) { if (!strcmp(iface->name, netdev->name) && iface->state == IFACE_STATE_CONFIGURED) { + ksmbd_debug(CONN, "netdev-down event: netdev(%s) is going down\n", + iface->name); tcp_stop_kthread(iface->ksmbd_kthread); iface->ksmbd_kthread = NULL; mutex_lock(&iface->sock_release_lock); @@ -600,7 +606,7 @@ static struct interface *alloc_iface(char *ifname) if (!ifname) return NULL; - iface = kzalloc(sizeof(struct interface), GFP_KERNEL); + iface = kzalloc(sizeof(struct interface), KSMBD_DEFAULT_GFP); if (!iface) { kfree(ifname); return NULL; @@ -624,7 +630,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) for_each_netdev(&init_net, netdev) { if (netif_is_bridge_port(netdev)) continue; - if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) { + if (!alloc_iface(kstrdup(netdev->name, KSMBD_DEFAULT_GFP))) { rtnl_unlock(); return -ENOMEM; } @@ -635,7 +641,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) } while (ifc_list_sz > 0) { - if (!alloc_iface(kstrdup(ifc_list, GFP_KERNEL))) + if (!alloc_iface(kstrdup(ifc_list, KSMBD_DEFAULT_GFP))) return -ENOMEM; sz = strlen(ifc_list); diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c index 217106ff7b82..85e6791745ec 100644 --- a/fs/smb/server/unicode.c +++ b/fs/smb/server/unicode.c @@ -297,7 +297,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen, if (is_unicode) { len = smb_utf16_bytes((__le16 *)src, maxlen, codepage); len += nls_nullsize(codepage); - dst = kmalloc(len, GFP_KERNEL); + dst = kmalloc(len, KSMBD_DEFAULT_GFP); if (!dst) return ERR_PTR(-ENOMEM); ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage, @@ -309,7 +309,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen, } else { len = strnlen(src, maxlen); len++; - dst = kmalloc(len, GFP_KERNEL); + dst = kmalloc(len, 
KSMBD_DEFAULT_GFP); if (!dst) return ERR_PTR(-ENOMEM); strscpy(dst, src, len); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 7cbd580120d1..40f08eac519c 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -444,7 +444,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, } if (v_len < size) { - wbuf = kvzalloc(size, GFP_KERNEL); + wbuf = kvzalloc(size, KSMBD_DEFAULT_GFP); if (!wbuf) { err = -ENOMEM; goto out; @@ -865,7 +865,7 @@ ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) if (size <= 0) return size; - vlist = kvzalloc(size, GFP_KERNEL); + vlist = kvzalloc(size, KSMBD_DEFAULT_GFP); if (!vlist) return -ENOMEM; @@ -907,7 +907,7 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, if (xattr_len < 0) return xattr_len; - buf = kmalloc(xattr_len + 1, GFP_KERNEL); + buf = kmalloc(xattr_len + 1, KSMBD_DEFAULT_GFP); if (!buf) return -ENOMEM; @@ -1264,6 +1264,8 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, filepath, flags, path); + if (!is_last) + next[0] = '/'; if (err) goto out2; else if (is_last) @@ -1271,7 +1273,6 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, path_put(parent_path); *parent_path = *path; - next[0] = '/'; remain_len -= filename_len + 1; } @@ -1411,7 +1412,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *id smb_acl = kzalloc(sizeof(struct xattr_smb_acl) + sizeof(struct xattr_acl_entry) * posix_acls->a_count, - GFP_KERNEL); + KSMBD_DEFAULT_GFP); if (!smb_acl) goto out; @@ -1767,7 +1768,7 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, else type = ":$DATA"; - buf = kasprintf(GFP_KERNEL, "%s%s%s", + buf = kasprintf(KSMBD_DEFAULT_GFP, "%s%s%s", XATTR_NAME_STREAM, stream_name, type); if (!buf) return -ENOMEM; @@ -1896,7 +1897,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, acl_state.group.allow; acl_state.mask.allow = 0x07; - acls = posix_acl_alloc(6, GFP_KERNEL); + acls = posix_acl_alloc(6, KSMBD_DEFAULT_GFP); if (!acls) { free_acl_state(&acl_state); return -ENOMEM; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index cb76f4b5bafe..06903024a2d8 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -43,6 +43,7 @@ struct ksmbd_dir_info { char *rptr; int name_len; int out_buf_len; + int num_scan; int num_entry; int data_count; int last_entry_offset; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index a19f4e563c7e..8d1f30dcba7e 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -188,7 +188,7 @@ static struct ksmbd_inode *ksmbd_inode_get(struct ksmbd_file *fp) if (ci) return ci; - ci = kmalloc(sizeof(struct ksmbd_inode), GFP_KERNEL); + ci = kmalloc(sizeof(struct ksmbd_inode), KSMBD_DEFAULT_GFP); if (!ci) return NULL; @@ -577,7 +577,7 @@ static int __open_id(struct ksmbd_file_table *ft, struct ksmbd_file *fp, return -EMFILE; } - idr_preload(GFP_KERNEL); + idr_preload(KSMBD_DEFAULT_GFP); write_lock(&ft->lock); ret = idr_alloc_cyclic(ft->idr, fp, 0, INT_MAX - 1, GFP_NOWAIT); if (ret >= 0) { @@ -605,7 +605,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) struct ksmbd_file *fp; int ret; - fp = kmem_cache_zalloc(filp_cache, GFP_KERNEL); + fp = kmem_cache_zalloc(filp_cache, KSMBD_DEFAULT_GFP); if (!fp) { pr_err("Failed to allocate memory\n"); return ERR_PTR(-ENOMEM); @@ -923,7 +923,7 @@ int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share, char *pathname, *ab_pathname; int ret = 0; - 
pathname = kmalloc(PATH_MAX, GFP_KERNEL); + pathname = kmalloc(PATH_MAX, KSMBD_DEFAULT_GFP); if (!pathname) return -EACCES; @@ -983,7 +983,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) int ksmbd_init_file_table(struct ksmbd_file_table *ft) { - ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL); + ft->idr = kzalloc(sizeof(struct idr), KSMBD_DEFAULT_GFP); if (!ft->idr) return -ENOMEM; diff --git a/fs/splice.c b/fs/splice.c index 06232d7e505f..2898fa1e9e63 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1564,21 +1564,6 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, return ret; } -static int vmsplice_type(struct fd f, int *type) -{ - if (!fd_file(f)) - return -EBADF; - if (fd_file(f)->f_mode & FMODE_WRITE) { - *type = ITER_SOURCE; - } else if (fd_file(f)->f_mode & FMODE_READ) { - *type = ITER_DEST; - } else { - fdput(f); - return -EBADF; - } - return 0; -} - /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple @@ -1602,21 +1587,25 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; - struct fd f; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; - f = fdget(fd); - error = vmsplice_type(f, &type); - if (error) - return error; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + if (fd_file(f)->f_mode & FMODE_WRITE) + type = ITER_SOURCE; + else if (fd_file(f)->f_mode & FMODE_READ) + type = ITER_DEST; + else + return -EBADF; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) - goto out_fdput; + return error; if (!iov_iter_count(&iter)) error = 0; @@ -1626,8 +1615,6 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, error = vmsplice_to_user(fd_file(f), &iter, flags); kfree(iov); -out_fdput: - fdput(f); return error; } @@ -1635,27 +1622,22 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { - struct fd in, out; - ssize_t error; - if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; - error = -EBADF; - in = fdget(fd_in); - if (fd_file(in)) { - out = fdget(fd_out); - if (fd_file(out)) { - error = __do_splice(fd_file(in), off_in, fd_file(out), off_out, + CLASS(fd, in)(fd_in); + if (fd_empty(in)) + return -EBADF; + + CLASS(fd, out)(fd_out); + if (fd_empty(out)) + return -EBADF; + + return __do_splice(fd_file(in), off_in, fd_file(out), off_out, len, flags); - fdput(out); - } - fdput(in); - } - return error; } /* @@ -2005,25 +1987,19 @@ ssize_t do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct fd in, out; - ssize_t error; - if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; - error = -EBADF; - in = fdget(fdin); - if (fd_file(in)) { - out = fdget(fdout); - if (fd_file(out)) { - error = do_tee(fd_file(in), fd_file(out), len, flags); - fdput(out); - } - fdput(in); - } + CLASS(fd, in)(fdin); + if (fd_empty(in)) + return -EBADF; - return error; + CLASS(fd, out)(fdout); + if (fd_empty(out)) + return -EBADF; + + return do_tee(fd_file(in), fd_file(out), len, flags); } diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 22251743fadf..d19d4db74af8 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,7 
+30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; - int i, n, pages, bytes, res = -ENOMEM; + loff_t index; + int i, pages, bytes, res = -ENOMEM; struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == folio->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); + for (i = 0, index = start_index; index <= end_index; index++) { + page[i] = (index == folio->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, index); if (page[i] == NULL) continue; diff --git a/fs/stat.c b/fs/stat.c index 41e598376d7e..2c0e111a098a 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -23,10 +23,46 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include <trace/events/timestamp.h> + #include "internal.h" #include "mount.h" /** + * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED + * @stat: where to store the resulting values + * @request_mask: STATX_* values requested + * @inode: inode from which to grab the c/mtime + * + * Given @inode, grab the ctime and mtime out if it and store the result + * in @stat. When fetching the value, flag it as QUERIED (if not already) + * so the next write will record a distinct timestamp. + * + * NB: The QUERIED flag is tracked in the ctime, but we set it there even + * if only the mtime was requested, as that ensures that the next mtime + * change will be distinct. 
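The effect of the QUERIED handshake documented above is easiest to see in isolation: the ctime nanoseconds field donates an otherwise-unused high bit (nanoseconds never reach 2^30), the first reader sets that bit atomically when it reports the value, and the next timestamp update sees the bit and knows it must pick a value that differs from the one handed out. A standalone model in plain C11 atomics follows; the bit position and helper name are chosen here for illustration only, and the kernel's atomic_fetch_or() takes its arguments in the opposite order.

    /* Illustrative model only -- not part of the patch. */
    #include <stdatomic.h>
    #include <stdint.h>

    #define I_CTIME_QUERIED (1u << 30)   /* assumed: an unused high bit of tv_nsec */

    static uint32_t report_ctime_nsec(atomic_uint *ctime_nsec)
    {
            uint32_t nsec = atomic_load(ctime_nsec);

            /* First reporter after an update marks the value as observed;
             * atomic_fetch_or() returns the pre-flag value even if the
             * timestamp changed between the load and the RMW. */
            if (!(nsec & I_CTIME_QUERIED))
                    nsec = atomic_fetch_or(ctime_nsec, I_CTIME_QUERIED);

            return nsec & ~I_CTIME_QUERIED;   /* strip the flag before reporting */
    }

A later ctime update that finds the flag set knows the stored value has been observed and must choose a distinct (fine-grained) timestamp.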
+ */ +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) +{ + atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; + + /* If neither time was requested, then don't report them */ + if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { + stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); + return; + } + + stat->mtime = inode_get_mtime(inode); + stat->ctime.tv_sec = inode->i_ctime_sec; + stat->ctime.tv_nsec = (u32)atomic_read(pcn); + if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) + stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); + stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; + trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); +} +EXPORT_SYMBOL(fill_mg_cmtime); + +/** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask @@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + if (is_mgtime(inode)) { + fill_mg_cmtime(stat, request_mask, inode); + } else { + stat->ctime = inode_get_ctime(inode); + stat->mtime = inode_get_mtime(inode); + } + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; @@ -165,7 +207,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, request_mask, - query_flags | AT_GETATTR_NOSEC); + query_flags); generic_fillattr(idmap, request_mask, inode, stat); return 0; @@ -198,9 +240,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat, { int retval; - if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) - return -EPERM; - retval = security_inode_getattr(path); if (retval) return retval; @@ -220,18 +259,13 @@ EXPORT_SYMBOL(vfs_getattr); */ int vfs_fstat(int fd, struct kstat *stat) { - struct fd f; - int error; - - f = fdget_raw(fd); - if (!fd_file(f)) + CLASS(fd_raw, f)(fd); + if (fd_empty(f)) return -EBADF; - error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); - fdput(f); - return error; + return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0); } -int getname_statx_lookup_flags(int flags) +static int statx_lookup_flags(int flags) { int lookup_flags = 0; @@ -239,8 +273,6 @@ int getname_statx_lookup_flags(int flags) lookup_flags |= LOOKUP_FOLLOW; if (!(flags & AT_NO_AUTOMOUNT)) lookup_flags |= LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; return lookup_flags; } @@ -277,7 +309,7 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat, u32 request_mask) { CLASS(fd_raw, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask); } @@ -301,7 +333,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; - unsigned int lookup_flags = getname_statx_lookup_flags(flags); + unsigned int lookup_flags = statx_lookup_flags(flags); int error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | @@ -326,18 +358,11 @@ int vfs_fstatat(int dfd, const char __user *filename, { int ret; int statx_flags = flags | AT_NO_AUTOMOUNT; - struct filename *name; + struct filename *name = getname_maybe_null(filename, flags); - /* - * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) - * - * If 
AT_EMPTY_PATH is set, we expect the common case to be that - * empty path, and avoid doing all the extra pathname work. - */ - if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + if (!name && dfd >= 0) return vfs_fstat(dfd, stat); - name = getname_flags(filename, getname_statx_lookup_flags(statx_flags)); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -700,6 +725,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; + tmp.stx_dio_read_offset_align = stat->dio_read_offset_align; tmp.stx_subvol = stat->subvol; tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; @@ -774,24 +800,11 @@ SYSCALL_DEFINE5(statx, struct statx __user *, buffer) { int ret; - unsigned lflags; - struct filename *name; + struct filename *name = getname_maybe_null(filename, flags); - /* - * Short-circuit handling of NULL and "" paths. - * - * For a NULL path we require and accept only the AT_EMPTY_PATH flag - * (possibly |'d with AT_STATX flags). - * - * However, glibc on 32-bit architectures implements fstatat as statx - * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags. - * Supporting this results in the uglification below. - */ - lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE); - if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename)) + if (!name && dfd >= 0) return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer); - name = getname_flags(filename, getname_statx_lookup_flags(flags)); ret = do_statx(dfd, name, flags, mask, buffer); putname(name); diff --git a/fs/statfs.c b/fs/statfs.c index 9c7bb27e7932..a45ac85e6048 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -114,13 +114,11 @@ retry: int fd_statfs(int fd, struct kstatfs *st) { - struct fd f = fdget_raw(fd); - int error = -EBADF; - if (fd_file(f)) { - error = vfs_statfs(&fd_file(f)->f_path, st); - fdput(f); - } - return error; + CLASS(fd_raw, f)(fd); + + if (fd_empty(f)) + return -EBADF; + return vfs_statfs(&fd_file(f)->f_path, st); } static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) diff --git a/fs/super.c b/fs/super.c index 1db230432960..c9c7223bc2a2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1596,13 +1596,14 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, EXPORT_SYMBOL_GPL(setup_bdev_super); /** - * get_tree_bdev - Get a superblock based on a single block device + * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock + * @flags: GET_TREE_BDEV_* flags */ -int get_tree_bdev(struct fs_context *fc, - int (*fill_super)(struct super_block *, - struct fs_context *)) +int get_tree_bdev_flags(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; @@ -1613,10 +1614,10 @@ int get_tree_bdev(struct fs_context *fc, error = lookup_bdev(fc->source, &dev); if (error) { - errorf(fc, "%s: Can't lookup blockdev", fc->source); + if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) + errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } - fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) @@ -1644,6 +1645,19 @@ int get_tree_bdev(struct fs_context *fc, fc->root = dget(s->s_root); return 0; } 
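With this change get_tree_bdev() becomes a one-line wrapper, and the new get_tree_bdev_flags() lets a filesystem probe for a block device without logging "Can't lookup blockdev" on failure. A hypothetical caller is sketched below; the examplefs_* names and the exact fallback errno checks are illustrative assumptions, not taken from this patch.

    /* Hypothetical use of the new helper -- not part of the patch. */
    static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc);

    static int examplefs_get_tree(struct fs_context *fc)
    {
            int err;

            /* Try fc->source as a block device, but stay quiet if it is not one. */
            err = get_tree_bdev_flags(fc, examplefs_fill_super,
                                      GET_TREE_BDEV_QUIET_LOOKUP);
            if (err == -ENOENT || err == -ENOTBLK)   /* typical "not a bdev" errors */
                    err = get_tree_nodev(fc, examplefs_fill_super);
            return err;
    }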
+EXPORT_SYMBOL_GPL(get_tree_bdev_flags); + +/** + * get_tree_bdev - Get a superblock based on a single block device + * @fc: The filesystem context holding the parameters + * @fill_super: Helper to initialise a new superblock + */ +int get_tree_bdev(struct fs_context *fc, + int (*fill_super)(struct super_block *, + struct fs_context *)) +{ + return get_tree_bdev_flags(fc, fill_super, 0); +} EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) diff --git a/fs/sync.c b/fs/sync.c index 67df255eb189..2955cd4c77a3 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -148,11 +148,11 @@ void emergency_sync(void) */ SYSCALL_DEFINE1(syncfs, int, fd) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct super_block *sb; int ret, ret2; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; sb = fd_file(f)->f_path.dentry->d_sb; @@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err); - fdput(f); return ret ? ret : ret2; } @@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) { - struct fd f = fdget(fd); - int ret = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - ret = vfs_fsync(fd_file(f), datasync); - fdput(f); - } - return ret; + if (fd_empty(f)) + return -EBADF; + + return vfs_fsync(fd_file(f), datasync); } SYSCALL_DEFINE1(fsync, unsigned int, fd) @@ -355,16 +352,12 @@ out: int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, unsigned int flags) { - int ret; - struct fd f; + CLASS(fd, f)(fd); - ret = -EBADF; - f = fdget(fd); - if (fd_file(f)) - ret = sync_file_range(fd_file(f), offset, nbytes, flags); + if (fd_empty(f)) + return -EBADF; - fdput(f); - return ret; + return sync_file_range(fd_file(f), offset, nbytes, flags); } SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d1995e2d6c94..785408861c01 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -91,9 +91,12 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read) + if (!battr->read && !battr->read_new) return -EIO; + if (battr->read_new) + return battr->read_new(of->file, kobj, battr, buf, pos, count); + return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -152,9 +155,12 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write) + if (!battr->write && !battr->write_new) return -EIO; + if (battr->write_new) + return battr->write_new(of->file, kobj, battr, buf, pos, count); + return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -315,7 +321,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, } int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, - const struct bin_attribute *battr, umode_t mode, + const struct bin_attribute *battr, umode_t mode, size_t size, kuid_t uid, kgid_t gid, const void *ns) { const struct attribute *attr = &battr->attr; @@ -323,13 +329,19 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; + if (battr->read && battr->read_new) + return -EINVAL; + + if (battr->write && battr->write_new) + return -EINVAL; + if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if (battr->read && battr->write) + else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) ops = &sysfs_bin_kfops_rw; - else if (battr->read) + else if (battr->read || battr->read_new) ops = 
&sysfs_bin_kfops_ro; - else if (battr->write) + else if (battr->write || battr->write_new) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; @@ -340,7 +352,7 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, - battr->size, ops, (void *)attr, ns, key); + size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); @@ -580,8 +592,8 @@ int sysfs_create_bin_file(struct kobject *kobj, return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); - return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid, - gid, NULL); + return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, + attr->size, uid, gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index d22ad67a0f32..8b01a7eda5fb 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -87,6 +87,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; + size_t size = (*bin_attr)->size; if (update) kernfs_remove_by_name(parent, @@ -97,6 +98,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (!mode) continue; } + if (grp->bin_size) + size = grp->bin_size(kobj, *bin_attr, i); WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", @@ -104,7 +107,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, - mode, uid, gid, + mode, size, uid, gid, NULL); if (error) break; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3f28c9af5756..8e012f25e1c0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -31,7 +31,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, const struct attribute *attr, umode_t amode, kuid_t uid, kgid_t gid, const void *ns); int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, - const struct bin_attribute *battr, umode_t mode, + const struct bin_attribute *battr, umode_t mode, size_t size, kuid_t uid, kgid_t gid, const void *ns); /* diff --git a/fs/timerfd.c b/fs/timerfd.c index 137523e0bb21..9f7eb451a60f 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -79,13 +79,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; } -static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, - ktime_t now) +static void timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); - return ALARMTIMER_NORESTART; } /* @@ -394,19 +392,6 @@ static const struct file_operations timerfd_fops = { .unlocked_ioctl = timerfd_ioctl, }; -static int timerfd_fget(int fd, struct fd *p) -{ - struct fd f = fdget(fd); - if (!fd_file(f)) - return -EBADF; - if (fd_file(f)->f_op != &timerfd_fops) { - fdput(f); - return -EINVAL; - } - *p = f; - return 0; -} - SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; @@ -471,7 +456,6 @@ static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { - struct fd f; struct timerfd_ctx *ctx; int ret; @@ -479,15 +463,17 @@ static int do_timerfd_settime(int ufd, int flags, !itimerspec64_valid(new)) return -EINVAL; - ret = timerfd_fget(ufd, &f); - if (ret) - return ret; + 
CLASS(fd, f)(ufd); + if (fd_empty(f)) + return -EBADF; + + if (fd_file(f)->f_op != &timerfd_fops) + return -EINVAL; + ctx = fd_file(f)->private_data; - if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) { - fdput(f); + if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) return -EPERM; - } timerfd_setup_cancel(ctx, flags); @@ -535,17 +521,18 @@ static int do_timerfd_settime(int ufd, int flags, ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); - fdput(f); return ret; } static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { - struct fd f; struct timerfd_ctx *ctx; - int ret = timerfd_fget(ufd, &f); - if (ret) - return ret; + CLASS(fd, f)(ufd); + + if (fd_empty(f)) + return -EBADF; + if (fd_file(f)->f_op != &timerfd_fops) + return -EINVAL; ctx = fd_file(f)->private_data; spin_lock_irq(&ctx->wqh.lock); @@ -567,7 +554,6 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); - fdput(f); return 0; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1748dff58c3b..cfc614c638da 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -392,6 +392,9 @@ static int tracefs_reconfigure(struct fs_context *fc) struct tracefs_fs_info *sb_opts = sb->s_fs_info; struct tracefs_fs_info *new_opts = fc->s_fs_info; + if (!new_opts) + return 0; + sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; @@ -478,14 +481,17 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &tracefs_super_operations; sb->s_d_op = &tracefs_dentry_operations; - tracefs_apply_options(sb, false); - return 0; } static int tracefs_get_tree(struct fs_context *fc) { - return get_tree_single(fc, tracefs_fill_super); + int err = get_tree_single(fc, tracefs_fill_super); + + if (err) + return err; + + return tracefs_reconfigure(fc); } static void tracefs_free_fc(struct fs_context *fc) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index d79cabe193c3..2c99349cf537 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -213,12 +213,6 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 4a35f9e8f668..36ba79fbd2ff 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -981,6 +981,13 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { + ubifs_err(c, "Cannot delete inode, it has too much xattrs!"); + err = -EPERM; + ubifs_ro_mode(c, err); + return err; + } + /* * If the inode is being deleted, do not write the attached data. No * need to synchronize the write-buffer either. 
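A recurring conversion in this section (splice, tee, syncfs, fsync, vfs_fstat, fd_statfs and the timerfd helpers above) replaces hand-rolled fdget()/fdput() pairs with the scope-based CLASS(fd, ...) guard: the file reference is dropped automatically when the variable goes out of scope, so early returns are safe and the goto/out unwinding disappears. The pattern in miniature is shown below; do_something() stands in for the real work and is not a kernel function.

    /* Illustrative before/after of the fdget() -> CLASS(fd, ...) conversion. */
    static int do_something(struct file *filp);     /* placeholder for the real work */

    static int do_example_old(unsigned int fd)      /* old style */
    {
            struct fd f = fdget(fd);
            int ret = -EBADF;

            if (fd_file(f)) {
                    ret = do_something(fd_file(f));
                    fdput(f);                        /* must be paired by hand */
            }
            return ret;
    }

    static int do_example_new(unsigned int fd)      /* new style */
    {
            CLASS(fd, f)(fd);                        /* fdput() runs at scope exit */

            if (fd_empty(f))
                    return -EBADF;

            return do_something(fd_file(f));
    }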
@@ -1012,12 +1019,6 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) struct inode *xino; struct ubifs_dent_node *xent, *pxent = NULL; - if (ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { - err = -EPERM; - ubifs_err(c, "Cannot delete inode, it has too much xattrs!"); - goto out_release; - } - lowest_xent_key(c, &key, inode->i_ino); while (1) { xent = ubifs_tnc_next_ent(c, &key, &nm); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 07351fdce722..aa8837e6247c 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -577,7 +577,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, /* Go right */ nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return (void *)nnode; + return ERR_CAST(nnode); /* Go down to level 1 */ while (nnode->level > 1) { @@ -594,7 +594,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, } nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return (void *)nnode; + return ERR_CAST(nnode); } for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index fb957d963ba6..5555dd740889 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -76,7 +76,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = &(*p)->rb_right; else { - ubifs_err(c, "orphaned twice"); + ubifs_err(c, "ino %lu orphaned twice", (unsigned long)inum); spin_unlock(&c->orphan_lock); kfree(orphan); return -EINVAL; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 291583005dd1..f3e3b2068608 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -19,9 +19,9 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/seq_file.h> -#include <linux/mount.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" @@ -773,10 +773,10 @@ static void init_constants_master(struct ubifs_info *c) * necessary to report something for the 'statfs()' call. * * Subtract the LEB reserved for GC, the LEB which is reserved for - * deletions, minimum LEBs for the index, and assume only one journal - * head is available. + * deletions, minimum LEBs for the index, the LEBs which are reserved + * for each journal head. */ - tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt; tmp64 *= (long long)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; @@ -981,177 +981,120 @@ enum { Opt_auth_key, Opt_auth_hash_name, Opt_ignore, - Opt_err, }; -static const match_table_t tokens = { - {Opt_fast_unmount, "fast_unmount"}, - {Opt_norm_unmount, "norm_unmount"}, - {Opt_bulk_read, "bulk_read"}, - {Opt_no_bulk_read, "no_bulk_read"}, - {Opt_chk_data_crc, "chk_data_crc"}, - {Opt_no_chk_data_crc, "no_chk_data_crc"}, - {Opt_override_compr, "compr=%s"}, - {Opt_auth_key, "auth_key=%s"}, - {Opt_auth_hash_name, "auth_hash_name=%s"}, - {Opt_ignore, "ubi=%s"}, - {Opt_ignore, "vol=%s"}, - {Opt_assert, "assert=%s"}, - {Opt_err, NULL}, +static const struct constant_table ubifs_param_compr[] = { + { "none", UBIFS_COMPR_NONE }, + { "lzo", UBIFS_COMPR_LZO }, + { "zlib", UBIFS_COMPR_ZLIB }, + { "zstd", UBIFS_COMPR_ZSTD }, + {} }; -/** - * parse_standard_option - parse a standard mount option. 
- * @option: the option to parse - * - * Normally, standard mount options like "sync" are passed to file-systems as - * flags. However, when a "rootflags=" kernel boot parameter is used, they may - * be present in the options string. This function tries to deal with this - * situation and parse standard options. Returns 0 if the option was not - * recognized, and the corresponding integer flag if it was. - * - * UBIFS is only interested in the "sync" option, so do not check for anything - * else. - */ -static int parse_standard_option(const char *option) -{ +static const struct constant_table ubifs_param_assert[] = { + { "report", ASSACT_REPORT }, + { "read-only", ASSACT_RO }, + { "panic", ASSACT_PANIC }, + {} +}; - pr_notice("UBIFS: parse %s\n", option); - if (!strcmp(option, "sync")) - return SB_SYNCHRONOUS; - return 0; -} +static const struct fs_parameter_spec ubifs_fs_param_spec[] = { + fsparam_flag ("fast_unmount", Opt_fast_unmount), + fsparam_flag ("norm_unmount", Opt_norm_unmount), + fsparam_flag ("bulk_read", Opt_bulk_read), + fsparam_flag ("no_bulk_read", Opt_no_bulk_read), + fsparam_flag ("chk_data_crc", Opt_chk_data_crc), + fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc), + fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr), + fsparam_enum ("assert", Opt_assert, ubifs_param_assert), + fsparam_string ("auth_key", Opt_auth_key), + fsparam_string ("auth_hash_name", Opt_auth_hash_name), + fsparam_string ("ubi", Opt_ignore), + fsparam_string ("vol", Opt_ignore), + {} +}; + +struct ubifs_fs_context { + struct ubifs_mount_opts mount_opts; + char *auth_key_name; + char *auth_hash_name; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int assert_action:2; +}; /** - * ubifs_parse_options - parse mount parameters. - * @c: UBIFS file-system description object - * @options: parameters to parse - * @is_remount: non-zero if this is FS re-mount + * ubifs_parse_param - parse a parameter. + * @fc: the filesystem context + * @param: the parameter to parse * * This function parses UBIFS mount options and returns zero in case success * and a negative error code in case of failure. */ -static int ubifs_parse_options(struct ubifs_info *c, char *options, - int is_remount) +static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - - if (!options) - return 0; + struct ubifs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + int opt; - while ((p = strsep(&options, ","))) { - int token; + opt = fs_parse(fc, ubifs_fs_param_spec, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { + switch (opt) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. * We accept them in order to be backward-compatible. But this * should be removed at some point. 
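For readers used to the match_table_t/match_token() parser that this hunk deletes: under the new mount API each option is declared once in ubifs_fs_param_spec, fs_parse() does the string matching and type conversion, and enum options such as compr= arrive already mapped through ubifs_param_compr, so invalid values are rejected before any filesystem code runs. The flow in miniature, condensed from the handler below with only the compressor case shown:

    /* Condensed illustration of the fs_parse() pattern -- not the full handler. */
    static int example_parse_param(struct fs_context *fc, struct fs_parameter *param)
    {
            struct ubifs_fs_context *ctx = fc->fs_private;
            struct fs_parse_result result;
            int opt;

            opt = fs_parse(fc, ubifs_fs_param_spec, param, &result);
            if (opt < 0)
                    return opt;     /* unknown option or value rejected by the spec */

            switch (opt) {
            case Opt_override_compr:
                    /* "compr=zstd" already validated against ubifs_param_compr */
                    ctx->mount_opts.compr_type = result.uint_32;
                    ctx->mount_opts.override_compr = 1;
                    ctx->default_compr = ctx->mount_opts.compr_type;
                    break;
            }
            return 0;
    }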
*/ - case Opt_fast_unmount: - c->mount_opts.unmount_mode = 2; - break; - case Opt_norm_unmount: - c->mount_opts.unmount_mode = 1; - break; - case Opt_bulk_read: - c->mount_opts.bulk_read = 2; - c->bulk_read = 1; - break; - case Opt_no_bulk_read: - c->mount_opts.bulk_read = 1; - c->bulk_read = 0; - break; - case Opt_chk_data_crc: - c->mount_opts.chk_data_crc = 2; - c->no_chk_data_crc = 0; - break; - case Opt_no_chk_data_crc: - c->mount_opts.chk_data_crc = 1; - c->no_chk_data_crc = 1; - break; - case Opt_override_compr: - { - char *name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "none")) - c->mount_opts.compr_type = UBIFS_COMPR_NONE; - else if (!strcmp(name, "lzo")) - c->mount_opts.compr_type = UBIFS_COMPR_LZO; - else if (!strcmp(name, "zlib")) - c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; - else if (!strcmp(name, "zstd")) - c->mount_opts.compr_type = UBIFS_COMPR_ZSTD; - else { - ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? - kfree(name); - return -EINVAL; - } - kfree(name); - c->mount_opts.override_compr = 1; - c->default_compr = c->mount_opts.compr_type; - break; - } - case Opt_assert: - { - char *act = match_strdup(&args[0]); - - if (!act) - return -ENOMEM; - if (!strcmp(act, "report")) - c->assert_action = ASSACT_REPORT; - else if (!strcmp(act, "read-only")) - c->assert_action = ASSACT_RO; - else if (!strcmp(act, "panic")) - c->assert_action = ASSACT_PANIC; - else { - ubifs_err(c, "unknown assert action \"%s\"", act); - kfree(act); - return -EINVAL; - } - kfree(act); - break; - } - case Opt_auth_key: - if (!is_remount) { - c->auth_key_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; - } - break; - case Opt_auth_hash_name: - if (!is_remount) { - c->auth_hash_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; - } - break; - case Opt_ignore: - break; - default: - { - unsigned long flag; - struct super_block *sb = c->vfs_sb; - - flag = parse_standard_option(p); - if (!flag) { - ubifs_err(c, "unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - sb->s_flags |= flag; - break; + case Opt_fast_unmount: + ctx->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + ctx->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + ctx->mount_opts.bulk_read = 2; + ctx->bulk_read = 1; + break; + case Opt_no_bulk_read: + ctx->mount_opts.bulk_read = 1; + ctx->bulk_read = 0; + break; + case Opt_chk_data_crc: + ctx->mount_opts.chk_data_crc = 2; + ctx->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + ctx->mount_opts.chk_data_crc = 1; + ctx->no_chk_data_crc = 1; + break; + case Opt_override_compr: + ctx->mount_opts.compr_type = result.uint_32; + ctx->mount_opts.override_compr = 1; + ctx->default_compr = ctx->mount_opts.compr_type; + break; + case Opt_assert: + ctx->assert_action = result.uint_32; + break; + case Opt_auth_key: + if (!is_remount) { + kfree(ctx->auth_key_name); + ctx->auth_key_name = param->string; + param->string = NULL; } + break; + case Opt_auth_hash_name: + if (!is_remount) { + kfree(ctx->auth_hash_name); + ctx->auth_hash_name = param->string; + param->string = NULL; } + break; + case Opt_ignore: + break; } return 0; @@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb) mutex_unlock(&c->umount_mutex); } -static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +static int ubifs_reconfigure(struct fs_context *fc) { + struct ubifs_fs_context *ctx = fc->fs_private; + 
struct super_block *sb = fc->root->d_sb; int err; struct ubifs_info *c = sb->s_fs_info; sync_filesystem(sb); - dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags); - err = ubifs_parse_options(c, data, 1); - if (err) { - ubifs_err(c, "invalid or unknown remount parameter"); - return err; - } + /* + * Apply the mount option changes. + * auth_key_name and auth_hash_name are ignored on remount. + */ + c->mount_opts = ctx->mount_opts; + c->bulk_read = ctx->bulk_read; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; - if (c->ro_mount && !(*flags & SB_RDONLY)) { + if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & SB_RDONLY)) { + } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = { .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, - .remount_fs = ubifs_remount_fs, .show_options = ubifs_show_options, .sync_fs = ubifs_sync_fs, }; /** * open_ubi - parse UBI device name string and open the UBI device. - * @name: UBI volume name + * @fc: The filesystem context * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume @@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = { * returns UBI volume description object in case of success and a negative * error code in case of failure. 
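For reference, the source strings accepted by open_ubi() as visible in the code below are: a device node path such as /dev/ubi0_1; "ubiX_Y" for UBI device X, volume Y (for example ubi0_0); "ubiX:NAME" or "ubiX!NAME" for a named volume on device X; and "ubi:NAME" or "ubi!NAME" as shorthand for device 0. Anything else now fails through invalf(fc, "Invalid source name"), so callers using the new mount API can retrieve an explanatory message rather than a bare -EINVAL.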
*/ -static struct ubi_volume_desc *open_ubi(const char *name, int mode) +static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode) { struct ubi_volume_desc *ubi; + const char *name = fc->source; int dev, vol; char *endptr; - if (!name || !*name) - return ERR_PTR(-EINVAL); - /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) @@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') - return ERR_PTR(-EINVAL); + goto invalid_source; /* ubi:NAME method */ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') return ubi_open_volume_nm(0, name + 4, mode); if (!isdigit(name[3])) - return ERR_PTR(-EINVAL); + goto invalid_source; dev = simple_strtoul(name + 3, &endptr, 0); @@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if (*endptr == '_' && isdigit(endptr[1])) { vol = simple_strtoul(endptr + 1, &endptr, 0); if (*endptr != '\0') - return ERR_PTR(-EINVAL); + goto invalid_source; return ubi_open_volume(dev, vol, mode); } @@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') return ubi_open_volume_nm(dev, ++endptr, mode); - return ERR_PTR(-EINVAL); +invalid_source: + return ERR_PTR(invalf(fc, "Invalid source name")); } static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) @@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } -static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c = sb->s_fs_info; + struct ubifs_fs_context *ctx = fc->fs_private; struct inode *root; int err; @@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out; } - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_close; + /* Copy in parsed mount options */ + c->mount_opts = ctx->mount_opts; + c->auth_key_name = ctx->auth_key_name; + c->auth_hash_name = ctx->auth_hash_name; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->bulk_read = ctx->bulk_read; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; + + /* ubifs_info owns auth strings now */ + ctx->auth_key_name = NULL; + ctx->auth_hash_name = NULL; /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. 
For @@ -2249,6 +2206,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) } super_set_uuid(sb, c->uuid, sizeof(c->uuid)); + super_set_sysfs_name_generic(sb, UBIFS_DFS_DIR_NAME, + c->vi.ubi_num, c->vi.vol_id); mutex_unlock(&c->umount_mutex); return 0; @@ -2264,41 +2223,38 @@ out: return err; } -static int sb_test(struct super_block *sb, void *data) +static int sb_test(struct super_block *sb, struct fs_context *fc) { - struct ubifs_info *c1 = data; + struct ubifs_info *c1 = fc->s_fs_info; struct ubifs_info *c = sb->s_fs_info; return c->vi.cdev == c1->vi.cdev; } -static int sb_set(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, - const char *name, void *data) +static int ubifs_get_tree(struct fs_context *fc) { struct ubi_volume_desc *ubi; struct ubifs_info *c; struct super_block *sb; int err; - dbg_gen("name %s, flags %#x", name, flags); + if (!fc->source || !*fc->source) + return invalf(fc, "No source specified"); + + dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags); /* * Get UBI device number and volume ID. Mount it read-only so far * because this might be a new mount point, and UBI allows only one * read-write user at a time. */ - ubi = open_ubi(name, UBI_READONLY); + ubi = open_ubi(fc, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & SB_SILENT)) + err = PTR_ERR(ubi); + if (!(fc->sb_flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); - return ERR_CAST(ubi); + current->pid, fc->source, err); + return err; } c = alloc_ubifs_info(ubi); @@ -2306,10 +2262,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, err = -ENOMEM; goto out_close; } + fc->s_fs_info = c; dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, sb_test, sb_set, flags, c); + sb = sget_fc(fc, sb_test, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); @@ -2321,12 +2278,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & SB_RDONLY) != c1->ro_mount) { + if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); + err = ubifs_fill_super(sb, fc); if (err) goto out_deact; /* We do not support atime */ @@ -2340,13 +2297,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return ERR_PTR(err); + return err; } static void kill_ubifs_super(struct super_block *s) @@ -2356,10 +2314,61 @@ static void kill_ubifs_super(struct super_block *s) kfree(c); } +static void ubifs_free_fc(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->auth_key_name); + kfree(ctx->auth_hash_name); + kfree(ctx); + } +} + +static const struct fs_context_operations ubifs_context_ops = { + .free = ubifs_free_fc, + .parse_param = ubifs_parse_param, + .get_tree = ubifs_get_tree, + .reconfigure = ubifs_reconfigure, +}; + +static int ubifs_init_fs_context(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* Iniitialize for first mount */ + ctx->no_chk_data_crc = 1; + ctx->assert_action = ASSACT_RO; + } else { + struct ubifs_info *c = fc->root->d_sb->s_fs_info; + + /* + * Preserve existing options across remounts. + * auth_key_name and auth_hash_name are not remountable. + */ + ctx->mount_opts = c->mount_opts; + ctx->bulk_read = c->bulk_read; + ctx->no_chk_data_crc = c->no_chk_data_crc; + ctx->default_compr = c->default_compr; + ctx->assert_action = c->assert_action; + } + + fc->ops = &ubifs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .mount = ubifs_mount, + .init_fs_context = ubifs_init_fs_context, + .parameters = ubifs_fs_param_spec, .kill_sb = kill_ubifs_super, }; MODULE_ALIAS_FS("ubifs"); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 45cacdcd4746..33946b518148 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2930,8 +2930,6 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) dbg_tnc("xent '%s', ino %lu", xent->name, (unsigned long)xattr_inum); - ubifs_evict_xattr_inode(c, xattr_inum); - fname_name(&nm) = xent->name; fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index a55e04822d16..7c43e0ccf6d4 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -657,6 +657,8 @@ static int get_znodes_to_commit(struct ubifs_info *c) znode->alt = 0; cnext = find_next_dirty(znode); if (!cnext) { + ubifs_assert(c, !znode->parent); + znode->cparent = NULL; znode->cnext = c->cnext; break; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d69a5a42d693..3375bbe0508c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2040,13 +2040,10 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, #ifdef CONFIG_UBIFS_FS_XATTR extern const struct xattr_handler * const ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); -void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum); int ubifs_purge_xattrs(struct inode *host); #else #define ubifs_listxattr NULL #define ubifs_xattr_handlers NULL -static inline void ubifs_evict_xattr_inode(struct ubifs_info *c, - ino_t xattr_inum) { } static 
inline int ubifs_purge_xattrs(struct inode *host) { return 0; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index f734588b224a..c21a0c2b3e90 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -48,19 +48,6 @@ #include <linux/slab.h> #include <linux/xattr.h> -/* - * Extended attribute type constants. - * - * USER_XATTR: user extended attribute ("user.*") - * TRUSTED_XATTR: trusted extended attribute ("trusted.*) - * SECURITY_XATTR: security extended attribute ("security.*") - */ -enum { - USER_XATTR, - TRUSTED_XATTR, - SECURITY_XATTR, -}; - static const struct inode_operations empty_iops; static const struct file_operations empty_fops; @@ -532,8 +519,6 @@ int ubifs_purge_xattrs(struct inode *host) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); - kfree(pxent); - kfree(xent); goto out_err; } @@ -541,16 +526,12 @@ int ubifs_purge_xattrs(struct inode *host) clear_nlink(xino); err = remove_xattr(c, host, xino, &nm); + iput(xino); if (err) { - kfree(pxent); - kfree(xent); - iput(xino); ubifs_err(c, "cannot remove xattr, error %d", err); goto out_err; } - iput(xino); - kfree(pxent); pxent = xent; key_read(c, &xent->key, &key); @@ -566,32 +547,12 @@ int ubifs_purge_xattrs(struct inode *host) return 0; out_err: + kfree(pxent); + kfree(xent); up_write(&ubifs_inode(host)->xattr_sem); return err; } -/** - * ubifs_evict_xattr_inode - Evict an xattr inode. - * @c: UBIFS file-system description object - * @xattr_inum: xattr inode number - * - * When an inode that hosts xattrs is being removed we have to make sure - * that cached inodes of the xattrs also get removed from the inode cache - * otherwise we'd waste memory. This function looks up an inode from the - * inode cache and clears the link counter such that iput() will evict - * the inode. 
- */ -void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum) -{ - struct inode *inode; - - inode = ilookup(c->vfs_sb, xattr_inum); - if (inode) { - clear_nlink(inode); - iput(inode); - } -} - static int ubifs_xattr_remove(struct inode *host, const char *name) { struct inode *inode; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 78a603129dd5..2cb49b6b0716 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -517,7 +517,11 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); inode->i_size = 0; - inode_dec_link_count(dir); + if (dir->i_nlink >= 3) + inode_dec_link_count(dir); + else + udf_warn(inode->i_sb, "parent dir link count too low (%u)\n", + dir->i_nlink); udf_add_fid_counter(dir->i_sb, true, -1); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); @@ -787,8 +791,18 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, retval = -ENOTEMPTY; if (!empty_dir(new_inode)) goto out_oiter; + retval = -EFSCORRUPTED; + if (new_inode->i_nlink != 2) + goto out_oiter; } + retval = -EFSCORRUPTED; + if (old_dir->i_nlink < 3) + goto out_oiter; is_dir = true; + } else if (new_inode) { + retval = -EFSCORRUPTED; + if (new_inode->i_nlink < 1) + goto out_oiter; } if (is_dir && old_dir != new_dir) { retval = udf_fiiter_find_entry(old_inode, &dotdot_name, diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 53c11be2b2c1..194ed3ab945e 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -33,6 +33,29 @@ static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info * static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[]; static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int); +static void adjust_free_blocks(struct super_block *sb, + struct ufs_cylinder_group *ucg, + struct ufs_cg_private_info *ucpi, + unsigned fragment, int delta) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + + if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) + ufs_clusteracct(sb, ucpi, fragment, delta); + + fs32_add(sb, &ucg->cg_cs.cs_nbfree, delta); + uspi->cs_total.cs_nbfree += delta; + fs32_add(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, delta); + + if (uspi->fs_magic != UFS2_MAGIC) { + unsigned cylno = ufs_cbtocylno(fragment); + + fs16_add(sb, &ubh_cg_blks(ucpi, cylno, + ufs_cbtorpos(fragment)), delta); + fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), delta); + } +} + /* * Free 'count' fragments from fragment number 'fragment' */ @@ -43,7 +66,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; - u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -51,7 +73,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); - if (ufs_fragnum(fragment) + count > uspi->s_fpg) + if (ufs_fragnum(fragment) + count > uspi->s_fpb) ufs_error (sb, "ufs_free_fragments", "internal error"); mutex_lock(&UFS_SB(sb)->s_lock); @@ -94,23 +116,11 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) /* * Trying to reassemble free fragments into block */ - blkno = ufs_fragstoblks (bbase); - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(uspi, ucpi, bbase)) { fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); uspi->cs_total.cs_nffree -= uspi->s_fpb; 
fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); - if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, 1); - fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree++; - fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno (bbase); - - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos(bbase)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + adjust_free_blocks(sb, ucg, ucpi, bbase, 1); } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -139,7 +149,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; - u64 blkno; sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -181,26 +190,12 @@ do_more: } for (i = bit; i < end_bit; i += uspi->s_fpb) { - blkno = ufs_fragstoblks(i); - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(uspi, ucpi, i)) { ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } - ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + ubh_setblock(uspi, ucpi, i); inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); - if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, 1); - - fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree++; - fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); - - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno(i); - - fs16_add(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos(i)), 1); - fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + adjust_free_blocks(sb, ucg, ucpi, i, 1); } ubh_mark_buffer_dirty (USPI_UBH(uspi)); @@ -234,13 +229,13 @@ failed: * situated at the end of file. * * We can come here from ufs_writepage or ufs_prepare_write, - * locked_page is argument of these functions, so we already lock it. + * locked_folio is argument of these functions, so we already lock it. 
*/ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, - sector_t newb, struct page *locked_page) + sector_t newb, struct folio *locked_folio) { - struct folio *folio, *locked_folio = page_folio(locked_page); + struct folio *folio; const unsigned blks_per_page = 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; @@ -337,7 +332,7 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n, u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, u64 goal, unsigned count, int *err, - struct page *locked_page) + struct folio *locked_folio) { struct super_block * sb; struct ufs_sb_private_info * uspi; @@ -417,7 +412,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { ufs_clear_frags(inode, result + oldcount, - newcount - oldcount, locked_page != NULL); + newcount - oldcount, locked_folio != NULL); *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); @@ -441,7 +436,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); read_sequnlock_excl(&UFS_I(inode)->meta_lock); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, - locked_page != NULL); + locked_folio != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; @@ -462,11 +457,11 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, - locked_page != NULL); + locked_folio != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, - uspi->s_sbbase + result, locked_page); + uspi->s_sbbase + result, locked_folio); *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); @@ -698,7 +693,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_cylinder_group * ucg; - u64 result, blkno; + u64 result; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -716,7 +711,7 @@ static u64 ufs_alloccg_block(struct inode *inode, /* * If the requested block is available, use it. 
*/ - if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { + if (ubh_isblockset(uspi, ucpi, goal)) { result = goal; goto gotit; } @@ -729,22 +724,8 @@ norot: gotit: if (!try_add_frags(inode, uspi->s_fpb)) return 0; - blkno = ufs_fragstoblks(result); - ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); - if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) - ufs_clusteracct (sb, ucpi, blkno, -1); - - fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); - uspi->cs_total.cs_nbfree--; - fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); - - if (uspi->fs_magic != UFS2_MAGIC) { - unsigned cylno = ufs_cbtocylno((unsigned)result); - - fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, - ufs_cbtorpos((unsigned)result)), 1); - fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); - } + ubh_clrblock(uspi, ucpi, result); + adjust_free_blocks(sb, ucg, ucpi, result, -1); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -863,12 +844,12 @@ static u64 ufs_bitmap_search(struct super_block *sb, } static void ufs_clusteracct(struct super_block * sb, - struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt) + struct ufs_cg_private_info * ucpi, unsigned frag, int cnt) { - struct ufs_sb_private_info * uspi; + struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; int i, start, end, forw, back; + unsigned blkno = ufs_fragstoblks(frag); - uspi = UFS_SB(sb)->s_uspi; if (uspi->s_contigsumsize <= 0) return; diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c index 1abe5454de47..a2813270c303 100644 --- a/fs/ufs/cylinder.c +++ b/fs/ufs/cylinder.c @@ -26,7 +26,7 @@ * Read cylinder group into cache. The memory space for ufs_cg_private_info * structure is already allocated during ufs_read_super. */ -static void ufs_read_cylinder (struct super_block * sb, +static bool ufs_read_cylinder(struct super_block *sb, unsigned cgno, unsigned bitmap_nr) { struct ufs_sb_info * sbi = UFS_SB(sb); @@ -46,9 +46,11 @@ static void ufs_read_cylinder (struct super_block * sb, * We have already the first fragment of cylinder group block in buffer */ UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; - for (i = 1; i < UCPI_UBH(ucpi)->count; i++) - if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i); + if (!UCPI_UBH(ucpi)->bh[i]) goto failed; + } sbi->s_cgno[bitmap_nr] = cgno; ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx); @@ -67,13 +69,14 @@ static void ufs_read_cylinder (struct super_block * sb, ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); UFSD("EXIT\n"); - return; + return true; failed: for (j = 1; j < i; j++) - brelse (sbi->s_ucg[j]); + brelse(UCPI_UBH(ucpi)->bh[j]); sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno); + return false; } /* @@ -156,15 +159,14 @@ struct ufs_cg_private_info * ufs_load_cylinder ( UFSD("EXIT (FAILED)\n"); return NULL; } - else { - UFSD("EXIT\n"); - return sbi->s_ucpi[cgno]; - } } else { - ufs_read_cylinder (sb, cgno, cgno); - UFSD("EXIT\n"); - return sbi->s_ucpi[cgno]; + if (unlikely(!ufs_read_cylinder (sb, cgno, cgno))) { + UFSD("EXIT (FAILED)\n"); + return NULL; + } } + UFSD("EXIT\n"); + return sbi->s_ucpi[cgno]; } /* * Cylinder group number cg is in cache but it was not last used, @@ -195,7 +197,10 @@ struct ufs_cg_private_info * ufs_load_cylinder ( sbi->s_ucpi[j] = sbi->s_ucpi[j-1]; } 
sbi->s_ucpi[0] = ucpi; - ufs_read_cylinder (sb, cgno, 0); + if (unlikely(!ufs_read_cylinder (sb, cgno, 0))) { + UFSD("EXIT (FAILED)\n"); + return NULL; + } } UFSD("EXIT\n"); return sbi->s_ucpi[0]; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index d6e6a2198971..88d0062cfdb9 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -81,10 +81,9 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) } -/* Releases the page */ -void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct folio *folio, struct inode *inode, - bool update_times) +int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, + bool update_times) { loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); @@ -92,17 +91,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, folio_lock(folio); err = ufs_prepare_chunk(folio, pos, len); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); ufs_commit_chunk(folio, pos, len); - folio_release_kmap(folio, de); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); - ufs_handle_dirsync(dir); + return ufs_handle_dirsync(dir); } static bool ufs_check_folio(struct folio *folio, char *kaddr) @@ -505,8 +506,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (de->d_reclen == 0) { ufs_error(inode->i_sb, __func__, "zero-length directory entry"); - err = -EIO; - goto out; + return -EIO; } pde = de; de = ufs_next_entry(sb, de); @@ -516,18 +516,17 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pos = folio_pos(folio) + from; folio_lock(folio); err = ufs_prepare_chunk(folio, pos, to - from); - BUG_ON(err); + if (unlikely(err)) { + folio_unlock(folio); + return err; + } if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(folio, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); - err = ufs_handle_dirsync(inode); -out: - folio_release_kmap(folio, kaddr); - UFSD("EXIT\n"); - return err; + return ufs_handle_dirsync(inode); } int ufs_make_empty(struct inode * inode, struct inode *dir) diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 6558882a89ef..487ad1fc2de6 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -42,4 +42,5 @@ const struct file_operations ufs_file_operations = { .open = generic_file_open, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, }; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 5331ae7ebf3e..7dc38fdef2ea 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -220,7 +220,7 @@ changed: */ static bool ufs_extend_tail(struct inode *inode, u64 writes_to, - int *err, struct page *locked_page) + int *err, struct folio *locked_folio) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -239,7 +239,7 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), new_size - (lastfrag & uspi->s_fpbmask), err, - locked_page); + locked_folio); return tmp != 0; } @@ -250,12 +250,11 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, * @new_fragment: number of new allocated fragment(s) * @err: we set it if something wrong * @new: we set it if we allocate new 
block - * @locked_page: for ufs_new_fragments() + * @locked_folio: for ufs_new_fragments() */ -static u64 -ufs_inode_getfrag(struct inode *inode, unsigned index, +static u64 ufs_inode_getfrag(struct inode *inode, unsigned index, sector_t new_fragment, int *err, - int *new, struct page *locked_page) + int *new, struct folio *locked_folio) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -264,11 +263,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, unsigned nfrags = uspi->s_fpb; void *p; - /* TODO : to be done for write support - if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - goto ufs2; - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) @@ -288,7 +282,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, nfrags, err, locked_page); + goal, nfrags, err, locked_folio); if (!tmp) { *err = -ENOSPC; @@ -303,21 +297,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, mark_inode_dirty(inode); out: return tmp + uspi->s_sbbase; - - /* This part : To be implemented .... - Required only for writing, not required for READ-ONLY. -ufs2: - - u2_block = ufs_fragstoblks(fragment); - u2_blockoff = ufs_fragnum(fragment); - p = ufsi->i_u1.u2_i_data + block; - goal = 0; - -repeat2: - tmp = fs32_to_cpu(sb, *p); - lastfrag = ufsi->i_lastfrag; - - */ } /** @@ -329,12 +308,11 @@ repeat2: * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() - * @locked_page: see ufs_inode_getfrag() + * @locked_folio: see ufs_inode_getfrag() */ -static u64 -ufs_inode_getblock(struct inode *inode, u64 ind_block, - unsigned index, sector_t new_fragment, int *err, - int *new, struct page *locked_page) +static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block, + unsigned index, sector_t new_fragment, int *err, + int *new, struct folio *locked_folio) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -369,7 +347,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, else goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, - uspi->s_fpb, err, locked_page); + uspi->s_fpb, err, locked_folio); if (!tmp) goto out; @@ -434,14 +412,14 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff unsigned tailfrags = lastfrag & uspi->s_fpbmask; if (tailfrags && fragment >= lastfrag) { if (!ufs_extend_tail(inode, fragment, - &err, bh_result->b_page)) + &err, bh_result->b_folio)) goto out; } } if (depth == 1) { phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, &new, bh_result->b_page); + &err, &new, bh_result->b_folio); } else { int i; phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, @@ -450,7 +428,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff phys64 = ufs_inode_getblock(inode, phys64, offsets[i], fragment, &err, NULL, NULL); phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], - fragment, &err, &new, bh_result->b_page); + fragment, &err, &new, bh_result->b_folio); } out: if (phys64) { @@ -898,91 +876,84 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) +/* + * used only for truncation down to direct blocks. 
+ */ static void ufs_trunc_direct(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned int new_frags, old_frags; + unsigned int old_slot, new_slot; + unsigned int old_tail, new_tail; struct to_free ctx = {.inode = inode}; - unsigned i, tmp; UFSD("ENTER: ino %lu\n", inode->i_ino); - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; + new_frags = DIRECT_FRAGMENT; + // new_frags = first fragment past the new EOF + old_frags = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + // old_frags = first fragment past the old EOF or covered by indirects - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); + if (new_frags >= old_frags) // expanding - nothing to free + goto done; - ufs_free_fragments(inode, tmp + frag1, frag2); + old_tail = ufs_fragnum(old_frags); + old_slot = ufs_fragstoblks(old_frags); + new_tail = ufs_fragnum(new_frags); + new_slot = ufs_fragstoblks(new_frags); -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); + if (old_slot == new_slot) { // old_tail > 0 + void *p = ufs_get_direct_data_ptr(uspi, ufsi, old_slot); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) - continue; - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); + ufs_panic(sb, __func__, "internal error"); + if (!new_tail) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + } + ufs_free_fragments(inode, tmp + new_tail, old_tail - new_tail); + } else { + unsigned int slot = new_slot; - free_data(&ctx, tmp, uspi->s_fpb); - } + if (new_tail) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + ufs_panic(sb, __func__, "internal error"); - free_data(&ctx, 0, 0); + ufs_free_fragments(inode, tmp + new_tail, + uspi->s_fpb - new_tail); + } + while (slot < old_slot) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); - if (frag3 >= frag4) - goto next3; + free_data(&ctx, tmp, uspi->s_fpb); + } - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if 
(!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); + free_data(&ctx, 0, 0); - ufs_free_fragments (inode, tmp, frag4); - next3: + if (old_tail) { + void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot); + u64 tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + ufs_panic(sb, __func__, "internal error"); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + ufs_free_fragments(inode, tmp, old_tail); + } + } +done: UFSD("EXIT: ino %lu\n", inode->i_ino); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index c8390976ab6a..38a024c8cccd 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -210,20 +210,18 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; struct folio *folio; - int err = -ENOENT; + int err; de = ufs_find_entry(dir, &dentry->d_name, &folio); if (!de) - goto out; + return -ENOENT; err = ufs_delete_entry(dir, de, folio); - if (err) - goto out; - - inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); - inode_dec_link_count(inode); - err = 0; -out: + if (!err) { + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); + inode_dec_link_count(inode); + } + folio_release_kmap(folio, de); return err; } @@ -253,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct ufs_dir_entry * dir_de = NULL; struct folio *old_folio; struct ufs_dir_entry *old_de; - int err = -ENOENT; + int err; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) - goto out; + return -ENOENT; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -281,7 +279,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); + err = ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); + folio_release_kmap(new_folio, new_de); + if (err) + goto out_dir; inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -299,26 +300,20 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, * rename. 
*/ inode_set_ctime_current(old_inode); - - ufs_delete_entry(old_dir, old_de, old_folio); mark_inode_dirty(old_inode); - if (dir_de) { + err = ufs_delete_entry(old_dir, old_de, old_folio); + if (!err && dir_de) { if (old_dir != new_dir) - ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); - else - folio_release_kmap(dir_folio, dir_de); + err = ufs_set_link(old_inode, dir_de, dir_folio, + new_dir, 0); inode_dec_link_count(old_dir); } - return 0; - - out_dir: if (dir_de) folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); -out: return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index bc625788589c..762699c1bcf6 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -505,7 +505,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; - struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; @@ -521,21 +520,13 @@ static int ufs_read_cylinder_structures(struct super_block *sb) if (!base) goto failed; sbi->s_csp = (struct ufs_csum *)space; - for (i = 0; i < blks; i += uspi->s_fpb) { - size = uspi->s_bsize; - if (i + uspi->s_fpb > blks) - size = (blks - i) * uspi->s_fsize; - - ubh = ubh_bread(sb, uspi->s_csaddr + i, size); - - if (!ubh) + for (i = 0; i < blks; i++) { + struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); + if (!bh) goto failed; - - ubh_ubhcpymem (space, ubh, size); - - space += size; - ubh_brelse (ubh); - ubh = NULL; + memcpy(space, bh->b_data, uspi->s_fsize); + space += uspi->s_fsize; + brelse (bh); } /* @@ -645,7 +636,6 @@ static void ufs_put_super_internal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; - struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned blks, size, i; @@ -656,18 +646,17 @@ static void ufs_put_super_internal(struct super_block *sb) size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = (char*) sbi->s_csp; - for (i = 0; i < blks; i += uspi->s_fpb) { - size = uspi->s_bsize; - if (i + uspi->s_fpb > blks) - size = (blks - i) * uspi->s_fsize; - - ubh = ubh_bread(sb, uspi->s_csaddr + i, size); - - ubh_memcpyubh (ubh, space, size); - space += size; - ubh_mark_buffer_uptodate (ubh, 1); - ubh_mark_buffer_dirty (ubh); - ubh_brelse (ubh); + for (i = 0; i < blks; i++, space += uspi->s_fsize) { + struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); + + if (unlikely(!bh)) { // better than an oops... 
+ ufs_panic(sb, __func__, + "can't write part of cylinder group summary"); + continue; + } + memcpy(bh->b_data, space, uspi->s_fsize); + mark_buffer_dirty(bh); + brelse(bh); } for (i = 0; i < sbi->s_cg_loaded; i++) { ufs_put_cylinder (sb, i); @@ -1240,11 +1229,7 @@ magic_found: else uspi->s_apbshift = uspi->s_bshift - 2; - uspi->s_2apbshift = uspi->s_apbshift * 2; - uspi->s_3apbshift = uspi->s_apbshift * 3; uspi->s_apb = 1 << uspi->s_apbshift; - uspi->s_2apb = 1 << uspi->s_2apbshift; - uspi->s_3apb = 1 << uspi->s_3apbshift; uspi->s_apbmask = uspi->s_apb - 1; uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index a2c762cb65a0..e7df65dd4351 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -88,10 +88,10 @@ struct ufs_inode_info { #endif /* balloc.c */ -extern void ufs_free_fragments (struct inode *, u64, unsigned); -extern void ufs_free_blocks (struct inode *, u64, unsigned); -extern u64 ufs_new_fragments(struct inode *, void *, u64, u64, - unsigned, int *, struct page *); +void ufs_free_fragments (struct inode *, u64 fragment, unsigned count); +void ufs_free_blocks (struct inode *, u64 fragment, unsigned count); +u64 ufs_new_fragments(struct inode *, void *, u64 fragment, u64 goal, + unsigned count, int *err, struct folio *); /* cylinder.c */ extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned); @@ -108,8 +108,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *); int ufs_empty_dir(struct inode *); struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); -void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct folio *folio, struct inode *inode, bool update_times); +int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct folio *folio, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index ef9ead44776a..0905f9a16b91 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -775,12 +775,8 @@ struct ufs_sb_private_info { __u32 s_fpbmask; /* fragments per block mask */ __u32 s_apb; /* address per block */ - __u32 s_2apb; /* address per block^2 */ - __u32 s_3apb; /* address per block^3 */ __u32 s_apbmask; /* address per block mask */ __u32 s_apbshift; /* address per block shift */ - __u32 s_2apbshift; /* address per block shift * 2 */ - __u32 s_3apbshift; /* address per block shift * 3 */ __u32 s_nspfshift; /* number of sector per fragment shift */ __u32 s_nspb; /* number of sector per block */ __u32 s_inopf; /* inodes per fragment */ diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 2acf191eb89e..f0e906ab4ddd 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -99,20 +99,6 @@ void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) mark_buffer_dirty (ubh->bh[i]); } -void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) -{ - unsigned i; - if (!ubh) - return; - if (flag) { - for ( i = 0; i < ubh->count; i++ ) - set_buffer_uptodate (ubh->bh[i]); - } else { - for ( i = 0; i < ubh->count; i++ ) - clear_buffer_uptodate (ubh->bh[i]); - } -} - void ubh_sync_block(struct ufs_buffer_head *ubh) { if (ubh) { @@ -146,38 +132,6 @@ int ubh_buffer_dirty (struct ufs_buffer_head * ubh) return result; } -void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi, - unsigned char * mem, struct ufs_buffer_head * ubh, unsigned 
size) -{ - unsigned len, bhno; - if (size > (ubh->count << uspi->s_fshift)) - size = ubh->count << uspi->s_fshift; - bhno = 0; - while (size) { - len = min_t(unsigned int, size, uspi->s_fsize); - memcpy (mem, ubh->bh[bhno]->b_data, len); - mem += uspi->s_fsize; - size -= len; - bhno++; - } -} - -void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) -{ - unsigned len, bhno; - if (size > (ubh->count << uspi->s_fshift)) - size = ubh->count << uspi->s_fshift; - bhno = 0; - while (size) { - len = min_t(unsigned int, size, uspi->s_fsize); - memcpy (ubh->bh[bhno]->b_data, mem, len); - mem += uspi->s_fsize; - size -= len; - bhno++; - } -} - dev_t ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) { diff --git a/fs/ufs/util.h b/fs/ufs/util.h index bf708b68f150..391bb4f11d74 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -263,14 +263,9 @@ extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, str extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); -extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); extern void ubh_sync_block(struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); -#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) -extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned); -#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size) -extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); @@ -455,65 +450,69 @@ static inline unsigned _ubh_find_last_zero_bit_( return (base << uspi->s_bpfshift) + pos - begin; } -#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block)) - -#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block) -static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline int ubh_isblockset(struct ufs_sb_private_info *uspi, + struct ufs_cg_private_info *ucpi, unsigned int frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); u8 mask; + switch (uspi->s_fpb) { case 8: - return (*ubh_get_addr (ubh, begin + block) == 0xff); + return *p == 0xff; case 4: - mask = 0x0f << ((block & 0x01) << 2); - return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; + mask = 0x0f << (frag & 4); + return (*p & mask) == mask; case 2: - mask = 0x03 << ((block & 0x03) << 1); - return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; + mask = 0x03 << (frag & 6); + return (*p & mask) == mask; case 1: - mask = 0x01 << (block & 0x07); - return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; + mask = 0x01 << (frag & 7); + return (*p & mask) == mask; } return 0; } -#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block) -static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline void ubh_clrblock(struct ufs_sb_private_info *uspi, + struct ufs_cg_private_info *ucpi, unsigned int 
frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); + switch (uspi->s_fpb) { case 8: - *ubh_get_addr (ubh, begin + block) = 0x00; + *p = 0x00; return; case 4: - *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2)); + *p &= ~(0x0f << (frag & 4)); return; case 2: - *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1)); + *p &= ~(0x03 << (frag & 6)); return; case 1: - *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07))); + *p &= ~(0x01 << (frag & 7)); return; } } -#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block) -static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi, - struct ufs_buffer_head * ubh, unsigned begin, unsigned block) +static inline void ubh_setblock(struct ufs_sb_private_info * uspi, + struct ufs_cg_private_info *ucpi, unsigned int frag) { + struct ufs_buffer_head *ubh = UCPI_UBH(ucpi); + u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3)); + switch (uspi->s_fpb) { case 8: - *ubh_get_addr(ubh, begin + block) = 0xff; + *p = 0xff; return; case 4: - *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2)); + *p |= 0x0f << (frag & 4); return; case 2: - *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1)); + *p |= 0x03 << (frag & 6); return; case 1: - *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07))); + *p |= 0x01 << (frag & 7); return; } } diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data index c73786807d3b..f75567e28138 100644 --- a/fs/unicode/README.utf8data +++ b/fs/unicode/README.utf8data @@ -1,4 +1,4 @@ -The utf8data.h file in this directory is generated from the Unicode +The utf8data.c file in this directory is generated from the Unicode Character Database for version 12.1.0 of the Unicode standard. The full set of files can be found here: @@ -45,13 +45,13 @@ Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1: make REGENERATE_UTF8DATA=1 fs/unicode/ -After sanity checking the newly generated utf8data.h file (the +After sanity checking the newly generated utf8data.c file (the version generated from the 12.1.0 UCD should be 4,109 lines long, and have a total size of 324k) and/or comparing it with the older version -of utf8data.h_shipped, rename it to utf8data.h_shipped. +of utf8data.c_shipped, rename it to utf8data.c_shipped. If you are a kernel developer updating to a newer version of the Unicode Character Database, please update this README.utf8data file with the version of the UCD that was used, the md5sum and sha1sums of -the *.txt files, before checking in the new versions of the utf8data.h +the *.txt files, before checking in the new versions of the utf8data.c and README.utf8data files. 
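The fs/unicode hunks that follow (mkutf8data.c and utf8-core.c) rename the generator output to utf8data.c and export a new utf8_parse_version() helper next to the existing utf8_load()/utf8_unload() interface. Purely as a non-authoritative sketch of how a caller might combine the new helper with utf8_load(), assuming utf8_parse_version() is declared in <linux/unicode.h> as part of this series; the function name demo_load_casefold_map and the version_opt argument are invented for illustration only:

/*
 * Illustrative sketch, not part of the patch: one way a filesystem could
 * use the utf8_parse_version() helper added in fs/unicode/utf8-core.c
 * together with the existing utf8_load()/utf8_unload() interfaces.
 * demo_load_casefold_map() and version_opt are assumptions for this example.
 */
#include <linux/unicode.h>
#include <linux/err.h>

static struct unicode_map *demo_load_casefold_map(char *version_opt)
{
	struct unicode_map *um;
	int version;

	/* e.g. "12.1.0" becomes UNICODE_AGE(12, 1, 0); negative errno on bad input */
	version = utf8_parse_version(version_opt);
	if (version < 0)
		return ERR_PTR(version);

	/* utf8_load() returns an ERR_PTR() on failure */
	um = utf8_load(version);
	if (IS_ERR(um))
		return um;

	/* caller keeps um for casefolded lookups and drops it with utf8_unload(um) */
	return um;
}

The returned map would typically be stored in the superblock and released with utf8_unload() at unmount time.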
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index 77b685db8275..401f5d3aeb0c 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -36,7 +36,7 @@ #define FOLD_NAME "CaseFolding.txt" #define NORM_NAME "NormalizationCorrections.txt" #define TEST_NAME "NormalizationTest.txt" -#define UTF8_NAME "utf8data.h" +#define UTF8_NAME "utf8data.c" const char *age_name = AGE_NAME; const char *ccc_name = CCC_NAME; @@ -3338,7 +3338,7 @@ static void write_file(void) } fprintf(file, "};\n"); fprintf(file, "\n"); - fprintf(file, "struct utf8data_table utf8_data_table = {\n"); + fprintf(file, "const struct utf8data_table utf8_data_table = {\n"); fprintf(file, "\t.utf8agetab = utf8agetab,\n"); fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); fprintf(file, "\n"); diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 8395066341a4..6fc9ab8667e6 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -198,7 +198,7 @@ struct unicode_map *utf8_load(unsigned int version) return um; out_symbol_put: - symbol_put(um->tables); + symbol_put(utf8_data_table); out_free_um: kfree(um); return ERR_PTR(-EINVAL); @@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um) } EXPORT_SYMBOL(utf8_unload); +/** + * utf8_parse_version - Parse a UTF-8 version number from a string + * + * @version: input string + * + * Returns the parsed version on success, negative code on error + */ +int utf8_parse_version(char *version) +{ + substring_t args[3]; + unsigned int maj, min, rev; + static const struct match_token token[] = { + {1, "%d.%d.%d"}, + {0, NULL} + }; + + if (match_token(version, token, args) != 1) + return -EINVAL; + + if (match_int(&args[0], &maj) || match_int(&args[1], &min) || + match_int(&args[2], &rev)) + return -EINVAL; + + return UNICODE_AGE(maj, min, rev); +} +EXPORT_SYMBOL(utf8_parse_version); diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 600e15efe9ed..5ddaf27b21a6 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -17,9 +17,6 @@ static unsigned int failed_tests; static unsigned int total_tests; -/* Tests will be based on this version. */ -#define UTF8_LATEST UNICODE_AGE(12, 1, 0) - #define _test(cond, func, line, fmt, ...) 
do { \ total_tests++; \ if (!cond) { \ diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped index dafa5fed761d..73a93d49b3ba 100644 --- a/fs/unicode/utf8data.c_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -4107,7 +4107,7 @@ static const unsigned char utf8data[64256] = { 0x81,0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00 }; -struct utf8data_table utf8_data_table = { +const struct utf8data_table utf8_data_table = { .utf8agetab = utf8agetab, .utf8agetab_size = ARRAY_SIZE(utf8agetab), diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h index bd00d587747a..fc703aa4b28e 100644 --- a/fs/unicode/utf8n.h +++ b/fs/unicode/utf8n.h @@ -78,6 +78,6 @@ struct utf8data_table { const unsigned char *utf8data; }; -extern struct utf8data_table utf8_data_table; +extern const struct utf8data_table utf8_data_table; #endif /* UTF8NORM_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68cdd89c97a3..7c0bd0b55f88 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. + */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { diff --git a/fs/utimes.c b/fs/utimes.c index 99b26f792b89..c7c7958e57b2 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -108,18 +108,13 @@ retry: static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { - struct fd f; - int error; - if (flags) return -EINVAL; - f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; - error = vfs_utimes(&fd_file(f)->f_path, times); - fdput(f); - return error; + return vfs_utimes(&fd_file(f)->f_path, times); } /* diff --git a/fs/vboxsf/Kconfig b/fs/vboxsf/Kconfig index b84586ae08b3..d4694026db8b 100644 --- a/fs/vboxsf/Kconfig +++ b/fs/vboxsf/Kconfig @@ -1,6 +1,6 @@ config VBOXSF_FS tristate "VirtualBox guest shared folder (vboxsf) support" - depends on X86 && VBOXGUEST + depends on (ARM64 || X86) && VBOXGUEST select NLS help VirtualBox hosts can share folders with guests, this driver diff --git a/fs/xattr.c b/fs/xattr.c index 05ec7e7d9e87..02bee149ad96 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -586,25 +586,32 @@ retry_deleg: } EXPORT_SYMBOL_GPL(vfs_removexattr); +int import_xattr_name(struct xattr_name *kname, const char __user *name) +{ + int error = strncpy_from_user(kname->name, name, + sizeof(kname->name)); + if (error == 0 || error == sizeof(kname->name)) + return -ERANGE; + if (error < 0) + return error; + return 0; +} + /* * Extended attribute SET operations */ -int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx 
*ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; - error = strncpy_from_user(ctx->kname->name, name, - sizeof(ctx->kname->name)); - if (error == 0 || error == sizeof(ctx->kname->name)) - return -ERANGE; - if (error < 0) + error = import_xattr_name(ctx->kname, name); + if (error) return error; - error = 0; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; @@ -619,8 +626,8 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) return error; } -int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx) +static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, @@ -630,32 +637,32 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, ctx->kvalue, ctx->size, ctx->flags); } -static int path_setxattr(const char __user *pathname, - const char __user *name, const void __user *value, - size_t size, int flags, unsigned int lookup_flags) +int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) +{ + int error = mnt_want_write_file(f); + + if (!error) { + audit_file(f); + error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); + mnt_drop_write_file(f); + } + return error; +} + +/* unconditionally consumes filename */ +int filename_setxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { - struct xattr_name kname; - struct xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, - }; struct path path; int error; - error = setxattr_copy(name, &ctx); - if (error) - return error; - retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { - error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx); + error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); @@ -665,80 +672,121 @@ retry: } out: + putname(filename); + return error; +} + +static int path_setxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + const void __user *value, size_t size, int flags) +{ + struct xattr_name kname; + struct kernel_xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + struct filename *filename; + unsigned int lookup_flags = 0; + int error; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags = LOOKUP_FOLLOW; + + error = setxattr_copy(name, &ctx); + if (error) + return error; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + error = -EBADF; + else + error = file_setxattr(fd_file(f), &ctx); + } else { + error = filename_setxattr(dfd, filename, lookup_flags, &ctx); + } kvfree(ctx.kvalue); return error; } +SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, const struct xattr_args __user *, uargs, + size_t, usize) +{ + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return 
-EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (error) + return error; + + return path_setxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size, + args.flags); +} + SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); + return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, 0); + return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { - struct xattr_name kname; - struct xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, - }; - int error; - - CLASS(fd, f)(fd); - if (!fd_file(f)) - return -EBADF; - - audit_file(fd_file(f)); - error = setxattr_copy(name, &ctx); - if (error) - return error; - - error = mnt_want_write_file(fd_file(f)); - if (!error) { - error = do_setxattr(file_mnt_idmap(fd_file(f)), - fd_file(f)->f_path.dentry, &ctx); - mnt_drop_write_file(fd_file(f)); - } - kvfree(ctx.kvalue); - return error; + return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, + value, size, flags); } /* * Extended attribute GET operations */ -ssize_t +static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, - struct xattr_ctx *ctx) + struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; + void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; - ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); - if (!ctx->kvalue) + kvalue = kvzalloc(ctx->size, GFP_KERNEL); + if (!kvalue) return -ENOMEM; } - if (is_posix_acl_xattr(ctx->kname->name)) - error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size); + if (is_posix_acl_xattr(kname)) + error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else - error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size); + error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { - if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) + if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger @@ -746,79 +794,114 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d, error = -E2BIG; } + kvfree(kvalue); return error; } -static ssize_t -getxattr(struct mnt_idmap *idmap, struct dentry *d, - const char __user *name, void __user *value, size_t size) +ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { + audit_file(f); + return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); +} + +/* unconditionally consumes filename */ +ssize_t filename_getxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) +{ + struct path path; ssize_t error; +retry: + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); + if (error) + goto out; + error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + 
goto retry; + } +out: + putname(filename); + return error; +} + +static ssize_t path_getxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + void __user *value, size_t size) +{ struct xattr_name kname; - struct xattr_ctx ctx = { + struct kernel_xattr_ctx ctx = { .value = value, - .kvalue = NULL, .size = size, .kname = &kname, .flags = 0, }; + struct filename *filename; + ssize_t error; - error = strncpy_from_user(kname.name, name, sizeof(kname.name)); - if (error == 0 || error == sizeof(kname.name)) - error = -ERANGE; - if (error < 0) - return error; + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; - error = do_getxattr(idmap, d, &ctx); + error = import_xattr_name(&kname, name); + if (error) + return error; - kvfree(ctx.kvalue); - return error; + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_getxattr(fd_file(f), &ctx); + } else { + int lookup_flags = 0; + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags = LOOKUP_FOLLOW; + return filename_getxattr(dfd, filename, lookup_flags, &ctx); + } } -static ssize_t path_getxattr(const char __user *pathname, - const char __user *name, void __user *value, - size_t size, unsigned int lookup_flags) +SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { - struct path path; - ssize_t error; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return -EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; - error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + + if (args.flags != 0) + return -EINVAL; + + return path_getxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); + return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, 0); + return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { - struct fd f = fdget(fd); - ssize_t error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry, - name, value, size); - fdput(f); - return error; + return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* @@ -853,47 +936,80 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -static ssize_t path_listxattr(const char __user *pathname, char __user *list, - size_t size, unsigned int lookup_flags) +static +ssize_t file_listxattr(struct file 
*f, char __user *list, size_t size) +{ + audit_file(f); + return listxattr(f->f_path.dentry, list, size); +} + +/* unconditionally consumes filename */ +static +ssize_t filename_listxattr(int dfd, struct filename *filename, + unsigned int lookup_flags, + char __user *list, size_t size) { struct path path; ssize_t error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - return error; + goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +out: + putname(filename); return error; } +static ssize_t path_listxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, char __user *list, + size_t size) +{ + struct filename *filename; + int lookup_flags; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_listxattr(fd_file(f), list, size); + } + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + return filename_listxattr(dfd, filename, lookup_flags, list, size); +} + +SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, + char __user *, list, size_t, size) +{ + return path_listxattrat(dfd, pathname, at_flags, list, size); +} + SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); + return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, 0); + return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { - struct fd f = fdget(fd); - ssize_t error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - error = listxattr(fd_file(f)->f_path.dentry, list, size); - fdput(f); - return error; + return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* @@ -907,25 +1023,33 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) return vfs_removexattr(idmap, d, name); } -static int path_removexattr(const char __user *pathname, - const char __user *name, unsigned int lookup_flags) +static int file_removexattr(struct file *f, struct xattr_name *kname) +{ + int error = mnt_want_write_file(f); + + if (!error) { + audit_file(f); + error = removexattr(file_mnt_idmap(f), + f->f_path.dentry, kname->name); + mnt_drop_write_file(f); + } + return error; +} + +/* unconditionally consumes filename */ +static int filename_removexattr(int dfd, struct filename *filename, + unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; - char kname[XATTR_NAME_MAX + 1]; - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) - return error; + goto out; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(mnt_idmap(path.mnt), path.dentry, kname); + error = removexattr(mnt_idmap(path.mnt), path.dentry, 
kname->name); mnt_drop_write(path.mnt); } path_put(&path); @@ -933,45 +1057,58 @@ retry: lookup_flags |= LOOKUP_REVAL; goto retry; } +out: + putname(filename); return error; } +static int path_removexattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name) +{ + struct xattr_name kname; + struct filename *filename; + unsigned int lookup_flags; + int error; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + error = import_xattr_name(&kname, name); + if (error) + return error; + + filename = getname_maybe_null(pathname, at_flags); + if (!filename) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + return file_removexattr(fd_file(f), &kname); + } + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + return filename_removexattr(dfd, filename, lookup_flags, &kname); +} + +SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, const char __user *, name) +{ + return path_removexattrat(dfd, pathname, at_flags, name); +} + SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, LOOKUP_FOLLOW); + return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, 0); + return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { - struct fd f = fdget(fd); - char kname[XATTR_NAME_MAX + 1]; - int error = -EBADF; - - if (!fd_file(f)) - return error; - audit_file(fd_file(f)); - - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; - - error = mnt_want_write_file(fd_file(f)); - if (!error) { - error = removexattr(file_mnt_idmap(fd_file(f)), - fd_file(f)->f_path.dentry, kname); - mnt_drop_write_file(fd_file(f)); - } - fdput(f); - return error; + return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) @@ -1005,9 +1142,10 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; - int err = 0; for_each_xattr_handler(handlers, handler) { + int err; + if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); @@ -1015,7 +1153,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return err; } - return err ? 
err : buffer_size - remaining_size; + return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index dd692619bed5..ed9b0dabc1f1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -14,7 +14,9 @@ xfs-y += xfs_trace.o # build the libxfs code first xfs-y += $(addprefix libxfs/, \ + xfs_group.o \ xfs_ag.o \ + xfs_ag_resv.o \ xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ @@ -42,7 +44,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_buf.o \ xfs_inode_util.o \ xfs_log_rlimit.o \ - xfs_ag_resv.o \ + xfs_metadir.o \ + xfs_metafile.o \ xfs_parent.o \ xfs_rmap.o \ xfs_rmap_btree.o \ @@ -58,6 +61,7 @@ xfs-y += $(addprefix libxfs/, \ # xfs_rtbitmap is shared with libxfs xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ + xfs_rtgroup.o \ ) # highlevel code @@ -171,6 +175,7 @@ xfs-y += $(addprefix scrub/, \ inode.o \ iscan.o \ listxattr.o \ + metapath.o \ nlinks.o \ parent.o \ readdir.o \ @@ -186,6 +191,7 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rgsuper.o \ rtbitmap.o \ rtsummary.o \ ) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5f0494702e0b..b59cb461e096 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -30,86 +30,7 @@ #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" - - -/* - * Passive reference counting access wrappers to the perag structures. If the - * per-ag structure is to be freed, the freeing code is responsible for cleaning - * up objects with passive references before freeing the structure. This is - * things like cached buffers. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = xa_load(&mp->m_perags, agno); - if (pag) { - trace_xfs_perag_get(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) >= 0); - atomic_inc(&pag->pag_ref); - } - rcu_read_unlock(); - return pag; -} - -/* Get a passive reference to the given perag. */ -struct xfs_perag * -xfs_perag_hold( - struct xfs_perag *pag) -{ - ASSERT(atomic_read(&pag->pag_ref) > 0 || - atomic_read(&pag->pag_active_ref) > 0); - - trace_xfs_perag_hold(pag, _RET_IP_); - atomic_inc(&pag->pag_ref); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - trace_xfs_perag_put(pag, _RET_IP_); - ASSERT(atomic_read(&pag->pag_ref) > 0); - atomic_dec(&pag->pag_ref); -} - -/* - * Active references for perag structures. This is for short term access to the - * per ag structures for walking trees or accessing state. If an AG is being - * shrunk or is offline, then this will fail to find that AG and return NULL - * instead. 
- */ -struct xfs_perag * -xfs_perag_grab( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - - rcu_read_lock(); - pag = xa_load(&mp->m_perags, agno); - if (pag) { - trace_xfs_perag_grab(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; -} - -void -xfs_perag_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_rele(pag, _RET_IP_); - if (atomic_dec_and_test(&pag->pag_active_ref)) - wake_up(&pag->pag_active_wq); -} +#include "xfs_group.h" /* * xfs_initialize_perag_data @@ -184,31 +105,32 @@ out: return error; } +static void +xfs_perag_uninit( + struct xfs_group *xg) +{ +#ifdef __KERNEL__ + struct xfs_perag *pag = to_perag(xg); + + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_buf_cache_destroy(&pag->pag_bcache); +#endif +} + /* - * Free up the per-ag resources associated with the mount structure. + * Free up the per-ag resources within the specified AG range. */ void -xfs_free_perag( - struct xfs_mount *mp) +xfs_free_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno) + { - struct xfs_perag *pag; xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xa_erase(&mp->m_perags, agno); - ASSERT(pag); - XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); - xfs_defer_drain_free(&pag->pag_intents_drain); - - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_cache_destroy(&pag->pag_bcache); - - /* drop the mount's active reference */ - xfs_perag_rele(pag); - XFS_IS_CORRUPT(pag->pag_mount, - atomic_read(&pag->pag_active_ref) != 0); - kfree_rcu_mightsleep(pag); - } + for (agno = first_agno; agno < end_agno; agno++) + xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit); } /* Find the size of the AG, in blocks. */ @@ -271,118 +193,100 @@ xfs_agino_range( } /* - * Free perag within the specified AG range, it is only used to free unused - * perags under the error handling path. + * Update the perag of the previous tail AG if it has been changed during + * recovery (i.e. recovery of a growfs). */ -void -xfs_free_unused_perag_range( +int +xfs_update_last_ag_size( struct xfs_mount *mp, - xfs_agnumber_t agstart, - xfs_agnumber_t agend) + xfs_agnumber_t prev_agcount) { - struct xfs_perag *pag; - xfs_agnumber_t index; + struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1); - for (index = agstart; index < agend; index++) { - pag = xa_erase(&mp->m_perags, index); - if (!pag) - break; - xfs_buf_cache_destroy(&pag->pag_bcache); - xfs_defer_drain_free(&pag->pag_intents_drain); - kfree(pag); - } + if (!pag) + return -EFSCORRUPTED; + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, + prev_agcount - 1, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); + xfs_perag_rele(pag); + return 0; } -int -xfs_initialize_perag( +static int +xfs_perag_alloc( struct xfs_mount *mp, + xfs_agnumber_t index, xfs_agnumber_t agcount, - xfs_rfsblock_t dblocks, - xfs_agnumber_t *maxagi) + xfs_rfsblock_t dblocks) { struct xfs_perag *pag; - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. 
- */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - - error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL); - if (error) { - WARN_ON_ONCE(error == -EBUSY); - goto out_free_pag; - } + pag = kzalloc(sizeof(*pag), GFP_KERNEL); + if (!pag) + return -ENOMEM; #ifdef __KERNEL__ - /* Place kernel structure only init below this point. */ - spin_lock_init(&pag->pag_ici_lock); - spin_lock_init(&pag->pagb_lock); - spin_lock_init(&pag->pag_state_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - xfs_defer_drain_init(&pag->pag_intents_drain); - init_waitqueue_head(&pag->pagb_wait); - init_waitqueue_head(&pag->pag_active_wq); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; - xfs_hooks_init(&pag->pag_rmap_update_hooks); + /* Place kernel structure only init below this point. */ + spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); #endif /* __KERNEL__ */ - error = xfs_buf_cache_init(&pag->pag_bcache); - if (error) - goto out_remove_pag; + error = xfs_buf_cache_init(&pag->pag_bcache); + if (error) + goto out_free_perag; - /* Active ref owned by mount indicates AG is online. */ - atomic_set(&pag->pag_active_ref, 1); + /* + * Pre-calculated geometry + */ + pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; + error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG); + if (error) + goto out_buf_cache_destroy; - /* - * Pre-calculated geometry - */ - pag->block_count = __xfs_ag_block_count(mp, index, agcount, - dblocks); - pag->min_block = XFS_AGFL_BLOCK(mp); - __xfs_agino_range(mp, pag->block_count, &pag->agino_min, - &pag->agino_max); - } + return 0; - index = xfs_set_inode_alloc(mp, agcount); +out_buf_cache_destroy: + xfs_buf_cache_destroy(&pag->pag_bcache); +out_free_perag: + kfree(pag); + return error; +} - if (maxagi) - *maxagi = index; +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + int error; + if (orig_agcount >= new_agcount) + return 0; + + for (index = orig_agcount; index < new_agcount; index++) { + error = xfs_perag_alloc(mp, index, new_agcount, dblocks); + if (error) + goto out_unwind_new_pags; + } + + *maxagi = xfs_set_inode_alloc(mp, new_agcount); mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_remove_pag: - xfs_defer_drain_free(&pag->pag_intents_drain); - pag = xa_erase(&mp->m_perags, index); -out_free_pag: - kfree(pag); out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - xfs_free_unused_perag_range(mp, first_initialised, agcount); + xfs_free_perag_range(mp, orig_agcount, index); return error; } @@ -837,7 +741,7 @@ xfs_ag_shrink_space( struct xfs_trans **tpp, xfs_extlen_t delta) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct 
xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -854,7 +758,7 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp); if (error) return error; @@ -891,7 +795,7 @@ xfs_ag_shrink_space( /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); + xfs_agbno_to_fsb(pag, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; @@ -950,9 +854,9 @@ xfs_ag_shrink_space( } /* Update perag geometry */ - pag->block_count -= delta; - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count -= delta; + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); @@ -977,12 +881,13 @@ xfs_ag_extend_space( struct xfs_trans *tp, xfs_extlen_t len) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; - ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, tp, 0, &bp); if (error) @@ -1021,9 +926,9 @@ xfs_ag_extend_space( return error; /* Update perag geometry */ - pag->block_count = be32_to_cpu(agf->agf_length); - __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, - &pag->agino_max); + pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min, + &pag->agino_max); return 0; } @@ -1050,7 +955,7 @@ xfs_ag_get_geometry( /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = pag->pag_agno; + ageo->ag_number = pag_agno(pag); agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index d9cccd093b60..1f24cfa27321 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -7,6 +7,8 @@ #ifndef __LIBXFS_AG_H #define __LIBXFS_AG_H 1 +#include "xfs_group.h" + struct xfs_mount; struct xfs_trans; struct xfs_perag; @@ -30,11 +32,7 @@ struct xfs_ag_resv { * performance of allocation group selection. */ struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* passive reference count */ - atomic_t pag_active_ref; /* active reference count */ - wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ + struct xfs_group pag_group; unsigned long pag_opstate; uint8_t pagf_bno_level; /* # of levels in bno btree */ uint8_t pagf_cnt_level; /* # of levels in cnt btree */ @@ -55,7 +53,6 @@ struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; - int pagb_count; /* pagb slots in use */ uint8_t pagf_refcount_level; /* recount btree height */ /* Blocks reserved for all kinds of metadata. */ @@ -64,21 +61,12 @@ struct xfs_perag { struct xfs_ag_resv pag_rmapbt_resv; /* Precalculated geometry info */ - xfs_agblock_t block_count; - xfs_agblock_t min_block; xfs_agino_t agino_min; xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. 
- * Callers should hold pag_state_lock before accessing this field. - */ - uint16_t pag_checked; - uint16_t pag_sick; - #ifdef CONFIG_XFS_ONLINE_REPAIR /* * Alternate btree heights so that online repair won't trip the write @@ -90,13 +78,6 @@ struct xfs_perag { uint8_t pagf_repair_rmap_level; #endif - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ spinlock_t pag_ici_lock; /* incore inode cache lock */ @@ -108,21 +89,29 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - - /* - * We use xfs_drain to track the number of deferred log intent items - * that have been queued (but not yet processed) so that waiters (e.g. - * scrub) will not lock resources when other threads are in the middle - * of processing a chain of intent items only to find momentary - * inconsistencies. - */ - struct xfs_defer_drain pag_intents_drain; - - /* Hook to feed rmapbt updates to an active online repair. */ - struct xfs_hooks pag_rmap_update_hooks; #endif /* __KERNEL__ */ }; +static inline struct xfs_perag *to_perag(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_perag, pag_group); +} + +static inline struct xfs_group *pag_group(struct xfs_perag *pag) +{ + return &pag->pag_group; +} + +static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_mount; +} + +static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag) +{ + return pag->pag_group.xg_gno; +} + /* * Per-AG operational state. These are atomic flag bits. 
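The four inline helpers just added (to_perag, pag_group, pag_mount, pag_agno) define the conversion pattern used by the rest of this series: fields that used to live directly in struct xfs_perag are now reached through the embedded struct xfs_group. A condensed before/after fragment, restating the xfs_ag_extend_space hunk earlier in this diff:

	/* before: direct perag fields */
	ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);

	/* after: mount and AG number resolve through the embedded group */
	ASSERT(pag_agno(pag) == pag_mount(pag)->m_sb.sb_agcount - 1);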
*/ @@ -144,21 +133,80 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) -void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, - xfs_agnumber_t agend); -int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount, + xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount, + xfs_agnumber_t *maxagi); +void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno, + xfs_agnumber_t end_agno); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); -void xfs_free_perag(struct xfs_mount *mp); +int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); /* Passive AG references */ -struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); -struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); -void xfs_perag_put(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + return to_perag(xfs_group_hold(pag_group(pag))); +} + +static inline void +xfs_perag_put( + struct xfs_perag *pag) +{ + xfs_group_put(pag_group(pag)); +} /* Active AG references */ -struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t); -void xfs_perag_rele(struct xfs_perag *pag); +static inline struct xfs_perag * +xfs_perag_grab( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG)); +} + +static inline void +xfs_perag_rele( + struct xfs_perag *pag) +{ + xfs_group_rele(pag_group(pag)); +} + +static inline struct xfs_perag * +xfs_perag_next_range( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno, + xfs_agnumber_t end_agno) +{ + return to_perag(xfs_group_next_range(mp, pag ? 
pag_group(pag) : NULL, + start_agno, end_agno, XG_TYPE_AG)); +} + +static inline struct xfs_perag * +xfs_perag_next_from( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_agnumber_t start_agno) +{ + return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1); +} + +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + return xfs_perag_next_from(mp, pag, 0); +} /* * Per-ag geometry infomation and validation @@ -170,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) { - if (agbno >= pag->block_count) - return false; - if (agbno <= pag->min_block) - return false; - return true; + return xfs_verify_gbno(pag_group(pag), agbno); } static inline bool @@ -183,13 +227,7 @@ xfs_verify_agbext( xfs_agblock_t agbno, xfs_agblock_t len) { - if (agbno + len <= agbno) - return false; - - if (!xfs_verify_agbno(pag, agbno)) - return false; - - return xfs_verify_agbno(pag, agbno + len - 1); + return xfs_verify_gbext(pag_group(pag), agbno, len); } /* @@ -225,40 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); } -/* - * Perag iteration APIs - */ -static inline struct xfs_perag * -xfs_perag_next( - struct xfs_perag *pag, - xfs_agnumber_t *agno, - xfs_agnumber_t end_agno) -{ - struct xfs_mount *mp = pag->pag_mount; - - *agno = pag->pag_agno + 1; - xfs_perag_rele(pag); - while (*agno <= end_agno) { - pag = xfs_perag_grab(mp, *agno); - if (pag) - return pag; - (*agno)++; - } - return NULL; -} - -#define for_each_perag_range(mp, agno, end_agno, pag) \ - for ((pag) = xfs_perag_grab((mp), (agno)); \ - (pag) != NULL; \ - (pag) = xfs_perag_next((pag), &(agno), (end_agno))) - -#define for_each_perag_from(mp, agno, pag) \ - for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) - -#define for_each_perag(mp, agno, pag) \ - (agno) = 0; \ - for_each_perag_from((mp), (agno), (pag)) - static inline struct xfs_perag * xfs_perag_next_wrap( struct xfs_perag *pag, @@ -267,9 +271,9 @@ xfs_perag_next_wrap( xfs_agnumber_t restart_agno, xfs_agnumber_t wrap_agno) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - *agno = pag->pag_agno + 1; + *agno = pag_agno(pag) + 1; xfs_perag_rele(pag); while (*agno != stop_agno) { if (*agno >= wrap_agno) { @@ -331,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len); int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +static inline xfs_fsblock_t +xfs_agbno_to_fsb( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_daddr_t +xfs_agbno_to_daddr( + struct xfs_perag *pag, + xfs_agblock_t agbno) +{ + return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno); +} + +static inline xfs_ino_t +xfs_agino_to_ino( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino); +} + #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 216423df939e..f5d853089019 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -70,6 +70,7 @@ xfs_ag_resv_critical( struct xfs_perag *pag, enum xfs_ag_resv_type type) { + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t avail; xfs_extlen_t orig; @@ -92,8 +93,8 @@ xfs_ag_resv_critical( /* Critically 
low if less than 10% or max btree height remains. */ return XFS_TEST_ERROR(avail < orig / 10 || - avail < pag->pag_mount->m_agbtree_maxlevels, - pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); + avail < mp->m_agbtree_maxlevels, + mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -137,8 +138,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - if (pag->pag_agno == 0) - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag_agno(pag) == 0) + pag_mount(pag)->m_ag_max_usable += resv->ar_asked; /* * RMAPBT blocks come from the AGFL and AGFL blocks are always * considered "free", so whatever was reserved at mount time must be @@ -148,7 +149,7 @@ __xfs_ag_resv_free( oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; - xfs_add_fdblocks(pag->pag_mount, oldresv); + xfs_add_fdblocks(pag_mount(pag), oldresv); resv->ar_reserved = 0; resv->ar_asked = 0; resv->ar_orig_reserved = 0; @@ -170,7 +171,7 @@ __xfs_ag_resv_init( xfs_extlen_t ask, xfs_extlen_t used) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_ag_resv *resv; int error; xfs_extlen_t hidden_space; @@ -206,11 +207,10 @@ __xfs_ag_resv_init( else error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, - error, _RET_IP_); + trace_xfs_ag_resv_init_error(pag, error, _RET_IP_); xfs_warn(mp, "Per-AG reservation for AG %u failed. Filesystem may run out of space.", - pag->pag_agno); + pag_agno(pag)); return error; } @@ -220,7 +220,7 @@ __xfs_ag_resv_init( * counter, we only make the adjustment for AG 0. This assumes that * there aren't any AGs hungrier for per-AG reservation than AG 0. */ - if (pag->pag_agno == 0) + if (pag_agno(pag) == 0) mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); @@ -238,7 +238,7 @@ xfs_ag_resv_init( struct xfs_perag *pag, struct xfs_trans *tp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 59326f84f6a5..3d33e17f2e5c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -275,7 +275,7 @@ xfs_alloc_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); @@ -303,7 +303,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -331,7 +331,8 @@ xfs_alloc_compute_aligned( bool busy; /* Trim busy sections out of found extent */ - busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); + busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, + args->maxlen, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -539,7 +540,7 @@ static int xfs_alloc_fixup_longest( struct xfs_btree_cur *cnt_cur) { - struct xfs_perag *pag = cnt_cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cnt_cur->bc_group); struct xfs_buf *bp = cnt_cur->bc_ag.agbp; struct xfs_agf *agf = bp->b_addr; xfs_extlen_t longest = 0; @@ -799,7 +800,7 @@ xfs_agfl_verify( * use it by using uncached buffers that don't have the perag attached * so we 
can detect and avoid this problem. */ - if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag))) return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { @@ -879,13 +880,12 @@ xfs_alloc_read_agfl( struct xfs_trans *tp, struct xfs_buf **bpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *bp; int error; - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); @@ -1252,14 +1252,14 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, + xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { struct xfs_buf *bp; error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, - XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), + xfs_agbno_to_daddr(args->pag, fbno), args->mp->m_bsize, 0, &bp); if (error) goto error; @@ -1365,7 +1365,8 @@ xfs_alloc_ag_vextent_exact( */ tbno = fbno; tlen = flen; - xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); + xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, + &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1758,8 +1759,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_near_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - acur.busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), acur.busy_gen, + alloc_flags); if (error) goto out; @@ -1874,8 +1876,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -1923,7 +1926,7 @@ restart: error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); @@ -1973,8 +1976,9 @@ restart: * the allocation can be retried. */ trace_xfs_alloc_size_busy(args); - error = xfs_extent_busy_flush(args->tp, args->pag, - busy_gen, alloc_flags); + error = xfs_extent_busy_flush(args->tp, + pag_group(args->pag), busy_gen, + alloc_flags); if (error) goto error0; @@ -2037,7 +2041,6 @@ int xfs_free_ag_extent( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, @@ -2358,19 +2361,19 @@ xfs_free_ag_extent( * Update the freespace totals in the ag and superblock. 
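The xfs_extent_busy_* call sites in this file now key off the generic group rather than the perag, and xfs_extent_busy_trim() takes the minimum and maximum lengths explicitly instead of the whole args structure, presumably so the same busy-extent tracking can later serve realtime groups as well. A condensed before/after of one such call:

	/* before */
	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);

	/* after */
	xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);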
*/ error = xfs_alloc_update_counters(tp, agbp, len); - xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); if (error) goto error0; XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); + trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); + trace_xfs_free_extent(pag, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2429,7 +2432,7 @@ xfs_alloc_longest_free_extent( * reservations and AGFL rules in place, we can return this extent. */ if (pag->pagf_longest > delta) - return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, + return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, pag->pagf_longest - delta); /* Otherwise, let the caller try for 1 block if there's space. */ @@ -2612,7 +2615,7 @@ xfs_agfl_reset( xfs_warn(mp, "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " "Please unmount and run xfs_repair.", - pag->pag_agno, pag->pagf_flcount); + pag_agno(pag), pag->pagf_flcount); agf->agf_flfirst = 0; agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); @@ -2645,8 +2648,17 @@ xfs_defer_extent_free( ASSERT(!isnullstartblock(bno)); ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) - return -EFSCORRUPTED; + if (free_flags & XFS_FREE_EXTENT_REALTIME) { + if (type != XFS_AG_RESV_NONE) { + ASSERT(type == XFS_AG_RESV_NONE); + return -EFSCORRUPTED; + } + if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len))) + return -EFSCORRUPTED; + } else { + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + } xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); @@ -2655,6 +2667,8 @@ xfs_defer_extent_free( xefi->xefi_agresv = type; if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (free_flags & XFS_FREE_EXTENT_REALTIME) + xefi->xefi_flags |= XFS_EFI_REALTIME; if (oinfo) { ASSERT(oinfo->oi_offset == 0); @@ -2766,7 +2780,6 @@ xfs_alloc_commit_autoreap( xfs_defer_item_unpause(tp, aarp->dfp); } -#ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to * args->minlen. @@ -2806,7 +2819,6 @@ out: return error; } -#endif /* * Decide whether to use this allocation group for this allocation. @@ -2880,15 +2892,14 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; -#ifdef DEBUG - if (args->alloc_minlen_only) { + if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { int stat; error = xfs_exact_minlen_extent_available(args, agbp, &stat); if (error || !stat) goto out_agbp_relse; } -#endif + /* * Make the freelist shorter if it's too long. * @@ -2937,9 +2948,8 @@ xfs_alloc_fix_freelist( * Deferring the free disconnects freeing up the AGFL slot from * freeing the block. 
*/ - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, args->agno, bno), 1, - &targs.oinfo, XFS_AG_RESV_AGFL, 0); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), + 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); if (error) goto out_agbp_relse; } @@ -3159,8 +3169,6 @@ xfs_alloc_put_freelist( logflags |= XFS_AGF_BTREEBLKS; } - xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = xfs_buf_to_agfl_bno(agflbp); @@ -3193,7 +3201,7 @@ xfs_validate_ag_length( * use it by using uncached buffers that don't have the perag attached * so we can detect and avoid this problem. */ - if (bp->b_pag && seqno != bp->b_pag->pag_agno) + if (bp->b_pag && seqno != pag_agno(bp->b_pag)) return __this_address; /* @@ -3362,13 +3370,13 @@ xfs_read_agf( int flags, struct xfs_buf **agfbpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agf(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); @@ -3391,12 +3399,13 @@ xfs_alloc_read_agf( int flags, struct xfs_buf **agfbpp) { + struct xfs_mount *mp = pag_mount(pag); struct xfs_buf *agfbp; struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); + trace_xfs_alloc_read_agf(pag); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != @@ -3417,7 +3426,7 @@ xfs_alloc_read_agf( pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); - if (xfs_agfl_needs_reset(pag->pag_mount, agf)) + if (xfs_agfl_needs_reset(mp, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3430,16 +3439,15 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. 
*/ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(pag->pag_mount)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, - &pag->pag_mount->m_allocbt_blks); + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } #ifdef DEBUG - else if (!xfs_is_shutdown(pag->pag_mount)) { + else if (!xfs_is_shutdown(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3600,7 +3608,7 @@ xfs_alloc_vextent_finish( goto out_drop_perag; } - args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno); ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); @@ -3621,8 +3629,8 @@ xfs_alloc_vextent_finish( if (error) goto out_drop_perag; - ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, - args->len)); + ASSERT(!xfs_extent_busy_search(pag_group(args->pag), + args->agbno, args->len)); } xfs_ag_resv_alloc_extent(args->pag, args->resv, args); @@ -3652,21 +3660,20 @@ xfs_alloc_vextent_this_ag( struct xfs_alloc_arg *args, xfs_agnumber_t agno) { - struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == agno); + ASSERT(pag_agno(args->pag) == agno); args->agno = agno; args->agbno = 0; trace_xfs_alloc_vextent_this_ag(args); - error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), - &minimum_agno); + error = xfs_alloc_vextent_check_args(args, + xfs_agbno_to_fsb(args->pag, 0), &minimum_agno); if (error) { if (error == -ENOSPC) return 0; @@ -3871,7 +3878,7 @@ xfs_alloc_vextent_exact_bno( int error; ASSERT(args->pag != NULL); - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3910,7 +3917,7 @@ xfs_alloc_vextent_near_bno( int error; if (!needs_perag) - ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); + ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target)); args->agno = XFS_FSB_TO_AGNO(mp, target); args->agbno = XFS_FSB_TO_AGBNO(mp, target); @@ -3947,7 +3954,7 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = pag->pag_agno; + args.agno = pag_agno(pag); args.pag = pag; /* @@ -4015,14 +4022,13 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, - type); + error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags); return 0; err_release: @@ -4047,7 +4053,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index fae170825be0..50ef79a1ed41 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -53,11 +53,9 @@ typedef struct xfs_alloc_arg 
{ int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ + bool alloc_minlen_only; /* allocate exact minlen extent */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ -#ifdef DEBUG - bool alloc_minlen_only; /* allocate exact minlen extent */ -#endif } xfs_alloc_arg_t; /* @@ -81,9 +79,8 @@ int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, struct xfs_buf *agflbp, xfs_agblock_t bno, int btreeblk); int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, - xfs_extlen_t len, const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type); + xfs_agblock_t bno, xfs_extlen_t len, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); /* * Compute and fill in value of m_alloc_maxlevels. @@ -240,7 +237,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, /* Don't issue a discard for the blocks freed. */ #define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0) -#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD) +/* Free blocks on the realtime device. */ +#define XFS_FREE_EXTENT_REALTIME (1U << 1) + +#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \ + XFS_FREE_EXTENT_REALTIME) /* * List of extents to be free "later". @@ -251,7 +252,7 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - struct xfs_perag *xefi_pag; + struct xfs_group *xefi_group; unsigned int xefi_flags; enum xfs_ag_resv_type xefi_agresv; }; @@ -260,6 +261,12 @@ struct xfs_extent_free_item { #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ #define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ +#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */ + +static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi) +{ + return xefi->xefi_flags & XFS_EFI_REALTIME; +} struct xfs_alloc_autoreap { struct xfs_defer_pending *dfp; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index aada676eee51..a4ac37ba5d51 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -28,7 +28,7 @@ xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } STATIC struct xfs_btree_cur * @@ -36,29 +36,29 @@ xfs_cntbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, - cur->bc_ag.pag); + to_perag(cur->bc_group)); } - STATIC void xfs_allocbt_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; ASSERT(ptr->s != 0); if (xfs_btree_is_bno(cur->bc_ops)) { agf->agf_bno_root = ptr->s; be32_add_cpu(&agf->agf_bno_level, inc); - cur->bc_ag.pag->pagf_bno_level += inc; + pag->pagf_bno_level += inc; } else { agf->agf_cnt_root = ptr->s; be32_add_cpu(&agf->agf_cnt_level, inc); - cur->bc_ag.pag->pagf_cnt_level += inc; + pag->pagf_cnt_level 
+= inc; } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); @@ -75,7 +75,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -86,7 +86,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_group, bno, 1, false); new->s = cpu_to_be32(bno); @@ -104,13 +104,13 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, - bno, 1); + error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp, + agbp, NULL, bno, 1); if (error) return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } @@ -178,7 +178,7 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); if (xfs_btree_is_bno(cur->bc_ops)) ptr->s = agf->agf_bno_root; @@ -492,7 +492,7 @@ xfs_bnobt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -518,7 +518,7 @@ xfs_cntbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f30bcc64100d..17875ad865f5 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -51,7 +51,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); -STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); /* * Internal routines when attribute list is more than one block. @@ -437,6 +436,33 @@ xfs_attr_hashval( return xfs_attr_hashname(name, namelen); } +/* Save the current remote block info and clear the current pointers. */ +static void +xfs_attr_save_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno2 = args->blkno; + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; +} + +/* Set stored info about a remote block */ +static void +xfs_attr_restore_rmt_blk( + struct xfs_da_args *args) +{ + args->blkno = args->blkno2; + args->index = args->index2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; +} + /* * PPTR_REPLACE operations require the caller to set the old and new names and * values explicitly. 
Update the canonical fields to the new name and value @@ -482,48 +508,73 @@ xfs_attr_complete_op( return replace_state; } +/* + * Try to add an attribute to an inode in leaf form. + */ static int xfs_attr_leaf_addname( struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_buf *bp; int error; ASSERT(xfs_attr_is_leaf(args->dp)); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); + if (error) + return error; + /* - * Use the leaf buffer we may already hold locked as a result of - * a sf-to-leaf conversion. + * Look up the xattr name to set the insertion point for the new xattr. */ - error = xfs_attr_leaf_try_add(args); - - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + goto out_brelse; + trace_xfs_attr_leaf_replace(args); /* - * We're not in leaf format anymore, so roll the transaction and - * retry the add to the newly allocated node block. + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to + * add, not the attribute we just found and will remove later. */ - attr->xattri_dela_state = XFS_DAS_NODE_ADD; - goto out; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - if (error) - return error; /* * We need to commit and roll if we need to allocate remote xattr blocks * or perform more xattr manipulations. Otherwise there is nothing more * to do and we can return success. */ - if (args->rmtblkno) + if (!xfs_attr3_leaf_add(bp, args)) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + } else if (args->rmtblkno) { attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; - else - attr->xattri_dela_state = xfs_attr_complete_op(attr, - XFS_DAS_LEAF_REPLACE); -out: + } else { + attr->xattri_dela_state = + xfs_attr_complete_op(attr, XFS_DAS_LEAF_REPLACE); + } + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return 0; + +out_brelse: + xfs_trans_brelse(args->trans, bp); return error; } @@ -546,7 +597,7 @@ xfs_attr_node_addname( return error; error = xfs_attr_node_try_addname(attr); - if (error == -ENOSPC) { + if (error == 1) { error = xfs_attr3_leaf_to_node(args); if (error) return error; @@ -953,7 +1004,10 @@ xfs_attr_add_fork( unsigned int blks; /* space reservation */ int error; /* error return value */ - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + if (xfs_is_metadir_inode(ip)) + ASSERT(XFS_IS_DQDETACHED(ip)); + else + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); blks = XFS_ADDAFORK_SPACE_RES(mp); @@ -1170,88 +1224,6 @@ xfs_attr_shortform_addname( * External routines when attribute list is one block *========================================================================*/ -/* Save the current remote block info and clear the current pointers. 
*/ -static void -xfs_attr_save_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno2 = args->blkno; - args->index2 = args->index; - args->rmtblkno2 = args->rmtblkno; - args->rmtblkcnt2 = args->rmtblkcnt; - args->rmtvaluelen2 = args->rmtvaluelen; - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; -} - -/* Set stored info about a remote block */ -static void -xfs_attr_restore_rmt_blk( - struct xfs_da_args *args) -{ - args->blkno = args->blkno2; - args->index = args->index2; - args->rmtblkno = args->rmtblkno2; - args->rmtblkcnt = args->rmtblkcnt2; - args->rmtvaluelen = args->rmtvaluelen2; -} - -/* - * Tries to add an attribute to an inode in leaf form - * - * This function is meant to execute as part of a delayed operation and leaves - * the transaction handling to the caller. On success the attribute is added - * and the inode and transaction are left dirty. If there is not enough space, - * the attr data is converted to node format and -ENOSPC is returned. Caller is - * responsible for handling the dirty inode and transaction or adding the attr - * in node format. - */ -STATIC int -xfs_attr_leaf_try_add( - struct xfs_da_args *args) -{ - struct xfs_buf *bp; - int error; - - error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner, 0, &bp); - if (error) - return error; - - /* - * Look up the xattr name to set the insertion point for the new xattr. - */ - error = xfs_attr3_leaf_lookup_int(bp, args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - goto out_brelse; - break; - case -EEXIST: - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - goto out_brelse; - - trace_xfs_attr_leaf_replace(args); - /* - * Save the existing remote attr state so that the current - * values reflect the state of the new attribute we are about to - * add, not the attribute we just found and will remove later. - */ - xfs_attr_save_rmt_blk(args); - break; - case 0: - break; - default: - goto out_brelse; - } - - return xfs_attr3_leaf_add(bp, args); - -out_brelse: - xfs_trans_brelse(args->trans, bp); - return error; -} - /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -1417,9 +1389,12 @@ error: /* * Add a name to a Btree-format attribute list. * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). + * This will involve walking down the Btree, and may involve splitting leaf + * nodes and even splitting intermediate nodes up to and including the root + * node (a special case of an intermediate node). + * + * If the tree was still in single leaf format and needs to converted to + * real node format return 1 and let the caller handle that. */ static int xfs_attr_node_try_addname( @@ -1427,21 +1402,21 @@ xfs_attr_node_try_addname( { struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; - int error; + int error = 0; trace_xfs_attr_node_addname(state->args); blk = &state->path.blk[state->path.active-1]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_add(blk->bp, state->args); - if (error == -ENOSPC) { + if (!xfs_attr3_leaf_add(blk->bp, state->args)) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* * have been a b-tree. Let the caller deal with this. 
*/ + error = 1; goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e50d913ad32f..fddb55605e0c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -47,7 +47,7 @@ */ STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, struct xfs_buf **bpp); -STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, +STATIC void xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int freemap_index); STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, @@ -995,10 +995,8 @@ xfs_attr_shortform_to_leaf( xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); - error = xfs_attr3_leaf_add(bp, &nargs); - ASSERT(error != -ENOSPC); - if (error) - goto out; + if (!xfs_attr3_leaf_add(bp, &nargs)) + ASSERT(0); sfe = xfs_attr_sf_nextentry(sfe); } error = 0; @@ -1333,6 +1331,9 @@ xfs_attr3_leaf_create( /* * Split the leaf node, rebalance, then add the new entry. + * + * Returns 0 if the entry was added, 1 if a further split is needed or a + * negative error number otherwise. */ int xfs_attr3_leaf_split( @@ -1340,8 +1341,9 @@ xfs_attr3_leaf_split( struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk) { - xfs_dablk_t blkno; - int error; + bool added; + xfs_dablk_t blkno; + int error; trace_xfs_attr_leaf_split(state->args); @@ -1376,10 +1378,10 @@ xfs_attr3_leaf_split( */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); - error = xfs_attr3_leaf_add(oldblk->bp, state->args); + added = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); - error = xfs_attr3_leaf_add(newblk->bp, state->args); + added = xfs_attr3_leaf_add(newblk->bp, state->args); } /* @@ -1387,13 +1389,15 @@ xfs_attr3_leaf_split( */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); - return error; + if (!added) + return 1; + return 0; } /* * Add a name to the leaf attribute list structure. */ -int +bool xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) @@ -1402,6 +1406,7 @@ xfs_attr3_leaf_add( struct xfs_attr3_icleaf_hdr ichdr; int tablesize; int entsize; + bool added = true; int sum; int tmp; int i; @@ -1430,7 +1435,7 @@ xfs_attr3_leaf_add( if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); if (ichdr.freemap[i].size >= tmp) { - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + xfs_attr3_leaf_add_work(bp, &ichdr, args, i); goto out_log_hdr; } sum += ichdr.freemap[i].size; @@ -1442,7 +1447,7 @@ xfs_attr3_leaf_add( * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) - return -ENOSPC; + return false; /* * Compact the entries to coalesce free space. @@ -1455,24 +1460,24 @@ xfs_attr3_leaf_add( * free region, in freemap[0]. If it is not big enough, give up. */ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { - tmp = -ENOSPC; + added = false; goto out_log_hdr; } - tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); - return tmp; + return added; } /* * Add a name to a leaf attribute list structure. 
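Taken together, the attr hunks above replace the old -ENOSPC signalling with explicit return values: xfs_attr3_leaf_add() now returns true/false for added/did-not-fit, while xfs_attr3_leaf_split() and xfs_attr_node_try_addname() return 1 when the caller must convert the tree to node format; negative errnos are reserved for real failures. The resulting caller pattern, condensed from xfs_attr_leaf_addname() above:

	if (!xfs_attr3_leaf_add(bp, args)) {
		/* entry did not fit: convert to node format and retry */
		error = xfs_attr3_leaf_to_node(args);
		if (error)
			return error;
		attr->xattri_dela_state = XFS_DAS_NODE_ADD;
	} else if (args->rmtblkno) {
		attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
	}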
*/ -STATIC int +STATIC void xfs_attr3_leaf_add_work( struct xfs_buf *bp, struct xfs_attr3_icleaf_hdr *ichdr, @@ -1590,7 +1595,6 @@ xfs_attr3_leaf_add_work( } } ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); - return 0; } /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index bac219589896..589f810eedc0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -76,7 +76,7 @@ int xfs_attr3_leaf_split(struct xfs_da_state *state, int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); -int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, +bool xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8090e8249116..5255f93bae31 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -40,6 +40,7 @@ #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -1042,7 +1043,10 @@ xfs_bmap_add_attrfork( int error; /* error return value */ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + if (xfs_is_metadir_inode(ip)) + ASSERT(XFS_IS_DQDETACHED(ip)); + else + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1423,6 +1427,24 @@ xfs_bmap_last_offset( * Extent tree manipulation functions used during allocation. */ +static inline bool +xfs_bmap_same_rtgroup( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *left, + struct xfs_bmbt_irec *right) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, left->br_startblock) != + xfs_rtb_to_rgno(mp, right->br_startblock)) + return false; + } + + return true; +} + /* * Convert a delayed allocation to a real allocation. 
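On filesystems without rtgroups the new xfs_bmap_same_rtgroup() helper always returns true, so merge behaviour is unchanged there; with rtgroups enabled it keeps a single mapping from spanning two realtime groups. The contiguity checks later in this file simply add it as one more condition, e.g. (abridged):

	if (left.br_startblock + left.br_blockcount == new->br_startblock &&
	    left.br_state == new->br_state &&
	    left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    xfs_bmap_same_rtgroup(ip, whichfork, &left, new))
		state |= BMAP_LEFT_CONTIG;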
*/ @@ -1492,7 +1514,8 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -1516,7 +1539,8 @@ xfs_bmap_add_extent_delay_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2061,7 +2085,8 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new)) state |= BMAP_LEFT_CONTIG; /* @@ -2085,7 +2110,8 @@ xfs_bmap_add_extent_unwritten_real( (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= XFS_MAX_BMBT_EXTLEN)) + <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT)) state |= BMAP_RIGHT_CONTIG; /* @@ -2745,7 +2771,8 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN && + xfs_bmap_same_rtgroup(ip, whichfork, &left, new)) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && @@ -2755,7 +2782,8 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) && + xfs_bmap_same_rtgroup(ip, whichfork, new, &right)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -3121,8 +3149,15 @@ xfs_bmap_adjacent_valid( struct xfs_mount *mp = ap->ip->i_mount; if (XFS_IS_REALTIME_INODE(ap->ip) && - (ap->datatype & XFS_ALLOC_USERDATA)) - return x < mp->m_sb.sb_rblocks; + (ap->datatype & XFS_ALLOC_USERDATA)) { + if (!xfs_has_rtgroups(mp)) + return x < mp->m_sb.sb_rblocks; + + return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) && + xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount && + xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents; + + } return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && @@ -3280,7 +3315,7 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(pag, - xfs_alloc_min_freelist(pag->pag_mount, pag), + xfs_alloc_min_freelist(pag_mount(pag), pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3477,31 +3512,19 @@ xfs_bmap_process_allocated_extent( xfs_bmap_alloc_account(ap); } -#ifdef DEBUG static int xfs_bmap_exact_minlen_extent_alloc( - struct xfs_bmalloca *ap) + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) { - struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_alloc_arg 
args = { .tp = ap->tp, .mp = mp }; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - int error; - - ASSERT(ap->length); - if (ap->minlen != 1) { - ap->blkno = NULLFSBLOCK; - ap->length = 0; + args->fsbno = NULLFSBLOCK; return 0; } - orig_offset = ap->offset; - orig_length = ap->length; - - args.alloc_minlen_only = 1; - - xfs_bmap_compute_alignments(ap, &args); + args->alloc_minlen_only = 1; + args->minlen = args->maxlen = ap->minlen; + args->total = ap->total; /* * Unlike the longest extent available in an AG, we don't track @@ -3511,39 +3534,16 @@ xfs_bmap_exact_minlen_extent_alloc( * we need not be concerned about a drop in performance in * "debug only" code paths. */ - ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); - - args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - args.minlen = args.maxlen = ap->minlen; - args.total = ap->total; - - args.alignment = 1; - args.minalignslop = 0; - - args.minleft = ap->minleft; - args.wasdel = ap->wasdel; - args.resv = XFS_AG_RESV_NONE; - args.datatype = ap->datatype; - - error = xfs_alloc_vextent_first_ag(&args, ap->blkno); - if (error) - return error; - - if (args.fsbno != NULLFSBLOCK) { - xfs_bmap_process_allocated_extent(ap, &args, orig_offset, - orig_length); - } else { - ap->blkno = NULLFSBLOCK; - ap->length = 0; - } + ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); - return 0; + /* + * Call xfs_bmap_btalloc_low_space here as it first does a "normal" AG + * iteration and then drops args->total to args->minlen, which might be + * required to find an allocation for the transaction reservation when + * the file system is very full. + */ + return xfs_bmap_btalloc_low_space(ap, args); } -#else - -#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) - -#endif /* * If we are not low on available data blocks and we are allocating at @@ -3801,8 +3801,11 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if ((ap->datatype & XFS_ALLOC_USERDATA) && - xfs_inode_is_filestream(ap->ip)) + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); + else if ((ap->datatype & XFS_ALLOC_USERDATA) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); else error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); @@ -4123,7 +4126,7 @@ retry: fdblocks = indlen; if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); if (error) goto out_unreserve_quota; } else { @@ -4158,7 +4161,7 @@ retry: out_unreserve_frextents: if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen)); + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); @@ -4177,43 +4180,6 @@ out: } static int -xfs_bmap_alloc_userdata( - struct xfs_bmalloca *bma) -{ - struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = xfs_bmapi_whichfork(bma->flags); - int error; - - /* - * Set the data type being allocated. For the data fork, the first data - * in the file is treated differently to all other allocations. For the - * attribute fork, we only need to ensure the allocated range is not on - * the busy list. 
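With the DEBUG-only wrapper gone, the exact-minlen path is selected purely by error injection inside xfs_bmap_btalloc(), so the allocator dispatch collapses to one chain (abridged from the hunk above):

	if (unlikely(XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
		error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
	else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
		 xfs_inode_is_filestream(ap->ip))
		error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align);
	else
		error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align);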
- */ - bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { - bma->datatype |= XFS_ALLOC_USERDATA; - if (bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; - - if (mp->m_dalign && bma->length >= mp->m_dalign) { - error = xfs_bmap_isaeof(bma, whichfork); - if (error) - return error; - } - - if (XFS_IS_REALTIME_INODE(bma->ip)) - return xfs_bmap_rtalloc(bma); - } - - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - return xfs_bmap_exact_minlen_extent_alloc(bma); - - return xfs_bmap_btalloc(bma); -} - -static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) { @@ -4230,15 +4196,32 @@ xfs_bmapi_allocate( else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) { - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) - error = xfs_bmap_exact_minlen_extent_alloc(bma); - else - error = xfs_bmap_btalloc(bma); - } else { - error = xfs_bmap_alloc_userdata(bma); + if (!(bma->flags & XFS_BMAPI_METADATA)) { + /* + * For the data and COW fork, the first data in the file is + * treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range + * is not on the busy list. + */ + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + + if (mp->m_dalign && bma->length >= mp->m_dalign) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + } } + + if ((bma->datatype & XFS_ALLOC_USERDATA) && + XFS_IS_REALTIME_INODE(bma->ip)) + error = xfs_bmap_rtalloc(bma); + else + error = xfs_bmap_btalloc(bma); if (error) return error; if (bma->blkno == NULLFSBLOCK) @@ -5086,7 +5069,7 @@ xfs_bmap_del_extent_delay( fdblocks = da_diff; if (isrt) - xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); else fdblocks += del->br_blockcount; @@ -5165,6 +5148,34 @@ xfs_bmap_del_extent_cow( ip->i_delayed_blks -= del->br_blockcount; } +static int +xfs_bmap_free_rtblocks( + struct xfs_trans *tp, + struct xfs_bmbt_irec *del) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = xfs_rtgroup_grab(tp->t_mountp, 0); + if (!rtg) + return -EIO; + + /* + * Ensure the bitmap and summary inodes are locked and joined to the + * transaction before modifying them. + */ + if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { + tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP); + } + + error = xfs_rtfree_blocks(tp, rtg, del->br_startblock, + del->br_blockcount); + xfs_rtgroup_rele(rtg); + return error; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space. @@ -5377,20 +5388,12 @@ xfs_bmap_del_extent_real( * If we need to, add to list of extents to delete. */ if (!(bflags & XFS_BMAPI_REMAP)) { + bool isrt = xfs_ifork_is_realtime(ip, whichfork); + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); - } else if (xfs_ifork_is_realtime(ip, whichfork)) { - /* - * Ensure the bitmap and summary inodes are locked - * and joined to the transaction before modifying them. 
- */ - if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) { - tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED; - xfs_rtbitmap_lock(mp); - xfs_rtbitmap_trans_join(tp); - } - error = xfs_rtfree_blocks(tp, del->br_startblock, - del->br_blockcount); + } else if (isrt && !xfs_has_rtgroups(mp)) { + error = xfs_bmap_free_rtblocks(tp, del); } else { unsigned int efi_flags = 0; @@ -5398,6 +5401,19 @@ xfs_bmap_del_extent_real( del->br_state == XFS_EXT_UNWRITTEN) efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD; + /* + * Historically, we did not use EFIs to free realtime + * extents. However, when reverse mapping is enabled, + * we must maintain the same order of operations as the + * data device, which is: Remove the file mapping, + * remove the reverse mapping, and then free the + * blocks. Reflink for realtime volumes requires the + * same sort of ordering. Both features rely on + * rtgroups, so let's gate rt EFI usage on rtgroups. + */ + if (isrt) + efi_flags |= XFS_FREE_EXTENT_REALTIME; + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, efi_flags); @@ -5746,6 +5762,8 @@ xfs_bunmapi( */ STATIC bool xfs_bmse_can_merge( + struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_bmbt_irec *got, /* current extent to shift */ xfs_fileoff_t shift) /* shift fsb */ @@ -5761,7 +5779,8 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) || + !xfs_bmap_same_rtgroup(ip, whichfork, left, got)) return false; return true; @@ -5797,7 +5816,7 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - ASSERT(xfs_bmse_can_merge(left, got, shift)); + ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift)); new = *left; new.br_blockcount = blockcount; @@ -5959,7 +5978,8 @@ xfs_bmap_collapse_extents( goto del_cursor; } - if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(ip, whichfork, &prev, &got, + offset_shift_fsb)) { error = xfs_bmse_merge(tp, ip, whichfork, offset_shift_fsb, &icur, &got, &prev, cur, &logflags); @@ -6095,7 +6115,8 @@ xfs_bmap_insert_extents( * never find mergeable extents in this scenario. Check anyways * and warn if we encounter two extents that could be one. 
*/ - if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb)) + if (xfs_bmse_can_merge(ip, whichfork, &got, &next, + offset_shift_fsb)) WARN_ON_ONCE(1); } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7592d46e97c6..4b721d935994 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -248,7 +248,7 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; - struct xfs_perag *bi_pag; + struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a5c4af148853..68ee1c299c25 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -225,7 +225,7 @@ __xfs_btree_check_agblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; xfs_agblock_t agbno; @@ -331,7 +331,7 @@ __xfs_btree_check_ptr( return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_AG: - if (!xfs_verify_agbno(cur->bc_ag.pag, + if (!xfs_verify_agbno(to_perag(cur->bc_group), be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; break; @@ -372,7 +372,7 @@ xfs_btree_check_ptr( case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_ops->name, + cur->bc_group->xg_gno, cur->bc_ops->name, level, index); break; } @@ -523,20 +523,8 @@ xfs_btree_del_cursor( ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_AG: - if (cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); - break; - case XFS_BTREE_TYPE_INODE: - /* nothing to do */ - break; - case XFS_BTREE_TYPE_MEM: - if (cur->bc_mem.pag) - xfs_perag_put(cur->bc_mem.pag); - break; - } - + if (cur->bc_group) + xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } @@ -1017,22 +1005,22 @@ xfs_btree_readahead_agblock( struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, left), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, left), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, right), - mp->m_bsize, cur->bc_ops->buf_ops); + xfs_agbno_to_daddr(pag, right), mp->m_bsize, + cur->bc_ops->buf_ops); rval++; } @@ -1091,7 +1079,7 @@ xfs_btree_ptr_to_daddr( switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, + *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), be32_to_cpu(ptr->s)); break; case XFS_BTREE_TYPE_INODE: @@ -1313,7 +1301,7 @@ xfs_btree_owner( case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; case XFS_BTREE_TYPE_AG: - return cur->bc_ag.pag->pag_agno; + return cur->bc_group->xg_gno; default: ASSERT(0); return 0; @@ -3569,14 +3557,31 @@ xfs_btree_insrec( xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* - * If we just inserted into a new tree block, we have to - * recalculate nkey here because nkey is out of date. 
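The btree cursor hunks above fold the per-type perag pointers (bc_ag.pag, bc_mem.pag) into a single bc_group reference, which also lets xfs_btree_del_cursor() drop its per-type switch. The resulting lifetime pattern, condensed:

	/* cursor setup: take a group reference instead of a perag one */
	cur->bc_group = xfs_group_hold(pag_group(pag));

	/* cursor teardown, now common to all cursor types */
	if (cur->bc_group)
		xfs_group_put(cur->bc_group);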
+ * Update btree keys to reflect the newly added record or keyptr. + * There are three cases here to be aware of. Normally, all we have to + * do is walk towards the root, updating keys as necessary. + * + * If the caller had us target a full block for the insertion, we dealt + * with that by calling the _make_block_unfull function. If the + * "make unfull" function splits the block, it'll hand us back the key + * and pointer of the new block. We haven't yet added the new block to + * the next level up, so if we decide to add the new record to the new + * block (bp->b_bn != old_bn), we have to update the caller's pointer + * so that the caller adds the new block with the correct key. * - * Otherwise we're just updating an existing block (having shoved - * some records into the new tree block), so use the regular key - * update mechanism. + * However, there is a third possibility-- if the selected block is the + * root block of an inode-rooted btree and cannot be expanded further, + * the "make unfull" function moves the root block contents to a new + * block and updates the root block to point to the new block. In this + * case, no block pointer is passed back because the block has already + * been added to the btree. In this case, we need to use the regular + * key update function, just like the first case. This is critical for + * overlapping btrees, because the high key must be updated to reflect + * the entire tree, not just the subtree accessible through the first + * child of the root (which is now two levels down from the root). */ - if (bp && xfs_buf_daddr(bp) != old_bn) { + if (!xfs_btree_ptr_is_null(cur, &nptr) && + bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -4745,7 +4750,7 @@ xfs_btree_agblock_v5hdr_verify( return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; - if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) return __this_address; return NULL; } @@ -5156,7 +5161,7 @@ xfs_btree_count_blocks_helper( int level, void *data) { - xfs_extlen_t *blocks = data; + xfs_filblks_t *blocks = data; (*blocks)++; return 0; @@ -5166,7 +5171,7 @@ xfs_btree_count_blocks_helper( int xfs_btree_count_blocks( struct xfs_btree_cur *cur, - xfs_extlen_t *blocks) + xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 10b7ddc3b2b3..c5bff273cae2 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -254,6 +254,7 @@ struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ + struct xfs_group *bc_group; /* per-type information */ union { @@ -264,13 +265,11 @@ struct xfs_btree_cur struct xbtree_ifakeroot *ifake; /* for staging cursor */ } bc_ino; struct { - struct xfs_perag *pag; struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; struct { struct xfbtree *xfbtree; - struct xfs_perag *pag; } bc_mem; }; @@ -485,7 +484,7 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); -int 
xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c index 036061fe32cc..df3d613675a1 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.c +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -57,10 +57,8 @@ xfbtree_dup_cursor( ncur->bc_flags = cur->bc_flags; ncur->bc_nlevels = cur->bc_nlevels; ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; - - if (cur->bc_mem.pag) - ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); - + if (cur->bc_group) + ncur->bc_group = xfs_group_hold(cur->bc_group); return ncur; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 16a529a88780..17d9e6154f19 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -593,9 +593,8 @@ xfs_da3_split( switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: error = xfs_attr3_leaf_split(state, oldblk, newblk); - if ((error != 0) && (error != -ENOSPC)) { + if (error < 0) return error; /* GROT: attr is inconsistent */ - } if (!error) { addblk = newblk; break; @@ -617,6 +616,8 @@ xfs_da3_split( error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } + if (error == 1) + return -ENOSPC; if (error) return error; /* GROT: attr inconsistent */ addblk = newblk; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 2cd212ad2c1d..5b377cbbb1f7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -846,6 +846,12 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!ops->finish_item) { + ASSERT(ops->finish_item != NULL); + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + return NULL; + } + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) dfp = xfs_defer_alloc(&tp->t_dfops, ops); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8b338031e487..ec51b8465e61 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -71,6 +71,7 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type; extern const struct xfs_defer_op_type xfs_attr_defer_type; extern const struct xfs_defer_op_type xfs_exchmaps_defer_type; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 15a362e2f5ea..dceef2abd4e2 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -16,6 +16,9 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_metadir.h" +#include "xfs_metafile.h" int xfs_calc_dquots_per_chunk( @@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts( return cpu_to_be32(t); } + +inline unsigned int +xfs_dqinode_sick_mask(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_SICK_FS_UQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_SICK_FS_GQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_SICK_FS_PQUOTA; + } + + ASSERT(0); + return 0; +} + +/* + * Load the inode for a given type of quota, assuming that the sb fields have + * been sorted out. This is not true when switching quota types on a V4 + * filesystem, so do not use this function for that. 
If metadir is enabled, + * @dp must be the /quota metadir. + * + * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on + * success; or a negative errno. + */ +int +xfs_dqinode_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type); + int error; + + if (!xfs_has_metadir(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_DQTYPE_USER: + ino = mp->m_sb.sb_uquotino; + break; + case XFS_DQTYPE_GROUP: + ino = mp->m_sb.sb_gquotino; + break; + case XFS_DQTYPE_PROJ: + ino = mp->m_sb.sb_pquotino; + break; + default: + ASSERT(0); + return -EFSCORRUPTED; + } + + /* Should have set 0 to NULLFSINO when loading superblock */ + if (ino == NULLFSINO) + return -ENOENT; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip); + } else { + error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type), + metafile_type, &ip); + if (error == -ENOENT) + return error; + } + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return error; + } + + if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) { + xfs_irele(ip); + xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type)); + return -EFSCORRUPTED; + } + + *ipp = ip; + return 0; +} + +/* Create a metadata directory quota inode. */ +int +xfs_dqinode_metadir_create( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + }; + int error; + + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + error = xfs_metadir_commit(&upd); + if (error) + return error; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; +} + +#ifndef __KERNEL__ +/* Link a metadata directory quota inode. */ +int +xfs_dqinode_metadir_link( + struct xfs_inode *dp, + xfs_dqtype_t type, + struct xfs_inode *ip) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .metafile_type = xfs_dqinode_metafile_type(type), + .path = xfs_dqinode_path(type), + .ip = ip, + }; + int error; + + error = xfs_metadir_start_link(&upd); + if (error) + return error; + + error = xfs_metadir_link(&upd); + if (error) + return error; + + xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE); + + return xfs_metadir_commit(&upd); +} +#endif /* __KERNEL__ */ + +/* Create the parent directory for all quota inodes and load it. */ +int +xfs_dqinode_mkdir_parent( + struct xfs_mount *mp, + struct xfs_inode **dpp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp); +} + +/* + * Load the parent directory of all quota inodes. Pass the inode to the caller + * because quota functions (e.g. QUOTARM) can be called on the quota files even + * if quotas are not enabled. 
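 *
 * Reviewer's sketch, not part of this patch: on a metadir filesystem a caller
 * that already holds a transaction @tp could chain the quota-inode helpers in
 * this file to reach, say, the user quota inode (further cleanup elided;
 * @tp is assumed, everything else is taken from the functions above/below):
 *
 *	struct xfs_inode	*dp, *ip;
 *	int			error;
 *
 *	error = xfs_dqinode_load_parent(tp, &dp);
 *	if (error)
 *		return error;
 *	error = xfs_dqinode_load(tp, dp, XFS_DQTYPE_USER, &ip);
 *	xfs_irele(dp);
 *	if (error)
 *		return error;	(-ENOENT means the quota inode does not exist)
 *	... use ip, then xfs_irele(ip) ...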
+ */ +int +xfs_dqinode_load_parent( + struct xfs_trans *tp, + struct xfs_inode **dpp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR, + dpp); +} diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e1bfee0c3b1a..4d47a3e723aa 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -174,6 +174,14 @@ typedef struct xfs_sb { xfs_lsn_t sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ + xfs_ino_t sb_metadirino; /* metadata directory tree root */ + + xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ + xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ + + uint8_t sb_rgblklog; /* rt group number shift */ + uint8_t sb_pad[7]; /* zeroes */ + /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -259,7 +267,19 @@ struct xfs_dsb { __be64 sb_lsn; /* last write sequence */ uuid_t sb_meta_uuid; /* metadata file system unique id */ - /* must be padded to 64 bit alignment */ + __be64 sb_metadirino; /* metadata directory tree root */ + __be32 sb_rgcount; /* # of realtime groups */ + __be32 sb_rgextents; /* size of rtgroup in rtx */ + + __u8 sb_rgblklog; /* rt group number shift */ + __u8 sb_pad[7]; /* zeroes */ + + /* + * The size of this structure must be padded to 64 bit alignment. + * + * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when + * adding new fields here. + */ }; #define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc) @@ -278,7 +298,7 @@ struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -287,12 +307,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) * Detect a mismatched features2 field. Older kernels read/wrote * this into the wrong slot, so to be safe we keep them in sync. 
*/ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp) { return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) || (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); @@ -342,8 +362,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) #define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL static inline bool xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_compat & feature) != 0; } @@ -360,8 +380,8 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_ro_compat & feature) != 0; } @@ -374,6 +394,7 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ +#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -382,13 +403,14 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \ XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ - XFS_SB_FEAT_INCOMPAT_PARENT) + XFS_SB_FEAT_INCOMPAT_PARENT | \ + XFS_SB_FEAT_INCOMPAT_METADIR) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_incompat & feature) != 0; } @@ -399,8 +421,8 @@ xfs_sb_has_incompat_feature( #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - uint32_t feature) + const struct xfs_sb *sbp, + uint32_t feature) { return (sbp->sb_features_log_incompat & feature) != 0; } @@ -420,7 +442,7 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } -static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp) { return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -694,21 +716,58 @@ struct xfs_agfl { /* * Realtime bitmap information is accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_rtword_raw { __u32 old; + __be32 rtg; }; /* * Realtime summary counts are accessed by the word, which is currently - * stored in host-endian format. + * stored in host-endian format. Starting with the realtime groups feature, + * the words are stored in be32 ondisk. */ union xfs_suminfo_raw { __u32 old; + __be32 rtg; }; /* + * Realtime allocation groups break the rt section into multiple pieces that + * could be locked independently. Realtime block group numbers are 32-bit + * quantities. Block numbers within a group are also 32-bit quantities, but + * the upper bit must never be set. 
rtgroup 0 might have a superblock in it, + * so the minimum size of an rtgroup is 2 rtx. + */ +#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1) +#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2) +#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U)) + +#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */ + +/* + * Realtime superblock - on disk version. Must be padded to 64 bit alignment. + * The first block of the realtime volume contains this superblock. + */ +struct xfs_rtsb { + __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */ + __le32 rsb_crc; /* superblock crc */ + + __be32 rsb_pad; /* zero */ + unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */ + + uuid_t rsb_uuid; /* user-visible file system unique id */ + uuid_t rsb_meta_uuid; /* metadata file system unique id */ + + /* must be padded to 64 bit alignment */ +}; + +#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc) +#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */ + +/* * XFS Timestamps * ============== * @@ -790,6 +849,27 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET; } +enum xfs_metafile_type { + XFS_METAFILE_UNKNOWN, /* unknown */ + XFS_METAFILE_DIR, /* metadir directory */ + XFS_METAFILE_USRQUOTA, /* user quota */ + XFS_METAFILE_GRPQUOTA, /* group quota */ + XFS_METAFILE_PRJQUOTA, /* project quota */ + XFS_METAFILE_RTBITMAP, /* rt bitmap */ + XFS_METAFILE_RTSUMMARY, /* rt summary */ + + XFS_METAFILE_MAX +} __packed; + +#define XFS_METAFILE_TYPE_STR \ + { XFS_METAFILE_UNKNOWN, "unknown" }, \ + { XFS_METAFILE_DIR, "dir" }, \ + { XFS_METAFILE_USRQUOTA, "usrquota" }, \ + { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ + { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ + { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ + { XFS_METAFILE_RTSUMMARY, "rtsummary" } + /* * On-disk inode structure. * @@ -812,7 +892,7 @@ struct xfs_dinode { __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ + __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */ __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ @@ -1088,21 +1168,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * Values for di_flags2 These start by being exposed to userspace in the upper * 16 bits of the XFS_XFLAG_s range. 
*/ -#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ -#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ -#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ -#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ -#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ +/* use DAX for this inode */ +#define XFS_DIFLAG2_DAX_BIT 0 + +/* file's blocks may be shared */ +#define XFS_DIFLAG2_REFLINK_BIT 1 -#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) -#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) -#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) -#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) +/* copy on write extent size hint */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 + +/* big timestamps */ +#define XFS_DIFLAG2_BIGTIME_BIT 3 + +/* large extent counters */ +#define XFS_DIFLAG2_NREXT64_BIT 4 + +/* + * The inode contains filesystem metadata and can be found through the metadata + * directory tree. Metadata inodes must satisfy the following constraints: + * + * - V5 filesystem (and ftype) are enabled; + * - The only valid modes are regular files and directories; + * - The access bits must be zero; + * - DMAPI event and state masks are zero; + * - The user and group IDs must be zero; + * - The project ID can be used as a u32 annotation; + * - The immutable, sync, noatime, nodump, nodefrag flags must be set. + * - The dax flag must not be set. + * - Directories must have nosymlinks set. + * + * These requirements are chosen defensively to minimize the ability of + * userspace to read or modify the contents, should a metadata file ever + * escape to userspace. + * + * There are further constraints on the directory tree itself: + * + * - Metadata inodes must never be resolvable through the root directory; + * - They must never be accessed by userspace; + * - Metadata directory entries must have correct ftype. + * + * Superblock-rooted metadata files must have the METADATA iflag set even + * though they do not have a parent directory. + */ +#define XFS_DIFLAG2_METADATA_BIT 5 + +#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT) +#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT) +#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1117,6 +1236,12 @@ static inline bool xfs_dinode_has_large_extent_counts( (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); } +static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1165,6 +1290,24 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ /* + * RT bit manipulation macros. 
+ */ +#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */ +#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */ + +struct xfs_rtbuf_blkinfo { + __be32 rt_magic; /* validity check on block */ + __be32 rt_crc; /* CRC of block */ + __be64 rt_owner; /* inode that owns the block */ + __be64 rt_blkno; /* first block of the buffer */ + __be64 rt_lsn; /* sequence number of last write */ + uuid_t rt_uuid; /* filesystem we belong to */ +}; + +#define XFS_RTBUF_CRC_OFF \ + offsetof(struct xfs_rtbuf_blkinfo, rt_crc) + +/* * Dquot and dquot block format definitions */ #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 860284064c5a..41ce4d3d650e 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -187,7 +187,9 @@ struct xfs_fsop_geom { __u32 logsunit; /* log stripe unit, bytes */ uint32_t sick; /* o: unhealthy fs & rt metadata */ uint32_t checked; /* o: checked fs & rt metadata */ - __u64 reserved[17]; /* reserved space */ + __u32 rgextents; /* rt extents in a realtime group */ + __u32 rgcount; /* number of realtime groups */ + __u64 reserved[16]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -198,6 +200,8 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ #define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ #define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */ +#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */ +#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -242,6 +246,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ /* * Minimum and maximum sizes need for growth checks. @@ -489,9 +494,17 @@ struct xfs_bulk_ireq { */ #define XFS_BULK_IREQ_NREXT64 (1U << 2) +/* + * Allow bulkstat to return information about metadata directories. This + * enables xfs_scrub to find them for scanning, as they are otherwise ordinary + * directories. + */ +#define XFS_BULK_IREQ_METADIR (1U << 3) + #define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ XFS_BULK_IREQ_SPECIAL | \ - XFS_BULK_IREQ_NREXT64) + XFS_BULK_IREQ_NREXT64 | \ + XFS_BULK_IREQ_METADIR) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -722,9 +735,11 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ #define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ #define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ +#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ +#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 29 +#define XFS_SCRUB_TYPE_NR 31 /* * This special type code only applies to the vectored scrub implementation. @@ -803,6 +818,22 @@ struct xfs_scrub_vec_head { #define XFS_SCRUB_VEC_FLAGS_ALL (0) /* + * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for + * path checking. + */ +#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? 
*/ +#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtrgroups metadir */ +#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */ +#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */ +#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */ +#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ +#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ +#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ + +/* Number of metapath sm_ino values */ +#define XFS_SCRUB_METAPATH_NR (8) + +/* * ioctl limits */ #ifdef XATTR_LIST_MAX @@ -949,6 +980,21 @@ struct xfs_getparents_by_handle { }; /* + * Output for XFS_IOC_RTGROUP_GEOMETRY + */ +struct xfs_rtgroup_geometry { + __u32 rg_number; /* i/o: rtgroup number */ + __u32 rg_length; /* o: length in blocks */ + __u32 rg_sick; /* o: sick things in ag */ + __u32 rg_checked; /* o: checked metadata in ag */ + __u32 rg_flags; /* i/o: flags for this ag */ + __u32 rg_reserved[27]; /* o: zero */ +}; +#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ +#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ +#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ + +/* * ioctl commands that are used by Linux filesystems */ #define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS @@ -986,6 +1032,7 @@ struct xfs_getparents_by_handle { #define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents) #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) +#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c new file mode 100644 index 000000000000..e9d76bcdc820 --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_extent_busy.h" +#include "xfs_group.h" + +/* + * Groups can have passive and active references. + * + * For passive references the code freeing a group is responsible for cleaning + * up objects that hold the passive references (e.g. cached buffers). + * Routines manipulating passive references are xfs_group_get, xfs_group_hold + * and xfs_group_put. + * + * Active references are for short term access to the group for walking trees or + * accessing state. If a group is being shrunk or offlined, the lookup will fail + * to find that group and return NULL instead. + * Routines manipulating active references are xfs_group_grab and + * xfs_group_rele. 
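 *
 * Reviewer's sketch, not part of this patch: the short-term pattern described
 * above is grab/rele; @gno and @type stand in for a concrete group index and
 * xfs_group_type value defined elsewhere in the series:
 *
 *	struct xfs_group	*xg;
 *
 *	xg = xfs_group_grab(mp, gno, type);
 *	if (!xg)
 *		return 0;	(group offline, being shrunk, or nonexistent)
 *	... walk trees, read state ...
 *	xfs_group_rele(xg);
 *
 * A long-lived cached object would instead take a passive reference with
 * xfs_group_hold() and drop it via xfs_group_put() when the object goes away.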
+ */ + +struct xfs_group * +xfs_group_get( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_get(xg, _RET_IP_); + ASSERT(atomic_read(&xg->xg_ref) >= 0); + atomic_inc(&xg->xg_ref); + } + rcu_read_unlock(); + return xg; +} + +struct xfs_group * +xfs_group_hold( + struct xfs_group *xg) +{ + ASSERT(atomic_read(&xg->xg_ref) > 0 || + atomic_read(&xg->xg_active_ref) > 0); + + trace_xfs_group_hold(xg, _RET_IP_); + atomic_inc(&xg->xg_ref); + return xg; +} + +void +xfs_group_put( + struct xfs_group *xg) +{ + trace_xfs_group_put(xg, _RET_IP_); + + ASSERT(atomic_read(&xg->xg_ref) > 0); + atomic_dec(&xg->xg_ref); +} + +struct xfs_group * +xfs_group_grab( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type) +{ + struct xfs_group *xg; + + rcu_read_lock(); + xg = xa_load(&mp->m_groups[type].xa, index); + if (xg) { + trace_xfs_group_grab(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +/* + * Iterate to the next group. To start the iteration at @start_index, a %NULL + * @xg is passed, else the previous group returned from this function. The + * caller should break out of the loop when this returns %NULL. If the caller + * wants to break out of a loop that did not finish it needs to release the + * active reference to @xg using xfs_group_rele() itself. + */ +struct xfs_group * +xfs_group_next_range( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t start_index, + uint32_t end_index, + enum xfs_group_type type) +{ + uint32_t index = start_index; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + if (index > end_index) + return NULL; + return xfs_group_grab(mp, index, type); +} + +/* + * Find the next group after @xg, or the first group if @xg is NULL. 
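 *
 * Reviewer's sketch, not part of this patch: this iterator and
 * xfs_group_next_range() above follow the same protocol: pass NULL to start,
 * pass the previously returned group to continue, and release the active
 * reference yourself only when bailing out early. Roughly, with
 * do_something() as a placeholder callback:
 *
 *	struct xfs_group	*xg = NULL;
 *	int			error = 0;
 *
 *	while ((xg = xfs_group_next_range(mp, xg, 0, end, type)) != NULL) {
 *		error = do_something(xg);
 *		if (error) {
 *			xfs_group_rele(xg);
 *			break;
 *		}
 *	}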
+ */ +struct xfs_group * +xfs_group_grab_next_mark( + struct xfs_mount *mp, + struct xfs_group *xg, + xa_mark_t mark, + enum xfs_group_type type) +{ + unsigned long index = 0; + + if (xg) { + index = xg->xg_gno + 1; + xfs_group_rele(xg); + } + + rcu_read_lock(); + xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark); + if (xg) { + trace_xfs_group_grab_next_tag(xg, _RET_IP_); + if (!atomic_inc_not_zero(&xg->xg_active_ref)) + xg = NULL; + } + rcu_read_unlock(); + return xg; +} + +void +xfs_group_rele( + struct xfs_group *xg) +{ + trace_xfs_group_rele(xg, _RET_IP_); + atomic_dec(&xg->xg_active_ref); +} + +void +xfs_group_free( + struct xfs_mount *mp, + uint32_t index, + enum xfs_group_type type, + void (*uninit)(struct xfs_group *xg)) +{ + struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index); + + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0); + + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + + if (uninit) + uninit(xg); + + /* drop the mount's active reference */ + xfs_group_rele(xg); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + kfree_rcu_mightsleep(xg); +} + +int +xfs_group_insert( + struct xfs_mount *mp, + struct xfs_group *xg, + uint32_t index, + enum xfs_group_type type) +{ + int error; + + xg->xg_mount = mp; + xg->xg_gno = index; + xg->xg_type = type; + +#ifdef __KERNEL__ + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + spin_lock_init(&xg->xg_state_lock); + xfs_hooks_init(&xg->xg_rmap_update_hooks); +#endif + xfs_defer_drain_init(&xg->xg_intents_drain); + + /* Active ref owned by mount indicates group is online. */ + atomic_set(&xg->xg_active_ref, 1); + + error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL); + if (error) { + WARN_ON_ONCE(error == -EBUSY); + goto out_drain; + } + + return 0; +out_drain: + xfs_defer_drain_free(&xg->xg_intents_drain); +#ifdef __KERNEL__ + kfree(xg->xg_busy_extents); +#endif + return error; +} + +struct xfs_group * +xfs_group_get_by_fsb( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type); +} diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h new file mode 100644 index 000000000000..242b05627c7a --- /dev/null +++ b/fs/xfs/libxfs/xfs_group.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + */ +#ifndef __LIBXFS_GROUP_H +#define __LIBXFS_GROUP_H 1 + +struct xfs_group { + struct xfs_mount *xg_mount; + uint32_t xg_gno; + enum xfs_group_type xg_type; + atomic_t xg_ref; /* passive reference count */ + atomic_t xg_active_ref; /* active reference count */ + + /* Precalculated geometry info */ + uint32_t xg_block_count; /* max usable gbno */ + uint32_t xg_min_gbno; /* min usable gbno */ + +#ifdef __KERNEL__ + /* -- kernel only structures below this line -- */ + + /* + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold xg_state_lock before accessing this field. + */ + uint16_t xg_checked; + uint16_t xg_sick; + spinlock_t xg_state_lock; + + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. 
+ * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain xg_intents_drain; + + /* + * Hook to feed rmapbt updates to an active online repair. + */ + struct xfs_hooks xg_rmap_update_hooks; +#endif /* __KERNEL__ */ +}; + +struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +struct xfs_group *xfs_group_hold(struct xfs_group *xg); +void xfs_group_put(struct xfs_group *xg); + +struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type); +struct xfs_group *xfs_group_next_range(struct xfs_mount *mp, + struct xfs_group *xg, uint32_t start_index, uint32_t end_index, + enum xfs_group_type type); +struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp, + struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type); +void xfs_group_rele(struct xfs_group *xg); + +void xfs_group_free(struct xfs_mount *mp, uint32_t index, + enum xfs_group_type type, void (*uninit)(struct xfs_group *xg)); +int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg, + uint32_t index, enum xfs_group_type); + +#define xfs_group_set_mark(_xg, _mark) \ + xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_clear_mark(_xg, _mark) \ + xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \ + (_xg)->xg_gno, (_mark)) +#define xfs_group_marked(_mp, _type, _mark) \ + xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark)) + +static inline xfs_agblock_t +xfs_group_max_blocks( + struct xfs_group *xg) +{ + return xg->xg_mount->m_groups[xg->xg_type].blocks; +} + +static inline xfs_fsblock_t +xfs_group_start_fsb( + struct xfs_group *xg) +{ + return ((xfs_fsblock_t)xg->xg_gno) << + xg->xg_mount->m_groups[xg->xg_type].blklog; +} + +static inline xfs_fsblock_t +xfs_gbno_to_fsb( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + return xfs_group_start_fsb(xg) | gbno; +} + +static inline xfs_daddr_t +xfs_gbno_to_daddr( + struct xfs_group *xg, + xfs_agblock_t gbno) +{ + struct xfs_mount *mp = xg->xg_mount; + uint32_t blocks = mp->m_groups[xg->xg_type].blocks; + + return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); +} + +static inline uint32_t +xfs_fsb_to_gno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + if (!mp->m_groups[type].blklog) + return 0; + return fsbno >> mp->m_groups[type].blklog; +} + +static inline xfs_agblock_t +xfs_fsb_to_gbno( + struct xfs_mount *mp, + xfs_fsblock_t fsbno, + enum xfs_group_type type) +{ + return fsbno & mp->m_groups[type].blkmask; +} + +static inline bool +xfs_verify_gbno( + struct xfs_group *xg, + uint32_t gbno) +{ + if (gbno >= xg->xg_block_count) + return false; + if (gbno < xg->xg_min_gbno) + return false; + return true; +} + +static inline bool +xfs_verify_gbext( + struct xfs_group *xg, + uint32_t gbno, + uint32_t glen) +{ + uint32_t end; + + if (!xfs_verify_gbno(xg, gbno)) + return false; + if (glen == 0 || check_add_overflow(gbno, glen - 1, &end)) + return false; + if (!xfs_verify_gbno(xg, end)) + return false; + return true; +} + +#endif /* __LIBXFS_GROUP_H */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index b0edb4288e59..d34986ac18c3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -6,6 +6,8 @@ #ifndef 
__XFS_HEALTH_H__ #define __XFS_HEALTH_H__ +struct xfs_group; + /* * In-Core Filesystem Health Assessments * ===================================== @@ -52,6 +54,7 @@ struct xfs_inode; struct xfs_fsop_geom; struct xfs_btree_cur; struct xfs_da_args; +struct xfs_rtgroup; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ @@ -60,10 +63,13 @@ struct xfs_da_args; #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ #define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ #define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */ +#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */ +#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */ -/* Observable health issues for realtime volume metadata. */ -#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ -#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ +/* Observable health issues for realtime group metadata. */ +#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ +#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ +#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ /* Observable health issues for AG metadata. */ #define XFS_SICK_AG_SB (1 << 0) /* superblock */ @@ -103,10 +109,13 @@ struct xfs_da_args; XFS_SICK_FS_GQUOTA | \ XFS_SICK_FS_PQUOTA | \ XFS_SICK_FS_QUOTACHECK | \ - XFS_SICK_FS_NLINKS) + XFS_SICK_FS_NLINKS | \ + XFS_SICK_FS_METADIR | \ + XFS_SICK_FS_METAPATH) -#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ - XFS_SICK_RT_SUMMARY) +#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ + XFS_SICK_RG_BITMAP | \ + XFS_SICK_RG_SUMMARY) #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ XFS_SICK_AG_AGF | \ @@ -136,26 +145,26 @@ struct xfs_da_args; /* Secondary state related to (but not primary evidence of) health problems. */ #define XFS_SICK_FS_SECONDARY (0) -#define XFS_SICK_RT_SECONDARY (0) +#define XFS_SICK_RG_SECONDARY (0) #define XFS_SICK_AG_SECONDARY (0) #define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) /* Evidence of health problems elsewhere. */ #define XFS_SICK_FS_INDIRECT (0) -#define XFS_SICK_RT_INDIRECT (0) +#define XFS_SICK_RG_INDIRECT (0) #define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) #define XFS_SICK_INO_INDIRECT (0) /* All health masks. 
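 *
 * Reviewer's sketch, not part of this patch: with the per-group state, AGs and
 * rt groups now share one set of helpers. Assuming @rtg is a struct
 * xfs_rtgroup pointer and @pag a struct xfs_perag pointer:
 *
 *	xfs_group_mark_sick(rtg_group(rtg), XFS_SICK_RG_BITMAP);
 *	if (xfs_rtgroup_has_sickness(rtg, XFS_SICK_RG_PRIMARY))
 *		... report or schedule a repair ...
 *
 *	xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
 *
 * both resolve to the same xfs_group_* calls via rtg_group()/pag_group().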
*/ -#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ XFS_SICK_FS_SECONDARY | \ XFS_SICK_FS_INDIRECT) -#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \ - XFS_SICK_RT_SECONDARY | \ - XFS_SICK_RT_INDIRECT) +#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \ + XFS_SICK_RG_SECONDARY | \ + XFS_SICK_RG_INDIRECT) -#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ XFS_SICK_AG_SECONDARY | \ XFS_SICK_AG_INDIRECT) @@ -189,18 +198,17 @@ void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); -void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); -void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, - unsigned int *checked); +void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno, + unsigned int mask); void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int mask); -void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); -void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, +void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask); +#define xfs_ag_mark_sick(pag, mask) \ + xfs_group_mark_sick(pag_group(pag), (mask)) +void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask); +void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask); +void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); @@ -227,22 +235,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) } static inline bool -xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +xfs_group_has_sickness( + struct xfs_group *xg, + unsigned int mask) { - unsigned int sick, checked; + unsigned int sick, checked; - xfs_rt_measure_sickness(mp, &sick, &checked); + xfs_group_measure_sickness(xg, &sick, &checked); return sick & mask; } -static inline bool -xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) -{ - unsigned int sick, checked; +#define xfs_ag_has_sickness(pag, mask) \ + xfs_group_has_sickness(pag_group(pag), (mask)) +#define xfs_ag_is_healthy(pag) \ + (!xfs_ag_has_sickness((pag), UINT_MAX)) - xfs_ag_measure_sickness(pag, &sick, &checked); - return sick & mask; -} +#define xfs_rtgroup_has_sickness(rtg, mask) \ + xfs_group_has_sickness(rtg_group(rtg), (mask)) +#define xfs_rtgroup_is_healthy(rtg) \ + (!xfs_rtgroup_has_sickness((rtg), UINT_MAX)) static inline bool xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) @@ -260,18 +271,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp) } static inline bool -xfs_rt_is_healthy(struct xfs_mount *mp) -{ - return !xfs_rt_has_sickness(mp, -1U); -} - -static inline bool -xfs_ag_is_healthy(struct xfs_perag *pag) -{ - return !xfs_ag_has_sickness(pag, -1U); -} - -static inline bool xfs_inode_is_healthy(struct xfs_inode *ip) { return !xfs_inode_has_sickness(ip, -1U); @@ -279,6 +278,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip) void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void 
xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); #define xfs_metadata_is_sick(error) \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 271855227514..f3a840a425f5 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -142,7 +142,7 @@ xfs_inobt_complain_bad_rec( xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", - cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); + cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -170,7 +170,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -275,8 +275,10 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!xfs_is_shutdown(cur->bc_mp)) - ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); + if (!xfs_is_shutdown(cur->bc_mp)) { + ASSERT(freecount == + to_perag(cur->bc_group)->pagi_freecount); + } } return 0; } @@ -551,7 +553,7 @@ xfs_inobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -606,15 +608,12 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(pag, &rec, nrec); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(pag, nrec); error = xfs_inobt_rec_check_count(mp, nrec); if (error) @@ -648,7 +647,7 @@ xfs_finobt_insert_sprec( struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; @@ -768,8 +767,7 @@ xfs_ialloc_ag_alloc( /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - args.agbno)); + xfs_agbno_to_fsb(pag, args.agbno)); if (error) return error; @@ -811,8 +809,8 @@ xfs_ialloc_ag_alloc( */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -824,8 +822,8 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; } @@ -855,13 +853,14 @@ sparse_alloc: * the end of the AG. 
*/ args.min_agbno = args.mp->m_sb.sb_inoalignmt; - args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.max_agbno = round_down(xfs_ag_block_count(args.mp, + pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, pag->pag_agno, - be32_to_cpu(agi->agi_root))); + xfs_agbno_to_fsb(pag, + be32_to_cpu(agi->agi_root))); if (error) return error; @@ -884,7 +883,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) @@ -915,8 +914,7 @@ sparse_alloc: if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, pag->pag_agno, - rec.ir_startino), + xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } @@ -1076,7 +1074,7 @@ xfs_dialloc_check_ino( if (error) return -EAGAIN; - error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp); + error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); if (error) return -EAGAIN; @@ -1127,7 +1125,7 @@ xfs_dialloc_ag_inobt( /* * If in the same AG as the parent, try to get near the parent. */ - if (pagno == pag->pag_agno) { + if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1335,7 +1333,7 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); @@ -1604,7 +1602,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ - if (pag->pag_agno == pagno) + if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1616,7 +1614,7 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); + ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); @@ -1845,6 +1843,40 @@ out_release: } /* + * Pick an AG for the new inode. + * + * Directories, symlinks, and regular files frequently allocate at least one + * block, so factor that potential expansion when we examine whether an AG has + * enough space for file creation. Try to keep metadata files all in the same + * AG. 
+ */ +static inline xfs_agnumber_t +xfs_dialloc_pick_ag( + struct xfs_mount *mp, + struct xfs_inode *dp, + umode_t mode) +{ + xfs_agnumber_t start_agno; + + if (!dp) + return 0; + if (xfs_is_metadir_inode(dp)) { + if (mp->m_sb.sb_logstart) + return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); + return 0; + } + + if (S_ISDIR(mode)) + return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; + + start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + + return start_agno; +} + +/* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to @@ -1859,31 +1891,19 @@ xfs_dialloc( xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_ino_t ino = NULLFSINO; xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; - umode_t mode = args->mode & S_IFMT; xfs_agnumber_t agno; - int error = 0; xfs_agnumber_t start_agno; - struct xfs_perag *pag; - struct xfs_ino_geometry *igeo = M_IGEO(mp); + umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; bool low_space = false; int flags; - xfs_ino_t ino = NULLFSINO; + int error = 0; - /* - * Directories, symlinks, and regular files frequently allocate at least - * one block, so factor that potential expansion when we examine whether - * an AG has enough space for file creation. - */ - if (S_ISDIR(mode)) - start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) % - mp->m_maxagi; - else { - start_agno = XFS_INO_TO_AGNO(mp, parent); - if (start_agno >= mp->m_maxagi) - start_agno = 0; - } + start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear @@ -1974,7 +1994,7 @@ retry: static int xfs_difree_inode_chunk( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; @@ -1988,8 +2008,7 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - return xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, sagbno), + return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); } @@ -2035,9 +2054,9 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); + error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), + contigblk, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE, 0); if (error) return error; @@ -2059,7 +2078,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2124,8 +2143,7 @@ xfs_difree_inobt( if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - rec.ir_startino); + xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2148,7 +2166,7 @@ xfs_difree_inobt( goto error0; } - error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag, &rec); if (error) goto error0; } else 
{ @@ -2194,7 +2212,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; @@ -2317,24 +2335,24 @@ xfs_difree( /* * Break up inode number into its components. */ - if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { - xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", - __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); + if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + if (inode != xfs_agino_to_ino(pag, agino)) { + xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { + xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", + __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } @@ -2380,7 +2398,7 @@ xfs_imap_lookup( xfs_agblock_t *offset_agbno, int flags) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; @@ -2391,7 +2409,7 @@ xfs_imap_lookup( if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, pag->pag_agno); + __func__, error, pag_agno(pag)); return error; } @@ -2441,7 +2459,7 @@ xfs_imap( struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ @@ -2457,8 +2475,8 @@ xfs_imap( */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || + ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* @@ -2467,17 +2485,18 @@ xfs_imap( */ if (flags & XFS_IGET_UNTRUSTED) return error; - if (agbno >= mp->m_sb.sb_agblocks) { + if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, - (unsigned long)mp->m_sb.sb_agblocks); + (unsigned long)xfs_ag_block_count(mp, + pag_agno(pag))); } - if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, - "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); + xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ @@ -2507,7 +2526,7 @@ 
xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2537,7 +2556,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); + imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2733,13 +2752,13 @@ xfs_read_agi( xfs_buf_flags_t flags, struct xfs_buf **agibpp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; - trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); @@ -2767,7 +2786,7 @@ xfs_ialloc_read_agi( struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); + trace_xfs_ialloc_read_agi(pag); error = xfs_read_agi(pag, tp, (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, @@ -2787,7 +2806,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag->pag_mount)); + xfs_is_shutdown(pag_mount(pag))); if (agibpp) *agibpp = agibp; else @@ -2887,7 +2906,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); @@ -3126,13 +3145,13 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_has_sparseinodes(pag->pag_mount)) + if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. 
*/ - agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); + agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 401b42d52af6..6f270d8f4270 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -37,7 +37,7 @@ STATIC struct xfs_btree_cur * xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -45,7 +45,7 @@ STATIC struct xfs_btree_cur * xfs_finobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp, cur->bc_ag.agbp); } @@ -112,7 +112,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_INOBT; args.minlen = 1; args.maxlen = 1; @@ -120,7 +120,7 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno)); + xfs_agbno_to_fsb(args.pag, sbno)); if (error) return error; @@ -248,7 +248,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -260,7 +260,8 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; } @@ -478,12 +479,12 @@ xfs_inobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -504,12 +505,12 @@ xfs_finobt_init_cursor( struct xfs_trans *tp, struct xfs_buf *agbp) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; @@ -715,8 +716,8 @@ static xfs_extlen_t xfs_inobt_max_size( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; - xfs_agblock_t agblocks = pag->block_count; + struct xfs_mount *mp = pag_mount(pag); + xfs_agblock_t agblocks = pag_group(pag)->xg_block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -727,7 +728,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. 
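[Editor's aside, not part of the patch: these hunks consistently replace direct pag->pag_mount / pag->pag_agno dereferences and the XFS_AGB_TO_FSB()/XFS_AGB_TO_DADDR() macros with the pag_mount(), pag_agno(), xfs_agbno_to_fsb() and xfs_agbno_to_daddr() helpers. A minimal illustrative sketch of the new calling style follows; xfs_example_agbno_report() is an invented name, while the helpers are the ones used in the hunks above.]

static void
xfs_example_agbno_report(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno)
{
	struct xfs_mount	*mp = pag_mount(pag);	/* was pag->pag_mount */
	xfs_agnumber_t		agno = pag_agno(pag);	/* was pag->pag_agno */
	xfs_fsblock_t		fsbno = xfs_agbno_to_fsb(pag, agbno);
	xfs_daddr_t		daddr = xfs_agbno_to_daddr(pag, agbno);

	/* Purely illustrative: report the translations of one AG block. */
	xfs_warn(mp, "agno %u agbno 0x%x fsbno 0x%llx daddr 0x%llx",
			agno, agbno,
			(unsigned long long)fsbno,
			(unsigned long long)daddr);
}

[End of aside.]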
*/ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -743,6 +744,7 @@ xfs_finobt_count_blocks( { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; + xfs_filblks_t blocks; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); @@ -750,9 +752,10 @@ xfs_finobt_count_blocks( return error; cur = xfs_finobt_init_cursor(pag, tp, agbp); - error = xfs_btree_count_blocks(cur, tree_blocks); + error = xfs_btree_count_blocks(cur, &blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); + *tree_blocks = blocks; return error; } @@ -791,10 +794,10 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_has_finobt(pag->pag_mount)) + if (!xfs_has_finobt(pag_mount(pag))) return 0; - if (xfs_has_inobtcounts(pag->pag_mount)) + if (xfs_has_inobtcounts(pag_mount(pag))) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_finobt_count_blocks(pag, tp, &tree_len); diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 79babeac9d75..424861fbf1bd 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -19,6 +19,7 @@ #include "xfs_ialloc.h" #include "xfs_dir2.h" #include "xfs_health.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -209,12 +210,15 @@ xfs_inode_from_disk( * They will also be unconditionally written back to disk as v2 inodes. */ if (unlikely(from->di_version == 1)) { - set_nlink(inode, be16_to_cpu(from->di_onlink)); + /* di_metatype used to be di_onlink */ + set_nlink(inode, be16_to_cpu(from->di_metatype)); ip->i_projid = 0; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | be16_to_cpu(from->di_projid_lo); + if (xfs_dinode_is_metadir(from)) + ip->i_metatype = be16_to_cpu(from->di_metatype); } i_uid_write(inode, be32_to_cpu(from->di_uid)); @@ -315,7 +319,10 @@ xfs_inode_to_disk( struct inode *inode = VFS_I(ip); to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - to->di_onlink = 0; + if (xfs_is_metadir_inode(ip)) + to->di_metatype = cpu_to_be16(ip->i_metatype); + else + to->di_metatype = 0; to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); @@ -483,6 +490,69 @@ xfs_dinode_verify_nrext64( return NULL; } +/* + * Validate all the picky requirements we have for a file that claims to be + * filesystem metadata. + */ +xfs_failaddr_t +xfs_dinode_verify_metadir( + struct xfs_mount *mp, + struct xfs_dinode *dip, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + if (!xfs_has_metadir(mp)) + return __this_address; + + /* V5 filesystem only */ + if (dip->di_version < 3) + return __this_address; + + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + return __this_address; + + /* V3 inode fields that are always zero */ + if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) + return __this_address; + if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) + return __this_address; + + /* Metadata files can only be directories or regular files */ + if (!S_ISDIR(mode) && !S_ISREG(mode)) + return __this_address; + + /* They must have zero access permissions */ + if (mode & 0777) + return __this_address; + + /* DMAPI event and state masks are zero */ + if (dip->di_dmevmask || dip->di_dmstate) + return __this_address; + + /* + * User and group IDs must be zero. The project ID is used for + * grouping inodes. 
Metadata inodes are never accounted to quotas. + */ + if (dip->di_uid || dip->di_gid) + return __this_address; + + /* Mandatory inode flags must be set */ + if (S_ISDIR(mode)) { + if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) + return __this_address; + } else { + if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) + return __this_address; + } + + /* dax flags2 must not be set */ + if (flags2 & XFS_DIFLAG2_DAX) + return __this_address; + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -523,8 +593,11 @@ xfs_dinode_verify( * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with * di_onlink==0, so we can check that. */ - if (dip->di_version >= 2) { - if (dip->di_onlink) + if (dip->di_version == 2) { + if (dip->di_metatype) + return __this_address; + } else if (dip->di_version >= 3) { + if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) return __this_address; } @@ -546,7 +619,8 @@ xfs_dinode_verify( if (dip->di_nlink) return __this_address; } else { - if (dip->di_onlink) + /* di_metatype used to be di_onlink */ + if (dip->di_metatype) return __this_address; } } @@ -663,6 +737,12 @@ xfs_dinode_verify( !xfs_has_bigtime(mp)) return __this_address; + if (flags2 & XFS_DIFLAG2_METADATA) { + fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); + if (fa) + return fa; + } + return NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 585ed5a110af..8d43d2641c73 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp, + struct xfs_dinode *dip, uint16_t mode, uint16_t flags, + uint64_t flags2); xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, uint32_t extsize, uint16_t mode, uint16_t flags); xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index cc38e1c3c3e1..deb0b7c00a1f 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -224,6 +224,8 @@ xfs_inode_inherit_flags2( } if (pip->i_diflags2 & XFS_DIFLAG2_DAX) ip->i_diflags2 |= XFS_DIFLAG2_DAX; + if (xfs_is_metadir_inode(pip)) + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; /* Don't let invalid cowextsize hints propagate. 
*/ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, @@ -442,8 +444,8 @@ xfs_iunlink_update_bucket( ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); - trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, - old_value, new_agino); + trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value, + new_agino); /* * We should never find the head of the list already set to the value diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 3e6682ed656b..15dec19b6c32 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -248,6 +248,8 @@ typedef struct xfs_trans_header { #define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_XMI 0x1248 /* mapping exchange intent */ #define XFS_LI_XMD 0x1249 /* mapping exchange done */ +#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ +#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -267,7 +269,9 @@ typedef struct xfs_trans_header { { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \ { XFS_LI_XMI, "XFS_LI_XMI" }, \ - { XFS_LI_XMD, "XFS_LI_XMD" } + { XFS_LI_XMD, "XFS_LI_XMD" }, \ + { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" } /* * Inode Log Item Format definitions. @@ -404,7 +408,7 @@ struct xfs_log_dinode { uint16_t di_mode; /* mode and type of file */ int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ - uint8_t di_pad3[2]; /* unused in v2/3 inodes */ + uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ uint32_t di_nlink; /* number of links to file */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 521d327e4c89..5397a8ff004d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -77,6 +77,8 @@ extern const struct xlog_recover_item_ops xlog_attri_item_ops; extern const struct xlog_recover_item_ops xlog_attrd_item_ops; extern const struct xlog_recover_item_ops xlog_xmi_item_ops; extern const struct xlog_recover_item_ops xlog_xmd_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; +extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; /* * Macros, structures, prototypes for internal log manager use. diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c new file mode 100644 index 000000000000..bae7377c0f22 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_bmap_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_trans_space.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_parent.h" +#include "xfs_health.h" + +/* + * Metadata Directory Tree + * ======================= + * + * These functions provide an abstraction layer for looking up, creating, and + * deleting metadata inodes that live within a special metadata directory tree. + * + * This code does not manage the five existing metadata inodes: real time + * bitmap & summary; and the user, group, and quotas. All other metadata + * inodes must use only the xfs_meta{dir,file}_* functions. + * + * Callers wishing to create or hardlink a metadata inode must create an + * xfs_metadir_update structure, call the appropriate xfs_metadir* function, + * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel + * the update. Files in the metadata directory tree currently cannot be + * unlinked. + * + * When the metadir feature is enabled, all metadata inodes must have the + * "metadata" inode flag set to prevent them from being exposed to the outside + * world. + * + * Callers must take the ILOCK of any inode in the metadata directory tree to + * synchronize access to that inode. It is never necessary to take the IOLOCK + * or the MMAPLOCK since metadata inodes must not be exposed to user space. + */ + +static inline void +xfs_metadir_set_xname( + struct xfs_name *xname, + const char *path, + unsigned char ftype) +{ + xname->name = (const unsigned char *)path; + xname->len = strlen(path); + xname->type = ftype; +} + +/* + * Given a parent directory @dp and a metadata inode path component @xname, + * Look up the inode number in the directory, returning it in @ino. + * @xname.type must match the directory entry's ftype. + * + * Caller must hold ILOCK_EXCL. + */ +static inline int +xfs_metadir_lookup( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_name *xname, + xfs_ino_t *ino) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args = { + .trans = tp, + .dp = dp, + .geo = mp->m_dir_geo, + .name = xname->name, + .namelen = xname->len, + .hashval = xfs_dir2_hashname(mp, xname), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + .owner = dp->i_ino, + }; + int error; + + if (!S_ISDIR(VFS_I(dp)->i_mode)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_dir_lookup_args(&args); + if (error) + return error; + + if (!xfs_verify_ino(mp, args.inumber)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + trace_xfs_metadir_lookup(dp, xname, args.inumber); + *ino = args.inumber; + return 0; +} + +/* + * Look up and read a metadata inode from the metadata directory. If the path + * component doesn't exist, return -ENOENT. 
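[Editor's aside, not part of the patch: the overview comment above describes the caller-facing lifecycle for creating a metadata inode (fill in an xfs_metadir_update, call the xfs_metadir_* helpers, then commit or cancel). A condensed caller-side sketch of that create path, modelled directly on xfs_metadir_mkdir() later in this file; the wrapper name here is invented.]

static int
xfs_example_metadir_mkdir(
	struct xfs_inode	*dp,	/* parent dir in the metadir tree */
	const char		*path,	/* final path component to create */
	struct xfs_inode	**ipp)
{
	struct xfs_metadir_update upd = {
		.dp		= dp,
		.path		= path,
		.metafile_type	= XFS_METAFILE_DIR,
	};
	int			error;

	/* Allocate a transaction and take the parent ILOCK. */
	error = xfs_metadir_start_create(&upd);
	if (error)
		return error;

	/* Allocate the child directory and add it to the parent. */
	error = xfs_metadir_create(&upd, S_IFDIR);
	if (error)
		goto out_cancel;

	/* Commit; upd.ip comes back referenced with its ILOCK dropped. */
	error = xfs_metadir_commit(&upd);
	if (error)
		goto out_irele;

	xfs_finish_inode_setup(upd.ip);
	*ipp = upd.ip;
	return 0;

out_cancel:
	xfs_metadir_cancel(&upd, error);
out_irele:
	/* xfs_metadir_create() may hand back an inode even on failure. */
	if (upd.ip) {
		xfs_finish_inode_setup(upd.ip);
		xfs_irele(upd.ip);
	}
	return error;
}

[End of aside.]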
+ */ +int +xfs_metadir_load( + struct xfs_trans *tp, + struct xfs_inode *dp, + const char *path, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_name xname; + xfs_ino_t ino; + int error; + + xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN); + + xfs_ilock(dp, XFS_ILOCK_EXCL); + error = xfs_metadir_lookup(tp, dp, &xname, &ino); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (error) + return error; + return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); +} + +/* + * Unlock and release resources after committing (or cancelling) a metadata + * directory tree operation. The caller retains its reference to @upd->ip + * and must release it explicitly. + */ +static inline void +xfs_metadir_teardown( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_teardown(upd, error); + + if (upd->ppargs) { + xfs_parent_finish(upd->dp->i_mount, upd->ppargs); + upd->ppargs = NULL; + } + + if (upd->ip) { + if (upd->ip_locked) + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + upd->ip_locked = false; + } + + if (upd->dp_locked) + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + upd->dp_locked = false; +} + +/* + * Begin the process of creating a metadata file by allocating transactions + * and taking whatever resources we're going to need. + */ +int +xfs_metadir_start_create( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip == NULL); + ASSERT(xfs_has_metadir(mp)); + ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + /* + * If we ever need the ability to create rt metadata files on a + * pre-metadir filesystem, we'll need to dqattach the parent here. + * Currently we assume that mkfs will create the files and quotacheck + * will account for them. + */ + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp); + if (error) + goto out_teardown; + + /* + * Lock the parent directory if there is one. We can't ijoin it to + * the transaction until after the child file has been created. + */ + xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + upd->dp_locked = true; + + trace_xfs_metadir_start_create(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Create a metadata inode with the given @mode, and insert it into the + * metadata directory tree at the given @upd->path. The path up to the final + * component must already exist. The final path component must not exist. + * + * The new metadata inode will be attached to the update structure @upd->ip, + * with the ILOCK held until the caller releases it. + * + * NOTE: This function may return a new inode to the caller even if it returns + * a negative error code. If an inode is passed back, the caller must finish + * setting up the inode before releasing it. + */ +int +xfs_metadir_create( + struct xfs_metadir_update *upd, + umode_t mode) +{ + struct xfs_icreate_args args = { + .pip = upd->dp, + .mode = mode, + }; + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + + /* Check that the name does not already exist in the directory. 
*/ + xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dialloc(&upd->tp, &args, &ino); + if (error) + return error; + error = xfs_icreate(upd->tp, ino, &args, &upd->ip); + if (error) + return error; + du.ip = upd->ip; + xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type); + upd->ip_locked = true; + + /* + * Join the directory inode to the transaction. We do not do it + * earlier because xfs_dialloc rolls the transaction. + */ + xfs_trans_ijoin(upd->tp, upd->dp, 0); + + /* Create the entry. */ + if (S_ISDIR(args.mode)) + resblks = xfs_mkdir_space_res(mp, xname.len); + else + resblks = xfs_create_space_res(mp, xname.len); + xname.type = xfs_mode_to_ftype(args.mode); + + trace_xfs_metadir_try_create(upd); + + error = xfs_dir_create_child(upd->tp, resblks, &du); + if (error) + return error; + + /* Metadir files are not accounted to quota. */ + + trace_xfs_metadir_create(upd); + + return 0; +} + +#ifndef __KERNEL__ +/* + * Begin the process of linking a metadata file by allocating transactions + * and locking whatever resources we're going to need. + */ +int +xfs_metadir_start_link( + struct xfs_metadir_update *upd) +{ + struct xfs_mount *mp = upd->dp->i_mount; + unsigned int resblks; + int nospace_error = 0; + int error; + + ASSERT(upd->dp != NULL); + ASSERT(upd->ip != NULL); + ASSERT(xfs_has_metadir(mp)); + + error = xfs_parent_start(mp, &upd->ppargs); + if (error) + return error; + + resblks = xfs_link_space_res(mp, MAXNAMELEN); + error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip, + &resblks, &upd->tp, &nospace_error); + if (error) + goto out_teardown; + if (!resblks) { + /* We don't allow reservationless updates. */ + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + xfs_iunlock(upd->dp, XFS_ILOCK_EXCL); + xfs_iunlock(upd->ip, XFS_ILOCK_EXCL); + error = nospace_error; + goto out_teardown; + } + + upd->dp_locked = true; + upd->ip_locked = true; + + trace_xfs_metadir_start_link(upd); + return 0; +out_teardown: + xfs_metadir_teardown(upd, error); + return error; +} + +/* + * Link the metadata directory given by @path to the inode @upd->ip. + * The path (up to the final component) must already exist, but the final + * component must not already exist. + */ +int +xfs_metadir_link( + struct xfs_metadir_update *upd) +{ + struct xfs_name xname; + struct xfs_dir_update du = { + .dp = upd->dp, + .name = &xname, + .ip = upd->ip, + .ppargs = upd->ppargs, + }; + struct xfs_mount *mp = upd->dp->i_mount; + xfs_ino_t ino; + unsigned int resblks; + int error; + + xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL); + xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL); + + /* Look up the name in the current directory. */ + xfs_metadir_set_xname(&xname, upd->path, + xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode)); + error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + fallthrough; + default: + return error; + } + + resblks = xfs_link_space_res(mp, xname.len); + error = xfs_dir_add_child(upd->tp, resblks, &du); + if (error) + return error; + + trace_xfs_metadir_link(upd); + + return 0; +} +#endif /* ! 
__KERNEL__ */ + +/* Commit a metadir update and unlock/drop all resources. */ +int +xfs_metadir_commit( + struct xfs_metadir_update *upd) +{ + int error; + + trace_xfs_metadir_commit(upd); + + error = xfs_trans_commit(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); + return error; +} + +/* Cancel a metadir update and unlock/drop all resources. */ +void +xfs_metadir_cancel( + struct xfs_metadir_update *upd, + int error) +{ + trace_xfs_metadir_cancel(upd); + + xfs_trans_cancel(upd->tp); + upd->tp = NULL; + + xfs_metadir_teardown(upd, error); +} + +/* Create a metadata for the last component of the path. */ +int +xfs_metadir_mkdir( + struct xfs_inode *dp, + const char *path, + struct xfs_inode **ipp) +{ + struct xfs_metadir_update upd = { + .dp = dp, + .path = path, + .metafile_type = XFS_METAFILE_DIR, + }; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + /* Allocate a transaction to create the last directory. */ + error = xfs_metadir_start_create(&upd); + if (error) + return error; + + /* Create the subdirectory and take our reference. */ + error = xfs_metadir_create(&upd, S_IFDIR); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_irele; + + xfs_finish_inode_setup(upd.ip); + *ipp = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); +out_irele: + /* Have to finish setting up the inode to ensure it's deleted. */ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } + return error; +} diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h new file mode 100644 index 000000000000..bfecac7d3d14 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metadir.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METADIR_H__ +#define __XFS_METADIR_H__ + +/* Cleanup widget for metadata inode creation and deletion. */ +struct xfs_metadir_update { + /* Parent directory */ + struct xfs_inode *dp; + + /* Path to metadata file */ + const char *path; + + /* Parent pointer update context */ + struct xfs_parent_args *ppargs; + + /* Child metadata file */ + struct xfs_inode *ip; + + struct xfs_trans *tp; + + enum xfs_metafile_type metafile_type; + + unsigned int dp_locked:1; + unsigned int ip_locked:1; +}; + +int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp, + const char *path, enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp); + +int xfs_metadir_start_create(struct xfs_metadir_update *upd); +int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode); + +int xfs_metadir_start_link(struct xfs_metadir_update *upd); +int xfs_metadir_link(struct xfs_metadir_update *upd); + +int xfs_metadir_commit(struct xfs_metadir_update *upd); +void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error); + +int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path, + struct xfs_inode **ipp); + +#endif /* __XFS_METADIR_H__ */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c new file mode 100644 index 000000000000..adeb25d1a444 --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_metafile.h" +#include "xfs_trace.h" +#include "xfs_inode.h" + +/* Set up an inode to be recognized as a metadata directory inode. */ +void +xfs_metafile_set_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip, + enum xfs_metafile_type metafile_type) +{ + VFS_I(ip)->i_mode &= ~0777; + VFS_I(ip)->i_uid = GLOBAL_ROOT_UID; + VFS_I(ip)->i_gid = GLOBAL_ROOT_GID; + if (S_ISDIR(VFS_I(ip)->i_mode)) + ip->i_diflags |= XFS_METADIR_DIFLAGS; + else + ip->i_diflags |= XFS_METAFILE_DIFLAGS; + ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + ip->i_diflags2 |= XFS_DIFLAG2_METADATA; + ip->i_metatype = metafile_type; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* Clear the metadata directory inode flag. */ +void +xfs_metafile_clear_iflag( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + ASSERT(xfs_is_metadir_inode(ip)); + ASSERT(VFS_I(ip)->i_nlink == 0); + + ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h new file mode 100644 index 000000000000..acec400123db --- /dev/null +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_METAFILE_H__ +#define __XFS_METAFILE_H__ + +/* All metadata files must have these flags set. */ +#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \ + XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | \ + XFS_DIFLAG_NODUMP | \ + XFS_DIFLAG_NODEFRAG) + +/* All metadata directories must have these flags set. */ +#define XFS_METADIR_DIFLAGS (XFS_METAFILE_DIFLAGS | \ + XFS_DIFLAG_NOSYMLINKS) + +void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, + enum xfs_metafile_type metafile_type); +void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); + +/* Code specific to kernel/userspace; must be provided externally. 
*/ + +int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); +int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino, + enum xfs_metafile_type metafile_type, struct xfs_inode **ipp); + +#endif /* __XFS_METAFILE_H__ */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 23c133fd36f5..ad0dedf00f18 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -19,40 +19,46 @@ static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) +#define XFS_CHECK_SB_OFFSET(field, offset) \ + XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \ + XFS_CHECK_OFFSET(struct xfs_sb, field, offset); + static inline void __init xfs_check_ondisk_structs(void) { - /* ag/file structures */ + /* file structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12); - XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); - XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); - XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); - XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136); - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264); XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); + XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); + + /* space btrees */ + XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224); + XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36); + XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344); + XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); - XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8); /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); @@ -67,33 +73,34 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4); /* realtime structures 
*/ + XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56); XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); /* - * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to - * 4 bytes anyway so it's not obviously a problem. Hence for the moment - * we don't check this structure. This can be re-instated when the attr - * definitions are updated to use c99 VLA definitions. + * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad + * it to 4 bytes anyway so it's not obviously a problem. Hence for the + * moment we don't check this structure. This can be re-instated when + * the attr definitions are updated to use c99 VLA definitions. * - XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12); */ - XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); - XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8); + XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2); @@ -101,27 +108,41 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2); XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3); - XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); - XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); - XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); - XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct 
xfs_dir2_data_hdr, 16); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1); + XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12); + /* ondisk dir/attr structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3); + XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10); + /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); @@ -157,6 +178,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16); XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16); + /* ondisk log structures from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88); + /* parent pointer ioctls */ XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40); @@ -201,6 +227,70 @@ xfs_check_ondisk_structs(void) XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4); XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT, 16299260424LL); + + /* superblock field checks we got from xfs/122 */ + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); + XFS_CHECK_SB_OFFSET(sb_magicnum, 0); + XFS_CHECK_SB_OFFSET(sb_blocksize, 4); + XFS_CHECK_SB_OFFSET(sb_dblocks, 8); + XFS_CHECK_SB_OFFSET(sb_rblocks, 16); + XFS_CHECK_SB_OFFSET(sb_rextents, 24); + XFS_CHECK_SB_OFFSET(sb_uuid, 32); + XFS_CHECK_SB_OFFSET(sb_logstart, 48); + XFS_CHECK_SB_OFFSET(sb_rootino, 56); + XFS_CHECK_SB_OFFSET(sb_rbmino, 64); + XFS_CHECK_SB_OFFSET(sb_rsumino, 72); + XFS_CHECK_SB_OFFSET(sb_rextsize, 80); + XFS_CHECK_SB_OFFSET(sb_agblocks, 84); + XFS_CHECK_SB_OFFSET(sb_agcount, 88); + XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92); + XFS_CHECK_SB_OFFSET(sb_logblocks, 96); + XFS_CHECK_SB_OFFSET(sb_versionnum, 100); + XFS_CHECK_SB_OFFSET(sb_sectsize, 102); + XFS_CHECK_SB_OFFSET(sb_inodesize, 104); + XFS_CHECK_SB_OFFSET(sb_inopblock, 106); + XFS_CHECK_SB_OFFSET(sb_blocklog, 120); + XFS_CHECK_SB_OFFSET(sb_fname[12], 120); + XFS_CHECK_SB_OFFSET(sb_sectlog, 121); + XFS_CHECK_SB_OFFSET(sb_inodelog, 122); + XFS_CHECK_SB_OFFSET(sb_inopblog, 123); + XFS_CHECK_SB_OFFSET(sb_agblklog, 124); + XFS_CHECK_SB_OFFSET(sb_rextslog, 125); + 
XFS_CHECK_SB_OFFSET(sb_inprogress, 126); + XFS_CHECK_SB_OFFSET(sb_imax_pct, 127); + XFS_CHECK_SB_OFFSET(sb_icount, 128); + XFS_CHECK_SB_OFFSET(sb_ifree, 136); + XFS_CHECK_SB_OFFSET(sb_fdblocks, 144); + XFS_CHECK_SB_OFFSET(sb_frextents, 152); + XFS_CHECK_SB_OFFSET(sb_uquotino, 160); + XFS_CHECK_SB_OFFSET(sb_gquotino, 168); + XFS_CHECK_SB_OFFSET(sb_qflags, 176); + XFS_CHECK_SB_OFFSET(sb_flags, 178); + XFS_CHECK_SB_OFFSET(sb_shared_vn, 179); + XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180); + XFS_CHECK_SB_OFFSET(sb_unit, 184); + XFS_CHECK_SB_OFFSET(sb_width, 188); + XFS_CHECK_SB_OFFSET(sb_dirblklog, 192); + XFS_CHECK_SB_OFFSET(sb_logsectlog, 193); + XFS_CHECK_SB_OFFSET(sb_logsectsize, 194); + XFS_CHECK_SB_OFFSET(sb_logsunit, 196); + XFS_CHECK_SB_OFFSET(sb_features2, 200); + XFS_CHECK_SB_OFFSET(sb_bad_features2, 204); + XFS_CHECK_SB_OFFSET(sb_features_compat, 208); + XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212); + XFS_CHECK_SB_OFFSET(sb_features_incompat, 216); + XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220); + XFS_CHECK_SB_OFFSET(sb_crc, 224); + XFS_CHECK_SB_OFFSET(sb_spino_align, 228); + XFS_CHECK_SB_OFFSET(sb_pquotino, 232); + XFS_CHECK_SB_OFFSET(sb_lsn, 240); + XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248); + XFS_CHECK_SB_OFFSET(sb_metadirino, 264); + XFS_CHECK_SB_OFFSET(sb_rgcount, 272); + XFS_CHECK_SB_OFFSET(sb_rgextents, 276); + XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); + XFS_CHECK_SB_OFFSET(sb_pad, 281); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index fb05f44f6c75..763d941a8420 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq, __be32 dtimer); __be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer); +static inline const char * +xfs_dqinode_path(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return "user"; + case XFS_DQTYPE_GROUP: + return "group"; + case XFS_DQTYPE_PROJ: + return "project"; + } + + ASSERT(0); + return NULL; +} + +static inline enum xfs_metafile_type +xfs_dqinode_metafile_type(xfs_dqtype_t type) +{ + switch (type) { + case XFS_DQTYPE_USER: + return XFS_METAFILE_USRQUOTA; + case XFS_DQTYPE_GROUP: + return XFS_METAFILE_GRPQUOTA; + case XFS_DQTYPE_PROJ: + return XFS_METAFILE_PRJQUOTA; + } + + ASSERT(0); + return XFS_METAFILE_UNKNOWN; +} + +unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type); + +int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dqtype_t type, struct xfs_inode **ipp); +int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode **ipp); +int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type, + struct xfs_inode *ip); +int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp); +int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp); + #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 198b84117df1..2dbab68b4fe6 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -154,7 +154,7 @@ xfs_refcount_complain_bad_rec( xfs_warn(mp, "Refcount BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -180,7 +180,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = 
xfs_refcount_check_irec(cur->bc_ag.pag, irec); + fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1154,8 +1154,7 @@ xfs_refcount_adjust_extents( goto out_error; } } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, + fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, @@ -1217,8 +1216,7 @@ xfs_refcount_adjust_extents( } goto advloop; } else { - fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.pag->pag_agno, + fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, @@ -1312,7 +1310,7 @@ xfs_refcount_continue_op( xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, ri->ri_blockcount))) { @@ -1320,10 +1318,10 @@ xfs_refcount_continue_op( return -EFSCORRUPTED; } - ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno); ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1360,7 +1358,7 @@ xfs_refcount_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { + if (rcur != NULL && rcur->bc_group != ri->ri_group) { nr_ops = rcur->bc_refc.nr_ops; shape_changes = rcur->bc_refc.shape_changes; xfs_btree_del_cursor(rcur, 0); @@ -1368,13 +1366,14 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(ri->ri_pag, tp, + struct xfs_perag *pag = to_perag(ri->ri_group); + + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) return error; - *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, - ri->ri_pag); + *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); rcur->bc_refc.nr_ops = nr_ops; rcur->bc_refc.shape_changes = shape_changes; } @@ -1880,7 +1879,8 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) != + NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { xfs_btree_mark_sick(cur); @@ -1956,8 +1956,7 @@ xfs_refcount_recover_cow_leftovers( goto out_free; /* Free the orphan record */ - fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, - rr->rr_rrec.rc_startblock); + fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); @@ -2029,7 +2028,7 @@ xfs_refcount_query_range_helper( xfs_failaddr_t fa; xfs_refcount_btrec_to_irec(rec, &irec); - fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 68acb0b1b4a8..62d78afcf1f3 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -56,7 +56,7 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head 
ri_list; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 795928d1a66d..54505fee1852 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -30,7 +30,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -68,21 +68,20 @@ xfs_refcountbt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - args.pag = cur->bc_ag.pag; + args.pag = to_perag(cur->bc_group); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; error = xfs_alloc_vextent_near_bno(&args, - XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, - xfs_refc_block(args.mp))); + xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp))); if (error) goto out_error; if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.pag->pag_agno); + ASSERT(args.agno == cur->bc_group->xg_gno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -170,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -362,11 +361,11 @@ xfs_refcountbt_init_cursor( { struct xfs_btree_cur *cur; - ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); + ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount); cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; cur->bc_ag.agbp = agbp; @@ -515,7 +514,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. 
*/ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 6ef4687b3aba..d0df68dc3131 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -213,7 +213,7 @@ xfs_rmap_check_irec( struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); bool is_inode; bool is_unwritten; bool is_bmbt; @@ -269,9 +269,7 @@ xfs_rmap_check_btrec( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *irec) { - if (xfs_btree_is_mem_rmap(cur->bc_ops)) - return xfs_rmap_check_irec(cur->bc_mem.pag, irec); - return xfs_rmap_check_irec(cur->bc_ag.pag, irec); + return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); } static inline int @@ -288,7 +286,7 @@ xfs_rmap_complain_bad_rec( else xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + cur->bc_group->xg_gno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -835,7 +833,7 @@ xfs_rmap_hook_enable(void) static inline void xfs_rmap_update_hook( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, enum xfs_rmap_intent_type op, xfs_agblock_t startblock, xfs_extlen_t blockcount, @@ -850,27 +848,27 @@ xfs_rmap_update_hook( .oinfo = *oinfo, /* struct copy */ }; - if (pag) - xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p); + if (xg) + xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p); } } /* Call the specified function during a reverse mapping update. */ int xfs_rmap_hook_add( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Stop calling the specified function during a reverse mapping update. */ void xfs_rmap_hook_del( - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_rmap_hook *hook) { - xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook); + xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook); } /* Configure rmap update hook functions. */ @@ -905,7 +903,8 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len, + false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1149,7 +1148,8 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo); + xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false, + oinfo); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -2586,28 +2586,30 @@ xfs_rmap_finish_one( * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ - if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { + if (rcur != NULL && rcur->bc_group != ri->ri_group) { xfs_btree_del_cursor(rcur, 0); rcur = NULL; *pcur = NULL; } if (rcur == NULL) { + struct xfs_perag *pag = to_perag(ri->ri_group); + /* * Refresh the freelist before we start changing the * rmapbt, because a shape change could cause us to * allocate blocks. 
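[Editor's aside, not part of the patch: the rmap hook API in the hunks above now registers against a struct xfs_group rather than a perag. A rough sketch, under that assumption, of how a consumer such as online scrub might attach a live-update notifier with the new convention; the callback and wrapper names are invented, and teardown would be xfs_rmap_hook_del() on the same group. struct notifier_block and NOTIFY_DONE come from linux/notifier.h.]

/* Invented callback: invoked for each rmap update on the watched group. */
static int
xfs_example_rmap_mod(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	/* @action carries the rmap intent type being applied. */
	return NOTIFY_DONE;
}

/* Invented helper: attach the hook to one AG's rmap update chain. */
static int
xfs_example_watch_rmap(
	struct xfs_perag	*pag,
	struct xfs_rmap_hook	*hook)
{
	xfs_rmap_hook_setup(hook, xfs_example_rmap_mod);
	return xfs_rmap_hook_add(pag_group(pag), hook);
	/* Later: xfs_rmap_hook_del(pag_group(pag), hook); */
}

[End of aside.]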
*/ - error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); return error; } if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); return -EFSCORRUPTED; } - *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); + *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); } xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, @@ -2620,7 +2622,7 @@ xfs_rmap_finish_one( if (error) return error; - xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno, + xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno, ri->ri_bmap.br_blockcount, unwritten, &oinfo); return 0; } diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b783dd4dd95d..96b4321d8310 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -173,7 +173,7 @@ struct xfs_rmap_intent { int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; - struct xfs_perag *ri_pag; + struct xfs_group *ri_group; }; /* functions for updating the rmapbt based on bmbt map/unmap operations */ @@ -264,8 +264,8 @@ struct xfs_rmap_hook { void xfs_rmap_hook_disable(void); void xfs_rmap_hook_enable(void); -int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook); -void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook); void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); #endif diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index ac2f1f499b76..2cab694ac58a 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -57,7 +57,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag); + cur->bc_ag.agbp, to_perag(cur->bc_group)); } STATIC void @@ -66,14 +66,15 @@ xfs_rmapbt_set_root( const union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = to_perag(cur->bc_group); ASSERT(ptr->s != 0); agf->agf_rmap_root = ptr->s; be32_add_cpu(&agf->agf_rmap_level, inc); - cur->bc_ag.pag->pagf_rmap_level += inc; + pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -87,7 +88,7 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); struct xfs_alloc_arg args = { .len = 1 }; int error; xfs_agblock_t bno; @@ -102,7 +103,7 @@ xfs_rmapbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); + xfs_extent_busy_reuse(pag_group(pag), bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); @@ -125,7 +126,7 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t bno; int error; @@ -136,7 +137,7 @@ xfs_rmapbt_free_block( if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, 
pag, bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); @@ -227,7 +228,7 @@ xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_rmap_root; } @@ -538,7 +539,7 @@ xfs_rmapbt_init_cursor( cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; @@ -647,14 +648,13 @@ xfs_rmapbt_mem_cursor( struct xfbtree *xfbt) { struct xfs_btree_cur *cur; - struct xfs_mount *mp = pag->pag_mount; - cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops, + cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops, xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); cur->bc_mem.xfbtree = xfbt; cur->bc_nlevels = xfbt->nlevels; - cur->bc_mem.pag = xfs_perag_hold(pag); + cur->bc_group = xfs_group_hold(pag_group(pag)); return cur; } @@ -863,7 +863,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (xfs_ag_contains_log(mp, pag->pag_agno)) + if (xfs_ag_contains_log(mp, pag_agno(pag))) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 27a4472402ba..4ddfb7e395b3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -20,28 +20,87 @@ #include "xfs_error.h" #include "xfs_rtbitmap.h" #include "xfs_health.h" +#include "xfs_sb.h" +#include "xfs_errortag.h" +#include "xfs_log.h" +#include "xfs_buf_item.h" +#include "xfs_extent_busy.h" /* * Realtime allocator bitmap functions shared with userspace. */ -/* - * Real time buffers need verifiers to avoid runtime warnings during IO. - * We don't have anything to verify, however, so these are just dummy - * operations. 
- */ +static xfs_failaddr_t +xfs_rtbuf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (!xfs_verify_magic(bp, hdr->rt_magic)) + return __this_address; + if (!xfs_has_rtgroups(mp)) + return __this_address; + if (!xfs_has_crc(mp)) + return __this_address; + if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp))) + return __this_address; + return NULL; +} + static void xfs_rtbuf_verify_read( - struct xfs_buf *bp) + struct xfs_buf *bp) { + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) { + fa = __this_address; + goto fail; + } + + if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) { + fa = __this_address; + goto fail; + } + + fa = xfs_rtbuf_verify(bp); + if (fa) + goto fail; + return; +fail: + xfs_verifier_error(bp, -EFSCORRUPTED, fa); } static void xfs_rtbuf_verify_write( struct xfs_buf *bp) { - return; + struct xfs_mount *mp = bp->b_mount; + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + struct xfs_buf_log_item *bip = bp->b_log_item; + xfs_failaddr_t fa; + + if (!xfs_has_rtgroups(mp)) + return; + + fa = xfs_rtbuf_verify(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + if (bip) + hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF); } const struct xfs_buf_ops xfs_rtbuf_ops = { @@ -50,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +const struct xfs_buf_ops xfs_rtbitmap_buf_ops = { + .name = "xfs_rtbitmap", + .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + +const struct xfs_buf_ops xfs_rtsummary_buf_ops = { + .name = "xfs_rtsummary", + .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) }, + .verify_read = xfs_rtbuf_verify_read, + .verify_write = xfs_rtbuf_verify_write, + .verify_struct = xfs_rtbuf_verify, +}; + /* Release cached rt bitmap and summary buffers. */ void xfs_rtbuf_cache_relse( @@ -75,28 +150,31 @@ static int xfs_rtbuf_get( struct xfs_rtalloc_args *args, xfs_fileoff_t block, /* block number in bitmap or summary */ - int issum) /* is summary not bitmap */ + enum xfs_rtg_inodes type) { + struct xfs_inode *ip = args->rtg->rtg_inodes[type]; struct xfs_mount *mp = args->mp; struct xfs_buf **cbpp; /* cached block buffer */ xfs_fileoff_t *coffp; /* cached block number */ struct xfs_buf *bp; /* block buffer, result */ - struct xfs_inode *ip; /* bitmap or summary inode */ struct xfs_bmbt_irec map; - enum xfs_blft type; + enum xfs_blft buf_type; int nmap = 1; int error; - if (issum) { + switch (type) { + case XFS_RTGI_SUMMARY: cbpp = &args->sumbp; coffp = &args->sumoff; - ip = mp->m_rsumip; - type = XFS_BLFT_RTSUMMARY_BUF; - } else { + buf_type = XFS_BLFT_RTSUMMARY_BUF; + break; + case XFS_RTGI_BITMAP: cbpp = &args->rbmbp; coffp = &args->rbmoff; - ip = mp->m_rbmip; - type = XFS_BLFT_RTBITMAP_BUF; + buf_type = XFS_BLFT_RTBITMAP_BUF; + break; + default: + return -EINVAL; } /* @@ -119,22 +197,32 @@ xfs_rtbuf_get( return error; if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { - xfs_rt_mark_sick(mp, issum ? 
XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); return -EFSCORRUPTED; } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + mp->m_bsize, 0, &bp, + xfs_rtblock_ops(mp, type)); if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : - XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, type); if (error) return error; - xfs_trans_buf_set_type(args->tp, bp, type); + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) { + xfs_buf_mark_corrupt(bp); + xfs_trans_brelse(args->tp, bp); + xfs_rtginode_mark_sick(args->rtg, type); + return -EFSCORRUPTED; + } + } + + xfs_trans_buf_set_type(args->tp, bp, buf_type); *cbpp = bp; *coffp = block; return 0; @@ -148,11 +236,11 @@ xfs_rtbitmap_read_buf( struct xfs_mount *mp = args->mp; if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) { - xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP); return -EFSCORRUPTED; } - return xfs_rtbuf_get(args, block, 0); + return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP); } int @@ -163,10 +251,10 @@ xfs_rtsummary_read_buf( struct xfs_mount *mp = args->mp; if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) { - xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY); + xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY); return -EFSCORRUPTED; } - return xfs_rtbuf_get(args, block, 1); + return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY); } /* @@ -503,6 +591,7 @@ xfs_rtmodify_summary( { struct xfs_mount *mp = args->mp; xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; unsigned int infoword; xfs_suminfo_t val; int error; @@ -514,11 +603,11 @@ xfs_rtmodify_summary( infoword = xfs_rtsumoffs_to_infoword(mp, so); val = xfs_suminfo_add(args, infoword, delta); - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache) { + if (val == 0 && log + 1 == rsum_cache[bbno]) + rsum_cache[bbno] = log; + if (val != 0 && log >= rsum_cache[bbno]) + rsum_cache[bbno] = log + 1; } xfs_trans_log_rtsummary(args, infoword); @@ -737,7 +826,7 @@ xfs_rtfree_range( /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -961,19 +1050,25 @@ xfs_rtcheck_alloc_range( int xfs_rtfree_extent( struct xfs_trans *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, xfs_rtxnum_t start, /* starting rtext number to free */ xfs_rtxlen_t len) /* length of extent freed */ { struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; struct xfs_rtalloc_args args = { .mp = mp, .tp = tp, + .rtg = rtg, }; int error; struct timespec64 atime; - ASSERT(mp->m_rbmip->i_itemp != NULL); - xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + ASSERT(rbmip->i_itemp != NULL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); + + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); if (error) @@ -990,19 +1085,21 @@ xfs_rtfree_extent( * Mark more blocks free in the superblock. 
*/ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* * If we've now freed all the blocks, reset the file sequence - * number to 0. + * number to 0 for pre-RTG file systems. */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + if (!xfs_has_rtgroups(mp) && + tp->t_frextents_delta + mp->m_sb.sb_frextents == mp->m_sb.sb_rextents) { - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - atime = inode_get_atime(VFS_I(mp->m_rbmip)); + atime = inode_get_atime(VFS_I(rbmip)); atime.tv_sec = 0; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), atime); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); } error = 0; out: @@ -1018,15 +1115,17 @@ out: int xfs_rtfree_blocks( struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { struct xfs_mount *mp = tp->t_mountp; xfs_extlen_t mod; + int error; ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); - mod = xfs_rtb_to_rtxoff(mp, rtlen); + mod = xfs_blen_to_rtxoff(mp, rtlen); if (mod) { ASSERT(mod == 0); return -EIO; @@ -1038,21 +1137,31 @@ xfs_rtfree_blocks( return -EIO; } - return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno), - xfs_rtb_to_rtx(mp, rtlen)); + error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno), + xfs_extlen_to_rtxlen(mp, rtlen)); + if (error) + return error; + + if (xfs_has_rtgroups(mp)) + xfs_extent_busy_insert(tp, rtg_group(rtg), + xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0); + + return 0; } /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_rtalloc_args args = { + .rtg = rtg, .mp = mp, .tp = tp, }; @@ -1060,10 +1169,10 @@ xfs_rtalloc_query_range( if (start > end) return -EINVAL; - if (start == end || start >= mp->m_sb.sb_rextents) + if (start == end || start >= rtg->rtg_extents) return 0; - end = min(end, mp->m_sb.sb_rextents - 1); + end = min(end, rtg->rtg_extents - 1); /* Iterate the bitmap, looking for discrepancies. */ while (start <= end) { @@ -1086,7 +1195,7 @@ xfs_rtalloc_query_range( rec.ar_startext = start; rec.ar_extcount = rtend - start + 1; - error = fn(mp, tp, &rec, priv); + error = fn(rtg, tp, &rec, priv); if (error) break; } @@ -1101,26 +1210,27 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) { - return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn, + return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn, priv); } /* Is the given extent all free? */ int xfs_rtalloc_extent_is_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free) { struct xfs_rtalloc_args args = { - .mp = mp, + .mp = rtg_mount(rtg), + .rtg = rtg, .tp = tp, }; xfs_rtxnum_t end; @@ -1136,88 +1246,71 @@ xfs_rtalloc_extent_is_free( return 0; } +/* Compute the number of rt extents tracked by a single bitmap block. 
*/ +xfs_rtxnum_t +xfs_rtbitmap_rtx_per_rbmblock( + struct xfs_mount *mp) +{ + unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize; + + if (xfs_has_rtgroups(mp)) + rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo); + + return rbmblock_bytes * NBBY; +} + /* * Compute the number of rtbitmap blocks needed to track the given number of rt * extents. */ xfs_filblks_t -xfs_rtbitmap_blockcount( +xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { - return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); + return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } -/* Compute the number of rtsummary blocks needed to track the given rt space. */ -xfs_filblks_t -xfs_rtsummary_blockcount( - struct xfs_mount *mp, - unsigned int rsumlevels, - xfs_extlen_t rbmblocks) +/* How many rt extents does each rtbitmap file track? */ +static inline xfs_rtbxlen_t +xfs_rtbitmap_bitcount( + struct xfs_mount *mp) { - unsigned long long rsumwords; + if (!mp->m_sb.sb_rextents) + return 0; - rsumwords = (unsigned long long)rsumlevels * rbmblocks; - return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); -} + /* rtgroup size can be nonzero even if rextents is zero */ + if (xfs_has_rtgroups(mp)) + return mp->m_sb.sb_rgextents; -/* Lock both realtime free space metadata inodes for a freespace update. */ -void -xfs_rtbitmap_lock( - struct xfs_mount *mp) -{ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return mp->m_sb.sb_rextents; } /* - * Join both realtime free space metadata inodes to the transaction. The - * ILOCKs will be released on transaction commit. + * Compute the number of rtbitmap blocks used for a given file system. */ -void -xfs_rtbitmap_trans_join( - struct xfs_trans *tp) -{ - xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL); -} - -/* Unlock both realtime free space metadata inodes after a freespace update. */ -void -xfs_rtbitmap_unlock( +xfs_filblks_t +xfs_rtbitmap_blockcount( struct xfs_mount *mp) { - xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp)); } /* - * Lock the realtime free space metadata inodes for a freespace scan. Callers - * must walk metadata blocks in order of increasing file offset. + * Compute the geometry of the rtsummary file needed to track the given rt + * space. */ -void -xfs_rtbitmap_lock_shared( - struct xfs_mount *mp, - unsigned int rbmlock_flags) -{ - if (rbmlock_flags & XFS_RBMLOCK_BITMAP) - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - - if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) - xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); -} - -/* Unlock the realtime free space metadata inodes after a freespace scan. 
*/ -void -xfs_rtbitmap_unlock_shared( +xfs_filblks_t +xfs_rtsummary_blockcount( struct xfs_mount *mp, - unsigned int rbmlock_flags) + unsigned int *rsumlevels) { - if (rbmlock_flags & XFS_RBMLOCK_SUMMARY) - xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM); + xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); + unsigned long long rsumwords; - if (rbmlock_flags & XFS_RBMLOCK_BITMAP) - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + *rsumlevels = xfs_compute_rextslog(rextents) + 1; + rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); + return howmany_64(rsumwords, mp->m_blockwsize); } static int @@ -1260,21 +1353,26 @@ out_trans_cancel: /* Get a buffer for the block. */ static int xfs_rtfile_initialize_block( - struct xfs_inode *ip, + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fsblock_t fsbno, void *data) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *ip = rtg->rtg_inodes[type]; struct xfs_trans *tp; struct xfs_buf *bp; + void *bufdata; const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; enum xfs_blft buf_type; int error; - if (ip == mp->m_rsumip) + if (type == XFS_RTGI_BITMAP) + buf_type = XFS_BLFT_RTBITMAP_BUF; + else if (type == XFS_RTGI_SUMMARY) buf_type = XFS_BLFT_RTSUMMARY_BUF; else - buf_type = XFS_BLFT_RTBITMAP_BUF; + return -EINVAL; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp); if (error) @@ -1288,13 +1386,30 @@ xfs_rtfile_initialize_block( xfs_trans_cancel(tp); return error; } + bufdata = bp->b_addr; xfs_trans_buf_set_type(tp, bp, buf_type); - bp->b_ops = &xfs_rtbuf_ops; + bp->b_ops = xfs_rtblock_ops(mp, type); + + if (xfs_has_rtgroups(mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + if (type == XFS_RTGI_BITMAP) + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); + else + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(ip->i_ino); + hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid); + + bufdata += sizeof(*hdr); + } + if (data) - memcpy(bp->b_addr, data, copylen); + memcpy(bufdata, data, copylen); else - memset(bp->b_addr, 0, copylen); + memset(bufdata, 0, copylen); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); return xfs_trans_commit(tp); } @@ -1306,12 +1421,13 @@ xfs_rtfile_initialize_block( */ int xfs_rtfile_initialize_blocks( - struct xfs_inode *ip, /* inode (bitmap/summary) */ + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, /* offset to start from */ xfs_fileoff_t end_fsb, /* offset to allocate to */ void *data) /* data to fill the blocks */ { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = rtg_mount(rtg); const size_t copylen = mp->m_blockwsize << XFS_WORDLOG; while (offset_fsb < end_fsb) { @@ -1319,8 +1435,8 @@ xfs_rtfile_initialize_blocks( xfs_filblks_t i; int error; - error = xfs_rtfile_alloc_blocks(ip, offset_fsb, - end_fsb - offset_fsb, &map); + error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type], + offset_fsb, end_fsb - offset_fsb, &map); if (error) return error; @@ -1330,7 +1446,7 @@ xfs_rtfile_initialize_blocks( * Do this one block per transaction, to keep it simple. 
*/ for (i = 0; i < map.br_blockcount; i++) { - error = xfs_rtfile_initialize_block(ip, + error = xfs_rtfile_initialize_block(rtg, type, map.br_startblock + i, data); if (error) return error; @@ -1343,3 +1459,35 @@ xfs_rtfile_initialize_blocks( return 0; } + +int +xfs_rtbitmap_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize; + if (init && !xfs_has_rtgroups(mp)) { + ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + inode_set_atime(VFS_I(ip), 0, 0); + } + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_rtsummary_create( + struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 140513d1d6bc..16563a44bd13 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -6,7 +6,10 @@ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ +#include "xfs_rtgroup.h" + struct xfs_rtalloc_args { + struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; @@ -19,13 +22,37 @@ struct xfs_rtalloc_args { static inline xfs_rtblock_t xfs_rtx_to_rtb( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); + + if (mp->m_rtxblklog >= 0) + return start + (rtx << mp->m_rtxblklog); + return start + (rtx * mp->m_sb.sb_rextsize); +} + +/* Convert an rgbno into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rgbno_to_rtx( + struct xfs_mount *mp, + xfs_rgblock_t rgbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rgbno >> mp->m_rtxblklog; + return rgbno / mp->m_sb.sb_rextsize; +} + +static inline uint64_t +xfs_rtbxlen_to_blen( + struct xfs_mount *mp, + xfs_rtbxlen_t rtbxlen) +{ if (mp->m_rtxblklog >= 0) - return rtx << mp->m_rtxblklog; + return rtbxlen << mp->m_rtxblklog; - return rtx * mp->m_sb.sb_rextsize; + return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t @@ -62,15 +89,49 @@ xfs_extlen_to_rtxlen( return len / mp->m_sb.sb_rextsize; } +/* Convert an rt block count into an rt extent count. */ +static inline xfs_rtbxlen_t +xfs_blen_to_rtbxlen( + struct xfs_mount *mp, + uint64_t blen) +{ + if (likely(mp->m_rtxblklog >= 0)) + return blen >> mp->m_rtxblklog; + + return div_u64(blen, mp->m_sb.sb_rextsize); +} + +/* Return the offset of a file block length within an rt extent. */ +static inline xfs_extlen_t +xfs_blen_to_rtxoff( + struct xfs_mount *mp, + xfs_filblks_t blen) +{ + if (likely(mp->m_rtxblklog >= 0)) + return blen & mp->m_rtxblkmask; + + return do_div(blen, mp->m_sb.sb_rextsize); +} + +/* Round this block count up to the nearest rt extent size. */ +static inline xfs_filblks_t +xfs_blen_roundup_rtx( + struct xfs_mount *mp, + xfs_filblks_t blen) +{ + return roundup_64(blen, mp->m_sb.sb_rextsize); +} + /* Convert an rt block number into an rt extent number. 
*/ static inline xfs_rtxnum_t xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; - return div_u64(rtbno, mp->m_sb.sb_rextsize); } @@ -80,48 +141,29 @@ xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + /* open-coded 64-bit masking operation */ + rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno & mp->m_rtxblkmask; - return do_div(rtbno, mp->m_sb.sb_rextsize); } -/* - * Convert an rt block number into an rt extent number, rounding up to the next - * rt extent if the rt block is not aligned to an rt extent boundary. - */ -static inline xfs_rtxnum_t -xfs_rtb_to_rtxup( - struct xfs_mount *mp, - xfs_rtblock_t rtbno) -{ - if (likely(mp->m_rtxblklog >= 0)) { - if (rtbno & mp->m_rtxblkmask) - return (rtbno >> mp->m_rtxblklog) + 1; - return rtbno >> mp->m_rtxblklog; - } - - if (do_div(rtbno, mp->m_sb.sb_rextsize)) - rtbno++; - return rtbno; -} - -/* Round this rtblock up to the nearest rt extent size. */ +/* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_roundup_rtx( +xfs_fileoff_roundup_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) { - return roundup_64(rtbno, mp->m_sb.sb_rextsize); + return roundup_64(off, mp->m_sb.sb_rextsize); } -/* Round this rtblock down to the nearest rt extent size. */ +/* Round this file block offset down to the nearest rt extent size. */ static inline xfs_rtblock_t -xfs_rtb_rounddown_rtx( +xfs_fileoff_rounddown_rtx( struct xfs_mount *mp, - xfs_rtblock_t rtbno) + xfs_fileoff_t off) { - return rounddown_64(rtbno, mp->m_sb.sb_rextsize); + return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. 
*/ @@ -130,6 +172,9 @@ xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) + return div_u64(rtx, mp->m_rtx_per_rbmblock); + return rtx >> mp->m_blkbit_log; } @@ -139,6 +184,13 @@ xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { + if (xfs_has_rtgroups(mp)) { + unsigned int mod; + + div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); + return mod; + } + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } @@ -148,6 +200,9 @@ xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { + if (xfs_has_rtgroups(mp)) + return rbmoff * mp->m_rtx_per_rbmblock; + return rbmoff << mp->m_blkbit_log; } @@ -157,7 +212,14 @@ xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_rtword_raw *words = args->rbmbp->b_addr; + struct xfs_mount *mp = args->mp; + union xfs_rtword_raw *words; + struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; + + if (xfs_has_rtgroups(mp)) + words = (union xfs_rtword_raw *)(hdr + 1); + else + words = args->rbmbp->b_addr; return words + index; } @@ -170,6 +232,8 @@ xfs_rtbitmap_getword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(word->rtg); return word->old; } @@ -182,7 +246,10 @@ xfs_rtbitmap_setword( { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); - word->old = value; + if (xfs_has_rtgroups(args->mp)) + word->rtg = cpu_to_be32(value); + else + word->old = value; } /* @@ -207,6 +274,9 @@ xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { + if (xfs_has_rtgroups(mp)) + return rsumoff / mp->m_blockwsize; + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } @@ -221,6 +291,9 @@ xfs_rtsumoffs_to_infoword( { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + if (xfs_has_rtgroups(mp)) + return rsumoff % mp->m_blockwsize; + return rsumoff & mask; } @@ -230,7 +303,13 @@ xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { - union xfs_suminfo_raw *info = args->sumbp->b_addr; + union xfs_suminfo_raw *info; + struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; + + if (xfs_has_rtgroups(args->mp)) + info = (union xfs_suminfo_raw *)(hdr + 1); + else + info = args->sumbp->b_addr; return info + index; } @@ -243,6 +322,8 @@ xfs_suminfo_get( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) + return be32_to_cpu(info->rtg); return info->old; } @@ -255,10 +336,28 @@ xfs_suminfo_add( { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + if (xfs_has_rtgroups(args->mp)) { + be32_add_cpu(&info->rtg, delta); + return be32_to_cpu(info->rtg); + } + info->old += delta; return info->old; } +static inline const struct xfs_buf_ops * +xfs_rtblock_ops( + struct xfs_mount *mp, + enum xfs_rtg_inodes type) +{ + if (xfs_has_rtgroups(mp)) { + if (type == XFS_RTGI_SUMMARY) + return &xfs_rtsummary_buf_ops; + return &xfs_rtbitmap_buf_ops; + } + return &xfs_rtbuf_ops; +} + /* * Functions for walking free space rtextents in the realtime bitmap. 
*/ @@ -268,7 +367,7 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); @@ -291,53 +390,43 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, +int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtxnum_t start, xfs_rtxlen_t len, - bool *is_free); -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to free */ - xfs_rtxlen_t len); /* length of extent freed */ - +int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); +int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ -int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, - xfs_filblks_t rtlen); +int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t rtbno, xfs_filblks_t rtlen); -xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t - rtextents); +xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); +xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, + xfs_rtbxlen_t rtextents); xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, - unsigned int rsumlevels, xfs_extlen_t rbmblocks); - -int xfs_rtfile_initialize_blocks(struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data); + unsigned int *rsumlevels); -void xfs_rtbitmap_lock(struct xfs_mount *mp); -void xfs_rtbitmap_unlock(struct xfs_mount *mp); -void xfs_rtbitmap_trans_join(struct xfs_trans *tp); +int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb, void *data); +int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); +int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, + struct xfs_trans *tp, bool init); -/* Lock the rt bitmap inode in shared mode */ -#define XFS_RBMLOCK_BITMAP (1U << 0) -/* Lock the rt summary inode in shared mode */ -#define XFS_RBMLOCK_SUMMARY (1U << 1) - -void xfs_rtbitmap_lock_shared(struct xfs_mount *mp, - unsigned int rbmlock_flags); -void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, - unsigned int rbmlock_flags); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) -# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) + +static inline int xfs_rtfree_blocks(struct xfs_trans *tp, + struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + return -ENOSYS; 
+} # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) @@ -345,17 +434,11 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp, # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) static inline xfs_filblks_t -xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } -# define xfs_rtsummary_blockcount(mp, l, b) (0) -# define xfs_rtbitmap_lock(mp) do { } while (0) -# define xfs_rtbitmap_trans_join(tp) do { } while (0) -# define xfs_rtbitmap_unlock(mp) do { } while (0) -# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0) -# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0) #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c new file mode 100644 index 000000000000..4f3bfc884aff --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -0,0 +1,697 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_defer.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_buf_item.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" + +/* Find the first usable fsblock in this rtgroup. */ +static inline uint32_t +xfs_rtgroup_min_block( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + if (xfs_has_rtsb(mp) && rgno == 0) + return mp->m_sb.sb_rextsize; + + return 0; +} + +/* Precompute this group's geometry */ +void +xfs_rtgroup_calc_geometry( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno); +} + +int +xfs_rtgroup_alloc( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + struct xfs_rtgroup *rtg; + int error; + + rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL); + if (!rtg) + return -ENOMEM; + + xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents); + + error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG); + if (error) + goto out_free_rtg; + return 0; + +out_free_rtg: + kfree(rtg); + return error; +} + +void +xfs_rtgroup_free( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL); +} + +/* Free a range of incore rtgroup objects. 
*/ +void +xfs_free_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno) +{ + xfs_rgnumber_t rgno; + + for (rgno = first_rgno; rgno < end_rgno; rgno++) + xfs_rtgroup_free(mp, rgno); +} + +/* Initialize some range of incore rtgroup objects. */ +int +xfs_initialize_rtgroups( + struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + xfs_rgnumber_t index; + int error; + + if (first_rgno >= end_rgno) + return 0; + + for (index = first_rgno; index < end_rgno; index++) { + error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents); + if (error) + goto out_unwind_new_rtgs; + } + + return 0; + +out_unwind_new_rtgs: + xfs_free_rtgroups(mp, first_rgno, index); + return error; +} + +/* Compute the number of rt extents in this realtime group. */ +xfs_rtxnum_t +__xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents) +{ + ASSERT(rgno < rgcount); + if (rgno == rgcount - 1) + return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents); + + ASSERT(xfs_has_rtgroups(mp)); + return mp->m_sb.sb_rgextents; +} + +xfs_rtxnum_t +xfs_rtgroup_extents( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); +} + +/* + * Update the rt extent count of the previous tail rtgroup if it changed during + * recovery (i.e. recovery of a growfs). + */ +int +xfs_update_last_rtgroup_size( + struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount) +{ + struct xfs_rtgroup *rtg; + + ASSERT(prev_rgcount > 0); + + rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1); + if (!rtg) + return -EFSCORRUPTED; + rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1, + mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents); + rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize; + xfs_rtgroup_rele(rtg); + return 0; +} + +/* Lock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_lock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a freespace + * update. + */ + xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); + xfs_ilock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); + } +} + +/* Unlock metadata inodes associated with this rt group. */ +void +xfs_rtgroup_unlock( + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || + !(rtglock_flags & XFS_RTGLOCK_BITMAP)); + + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); + xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); + } +} + +/* + * Join realtime group metadata inodes to the transaction. The ILOCKs will be + * released on transaction commit. 
+ */ +void +xfs_rtgroup_trans_join( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + unsigned int rtglock_flags) +{ + ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); + ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); + + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP], + XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_SUMMARY], + XFS_ILOCK_EXCL); + } +} + +/* Retrieve rt group geometry. */ +int +xfs_rtgroup_get_geometry( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + /* Fill out form. */ + memset(rgeo, 0, sizeof(*rgeo)); + rgeo->rg_number = rtg_rgno(rtg); + rgeo->rg_length = rtg_group(rtg)->xg_block_count; + xfs_rtgroup_geom_health(rtg, rgeo); + return 0; +} + +#ifdef CONFIG_PROVE_LOCKING +static struct lock_class_key xfs_rtginode_lock_class; + +static int +xfs_rtginode_ilock_cmp_fn( + const struct lockdep_map *m1, + const struct lockdep_map *m2) +{ + const struct xfs_inode *ip1 = + container_of(m1, struct xfs_inode, i_lock.dep_map); + const struct xfs_inode *ip2 = + container_of(m2, struct xfs_inode, i_lock.dep_map); + + if (ip1->i_projid < ip2->i_projid) + return -1; + if (ip1->i_projid > ip2->i_projid) + return 1; + return 0; +} + +static inline void +xfs_rtginode_ilock_print_fn( + const struct lockdep_map *m) +{ + const struct xfs_inode *ip = + container_of(m, struct xfs_inode, i_lock.dep_map); + + printk(KERN_CONT " rgno=%u", ip->i_projid); +} + +/* + * Most of the time each of the RTG inode locks are only taken one at a time. + * But when committing deferred ops, more than one of a kind can be taken. + * However, deferred rt ops will be committed in rgno order so there is no + * potential for deadlocks. The code here is needed to tell lockdep about this + * order. + */ +static inline void +xfs_rtginode_lockdep_setup( + struct xfs_inode *ip, + xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class, + type); + lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn, + xfs_rtginode_ilock_print_fn); +} +#else +#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0) +#endif /* CONFIG_PROVE_LOCKING */ + +struct xfs_rtginode_ops { + const char *name; /* short name */ + + enum xfs_metafile_type metafile_type; + + unsigned int sick; /* rtgroup sickness flag */ + + /* Does the fs have this feature? */ + bool (*enabled)(struct xfs_mount *mp); + + /* Create this rtgroup metadata inode and initialize it. */ + int (*create)(struct xfs_rtgroup *rtg, + struct xfs_inode *ip, + struct xfs_trans *tp, + bool init); +}; + +static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { + [XFS_RTGI_BITMAP] = { + .name = "bitmap", + .metafile_type = XFS_METAFILE_RTBITMAP, + .sick = XFS_SICK_RG_BITMAP, + .create = xfs_rtbitmap_create, + }, + [XFS_RTGI_SUMMARY] = { + .name = "summary", + .metafile_type = XFS_METAFILE_RTSUMMARY, + .sick = XFS_SICK_RG_SUMMARY, + .create = xfs_rtsummary_create, + }, +}; + +/* Return the shortname of this rtgroup inode. */ +const char * +xfs_rtginode_name( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].name; +} + +/* Return the metafile type of this rtgroup inode. */ +enum xfs_metafile_type +xfs_rtginode_metafile_type( + enum xfs_rtg_inodes type) +{ + return xfs_rtginode_ops[type].metafile_type; +} + +/* Should this rtgroup inode be present? 
*/ +bool +xfs_rtginode_enabled( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + if (!ops->enabled) + return true; + return ops->enabled(rtg_mount(rtg)); +} + +/* Mark an rtgroup inode sick */ +void +xfs_rtginode_mark_sick( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + + xfs_group_mark_sick(rtg_group(rtg), ops->sick); +} + +/* Load and existing rtgroup inode into the rtgroup structure. */ +int +xfs_rtginode_load( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!xfs_has_rtgroups(mp)) { + xfs_ino_t ino; + + switch (type) { + case XFS_RTGI_BITMAP: + ino = mp->m_sb.sb_rbmino; + break; + case XFS_RTGI_SUMMARY: + ino = mp->m_sb.sb_rsumino; + break; + default: + /* None of the other types exist on !rtgroups */ + return 0; + } + + error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type, + &ip); + } else { + const char *path; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!path) + return -ENOMEM; + error = xfs_metadir_load(tp, mp->m_rtdirip, path, + ops->metafile_type, &ip); + kfree(path); + } + + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_rtginode_mark_sick(rtg, type); + return error; + } + + if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) { + xfs_irele(ip); + xfs_rtginode_mark_sick(rtg, type); + return -EFSCORRUPTED; + } + + xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type); + rtg->rtg_inodes[type] = ip; + return 0; +} + +/* Release an rtgroup metadata inode. */ +void +xfs_rtginode_irele( + struct xfs_inode **ipp) +{ + if (*ipp) + xfs_irele(*ipp); + *ipp = NULL; +} + +/* Add a metadata inode for a realtime rmap btree. */ +int +xfs_rtginode_create( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type, + bool init) +{ + const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type]; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_metadir_update upd = { + .dp = mp->m_rtdirip, + .metafile_type = ops->metafile_type, + }; + int error; + + if (!xfs_rtginode_enabled(rtg, type)) + return 0; + + if (!mp->m_rtdirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + upd.path = xfs_rtginode_path(rtg_rgno(rtg), type); + if (!upd.path) + return -ENOMEM; + + error = xfs_metadir_start_create(&upd); + if (error) + goto out_path; + + error = xfs_metadir_create(&upd, S_IFREG); + if (error) + goto out_cancel; + + xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type); + + upd.ip->i_projid = rtg_rgno(rtg); + error = ops->create(rtg, upd.ip, upd.tp, init); + if (error) + goto out_cancel; + + error = xfs_metadir_commit(&upd); + if (error) + goto out_path; + + kfree(upd.path); + xfs_finish_inode_setup(upd.ip); + rtg->rtg_inodes[type] = upd.ip; + return 0; + +out_cancel: + xfs_metadir_cancel(&upd, error); + /* Have to finish setting up the inode to ensure it's deleted. 
*/ + if (upd.ip) { + xfs_finish_inode_setup(upd.ip); + xfs_irele(upd.ip); + } +out_path: + kfree(upd.path); + return error; +} + +/* Create the parent directory for all rtgroup inodes and load it. */ +int +xfs_rtginode_mkdir_parent( + struct xfs_mount *mp) +{ + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip); +} + +/* Load the parent directory of all rtgroup inodes. */ +int +xfs_rtginode_load_parent( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!mp->m_metadirip) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; + } + + return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups", + XFS_METAFILE_DIR, &mp->m_rtdirip); +} + +/* Check superblock fields for a read or a write. */ +static xfs_failaddr_t +xfs_rtsb_verify_common( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + + if (!xfs_verify_magic(bp, rsb->rsb_magicnum)) + return __this_address; + if (rsb->rsb_pad) + return __this_address; + + /* Everything to the end of the fs block must be zero */ + if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb))) + return __this_address; + + return NULL; +} + +/* Check superblock fields for a read or revalidation. */ +static inline xfs_failaddr_t +xfs_rtsb_verify_all( + struct xfs_buf *bp) +{ + struct xfs_rtsb *rsb = bp->b_addr; + struct xfs_mount *mp = bp->b_mount; + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) + return fa; + + if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX)) + return __this_address; + if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid)) + return __this_address; + if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return NULL; +} + +static void +xfs_rtsb_read_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) { + xfs_verifier_error(bp, -EFSBADCRC, __this_address); + return; + } + + fa = xfs_rtsb_verify_all(bp); + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +static void +xfs_rtsb_write_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + + fa = xfs_rtsb_verify_common(bp); + if (fa) { + xfs_verifier_error(bp, -EFSCORRUPTED, fa); + return; + } + + xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_rtsb_buf_ops = { + .name = "xfs_rtsb", + .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) }, + .verify_read = xfs_rtsb_read_verify, + .verify_write = xfs_rtsb_write_verify, + .verify_struct = xfs_rtsb_verify_all, +}; + +/* Update a realtime superblock from the primary fs super */ +void +xfs_update_rtsb( + struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp) +{ + const struct xfs_dsb *dsb = sb_bp->b_addr; + struct xfs_rtsb *rsb = rtsb_bp->b_addr; + const uuid_t *meta_uuid; + + rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC); + + rsb->rsb_pad = 0; + memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX); + + memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid)); + + /* + * The metadata uuid is the fs uuid if the metauuid feature is not + * enabled. + */ + if (dsb->sb_features_incompat & + cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID)) + meta_uuid = &dsb->sb_meta_uuid; + else + meta_uuid = &dsb->sb_uuid; + memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid)); +} + +/* + * Update the realtime superblock from a filesystem superblock and log it to + * the given transaction. 
+ */ +struct xfs_buf * +xfs_log_rtsb( + struct xfs_trans *tp, + const struct xfs_buf *sb_bp) +{ + struct xfs_buf *rtsb_bp; + + if (!xfs_has_rtsb(tp->t_mountp)) + return NULL; + + rtsb_bp = xfs_trans_getrtsb(tp); + if (!rtsb_bp) { + /* + * It's possible for the rtgroups feature to be enabled but + * there is no incore rt superblock buffer if the rt geometry + * was specified at mkfs time but the rt section has not yet + * been attached. In this case, rblocks must be zero. + */ + ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0); + return NULL; + } + + xfs_update_rtsb(rtsb_bp, sb_bp); + xfs_trans_ordered_buf(tp, rtsb_bp); + return rtsb_bp; +} diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h new file mode 100644 index 000000000000..2d7822644eff --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __LIBXFS_RTGROUP_H +#define __LIBXFS_RTGROUP_H 1 + +#include "xfs_group.h" + +struct xfs_mount; +struct xfs_trans; + +enum xfs_rtg_inodes { + XFS_RTGI_BITMAP, /* allocation bitmap */ + XFS_RTGI_SUMMARY, /* allocation summary */ + + XFS_RTGI_MAX, +}; + +#ifdef MAX_LOCKDEP_SUBCLASSES +static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES); +#endif + +/* + * Realtime group incore structure, similar to the per-AG structure. + */ +struct xfs_rtgroup { + struct xfs_group rtg_group; + + /* per-rtgroup metadata inodes */ + struct xfs_inode *rtg_inodes[XFS_RTGI_MAX]; + + /* Number of blocks in this group */ + xfs_rtxnum_t rtg_extents; + + /* + * Cache of rt summary level per bitmap block with the invariant that + * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, + * or 0 if rsum[i][bbno] == 0 for all i. + * + * Reads and writes are serialized by the rsumip inode lock. + */ + uint8_t *rtg_rsum_cache; +}; + +static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) +{ + return container_of(xg, struct xfs_rtgroup, rtg_group); +} + +static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg) +{ + return &rtg->rtg_group; +} + +static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_mount; +} + +static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_gno; +} + +/* Passive rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_get( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_hold( + struct xfs_rtgroup *rtg) +{ + return to_rtg(xfs_group_hold(rtg_group(rtg))); +} + +static inline void +xfs_rtgroup_put( + struct xfs_rtgroup *rtg) +{ + xfs_group_put(rtg_group(rtg)); +} + +/* Active rtgroup references */ +static inline struct xfs_rtgroup * +xfs_rtgroup_grab( + struct xfs_mount *mp, + xfs_rgnumber_t rgno) +{ + return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG)); +} + +static inline void +xfs_rtgroup_rele( + struct xfs_rtgroup *rtg) +{ + xfs_group_rele(rtg_group(rtg)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next_range( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg, + xfs_rgnumber_t start_rgno, + xfs_rgnumber_t end_rgno) +{ + return to_rtg(xfs_group_next_range(mp, rtg ? 
rtg_group(rtg) : NULL, + start_rgno, end_rgno, XG_TYPE_RTG)); +} + +static inline struct xfs_rtgroup * +xfs_rtgroup_next( + struct xfs_mount *mp, + struct xfs_rtgroup *rtg) +{ + return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); +} + +static inline xfs_rtblock_t +xfs_rgbno_to_rtb( + struct xfs_rtgroup *rtg, + xfs_rgblock_t rgbno) +{ + return xfs_gbno_to_fsb(rtg_group(rtg), rgbno); +} + +static inline xfs_rgnumber_t +xfs_rtb_to_rgno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG); +} + +static inline xfs_rgblock_t +xfs_rtb_to_rgbno( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG); +} + +/* Is rtbno the start of a RT group? */ +static inline bool +xfs_rtbno_is_group_start( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0; +} + +/* Convert an rtgroups rt extent number into an rgbno. */ +static inline xfs_rgblock_t +xfs_rtx_to_rgbno( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (likely(mp->m_rtxblklog >= 0)) + return rtx << mp->m_rtxblklog; + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_daddr_t +xfs_rtb_to_daddr( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; + + return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); +} + +static inline xfs_rtblock_t +xfs_daddr_to_rtb( + struct xfs_mount *mp, + xfs_daddr_t daddr) +{ + xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); + + if (xfs_has_rtgroups(mp)) { + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rgnumber_t rgno; + uint32_t rgbno; + + rgno = div_u64_rem(bno, g->blocks, &rgbno); + return ((xfs_rtblock_t)rgno << g->blklog) + rgbno; + } + + return bno; +} + +#ifdef CONFIG_XFS_RT +int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno); + +void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno); +int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno, + xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents); + +xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno, + xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents); +xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno); +void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount, + xfs_rtbxlen_t rextents); + +int xfs_update_last_rtgroup_size(struct xfs_mount *mp, + xfs_rgnumber_t prev_rgcount); + +/* Lock the rt bitmap inode in exclusive mode */ +#define XFS_RTGLOCK_BITMAP (1U << 0) +/* Lock the rt bitmap inode in shared mode */ +#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) + +#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ + XFS_RTGLOCK_BITMAP_SHARED) + +void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); +void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + unsigned int rtglock_flags); + +int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo); + +int xfs_rtginode_mkdir_parent(struct xfs_mount *mp); +int 
xfs_rtginode_load_parent(struct xfs_trans *tp); + +const char *xfs_rtginode_name(enum xfs_rtg_inodes type); +enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type); +bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type); +int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + struct xfs_trans *tp); +int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, + bool init); +void xfs_rtginode_irele(struct xfs_inode **ipp); + +static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno, + enum xfs_rtg_inodes type) +{ + return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type)); +} + +void xfs_update_rtsb(struct xfs_buf *rtsb_bp, + const struct xfs_buf *sb_bp); +struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp, + const struct xfs_buf *sb_bp); +#else +static inline void xfs_free_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno) +{ +} + +static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, + xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno, + xfs_rtbxlen_t rextents) +{ + return 0; +} + +# define xfs_rtgroup_extents(mp, rgno) (0) +# define xfs_update_last_rtgroup_size(mp, rgno) (0) +# define xfs_rtgroup_lock(rtg, gf) ((void)0) +# define xfs_rtgroup_unlock(rtg, gf) ((void)0) +# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0) +# define xfs_update_rtsb(bp, sb_bp) ((void)0) +# define xfs_log_rtsb(tp, sb_bp) (NULL) +# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) +#endif /* CONFIG_XFS_RT */ + +#endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d95409f3cba6..3b5623611eba 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -27,6 +27,7 @@ #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_exchrange.h" +#include "xfs_rtgroup.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -180,6 +181,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_EXCHANGE_RANGE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT) features |= XFS_FEAT_PARENT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) + features |= XFS_FEAT_METADIR; return features; } @@ -232,11 +235,37 @@ xfs_validate_sb_read( return 0; } +/* Return the number of extents covered by a single rt bitmap file */ +static xfs_rtbxlen_t +xfs_extents_per_rbm( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_rgextents; + return sbp->sb_rextents; +} + +/* + * Return the payload size of a single rt bitmap block (without the metadata + * header if any). 
+ */ +static inline unsigned int +xfs_rtbmblock_size( + struct xfs_sb *sbp) +{ + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo); + return sbp->sb_blocksize; +} + static uint64_t -xfs_sb_calc_rbmblocks( +xfs_expected_rbmblocks( struct xfs_sb *sbp) { - return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); + return howmany_64(xfs_extents_per_rbm(sbp), + NBBY * xfs_rtbmblock_size(sbp)); } /* Validate the realtime geometry */ @@ -258,7 +287,7 @@ xfs_validate_rt_geometry( if (sbp->sb_rextents == 0 || sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) || sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) || - sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp)) + sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) return false; return true; @@ -297,13 +326,6 @@ xfs_validate_sb_write( * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. */ - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { - xfs_warn(mp, -"Corruption detected in superblock compatible features (0x%x)!", - (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); - return -EFSCORRUPTED; - } - if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, @@ -339,6 +361,78 @@ xfs_validate_sb_write( return 0; } +int +xfs_compute_rgblklog( + xfs_rtxlen_t rgextents, + xfs_rgblock_t rextsize) +{ + uint64_t rgblocks = (uint64_t)rgextents * rextsize; + + return xfs_highbit64(rgblocks - 1) + 1; +} + +static int +xfs_validate_sb_rtgroups( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + uint64_t groups; + int rgblklog; + + if (sbp->sb_rextsize == 0) { + xfs_warn(mp, +"Realtime extent size must not be zero."); + return -EINVAL; + } + + if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) { + xfs_warn(mp, +"Realtime group size (%u) must be less than %u rt extents.", + sbp->sb_rgextents, + XFS_MAX_RGBLOCKS / sbp->sb_rextsize); + return -EINVAL; + } + + if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) { + xfs_warn(mp, +"Realtime group size (%u) must be at least %u rt extents.", + sbp->sb_rgextents, XFS_MIN_RGEXTENTS); + return -EINVAL; + } + + if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) { + xfs_warn(mp, +"Realtime groups (%u) must be less than %u.", + sbp->sb_rgcount, XFS_MAX_RGNUMBER); + return -EINVAL; + } + + groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents); + if (groups != sbp->sb_rgcount) { + xfs_warn(mp, +"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.", + sbp->sb_rgcount, groups); + return -EINVAL; + } + + /* Exchange-range is required for fsr to work on realtime files */ + if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) { + xfs_warn(mp, +"Realtime groups feature requires exchange-range support."); + return -EINVAL; + } + + rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize); + if (sbp->sb_rgblklog != rgblklog) { + xfs_warn(mp, +"Realtime group log (%d) does not match expected value (%d).", + sbp->sb_rgblklog, rgblklog); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. 
*/ STATIC int xfs_validate_sb_common( @@ -350,6 +444,7 @@ xfs_validate_sb_common( uint32_t agcount = 0; uint32_t rem; bool has_dalign; + int error; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, @@ -398,6 +493,33 @@ xfs_validate_sb_common( sbp->sb_inoalignmt, align); return -EINVAL; } + + if (sbp->sb_spino_align && + (sbp->sb_spino_align > sbp->sb_inoalignmt || + (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0)) { + xfs_warn(mp, +"Sparse inode alignment (%u) is invalid, must be integer factor of (%u).", + sbp->sb_spino_align, + sbp->sb_inoalignmt); + return -EINVAL; + } + } else if (sbp->sb_spino_align) { + xfs_warn(mp, + "Sparse inode alignment (%u) should be zero.", + sbp->sb_spino_align); + return -EINVAL; + } + + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) { + xfs_warn(mp, +"Metadir superblock padding fields must be zero."); + return -EINVAL; + } + + error = xfs_validate_sb_rtgroups(mp, sbp); + if (error) + return error; } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { @@ -566,6 +688,14 @@ xfs_validate_sb_common( void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + sbp->sb_uquotino = NULLFSINO; + sbp->sb_gquotino = NULLFSINO; + sbp->sb_pquotino = NULLFSINO; + return; + } + /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota @@ -689,6 +819,20 @@ __xfs_sb_from_disk( /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = be64_to_cpu(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad)); + to->sb_rgcount = be32_to_cpu(from->sb_rgcount); + to->sb_rgextents = be32_to_cpu(from->sb_rgextents); + to->sb_rbmino = NULLFSINO; + to->sb_rsumino = NULLFSINO; + } else { + to->sb_metadirino = NULLFSINO; + to->sb_rgcount = 1; + to->sb_rgextents = 0; + } } void @@ -706,6 +850,15 @@ xfs_sb_quota_to_disk( { uint16_t qflags = from->sb_qflags; + if (xfs_sb_is_v5(from) && + (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_uquotino = cpu_to_be64(0); + to->sb_gquotino = cpu_to_be64(0); + to->sb_pquotino = cpu_to_be64(0); + return; + } + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* @@ -836,6 +989,16 @@ xfs_sb_to_disk( to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) { + to->sb_metadirino = cpu_to_be64(from->sb_metadirino); + to->sb_rgblklog = from->sb_rgblklog; + memset(to->sb_pad, 0, sizeof(to->sb_pad)); + to->sb_rgcount = cpu_to_be32(from->sb_rgcount); + to->sb_rgextents = cpu_to_be32(from->sb_rgextents); + to->sb_rbmino = cpu_to_be64(0); + to->sb_rsumino = cpu_to_be64(0); + } } /* @@ -965,13 +1128,43 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .verify_write = xfs_sb_write_verify, }; +/* Compute cached rt geometry from the incore sb. 
*/ void -xfs_mount_sb_set_rextsize( +xfs_sb_mount_rextsize( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); + + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) { + rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; + rgs->blklog = mp->m_sb.sb_rgblklog; + rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + } else { + rgs->blocks = 0; + rgs->blklog = 0; + rgs->blkmask = (uint64_t)-1; + } +} + +/* Update incore sb rt extent size, then recompute the cached rt geometry. */ +void +xfs_mount_sb_set_rextsize( + struct xfs_mount *mp, + struct xfs_sb *sbp, + xfs_agblock_t rextsize) +{ + sbp->sb_rextsize = rextsize; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) + sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, + rextsize); + + xfs_sb_mount_rextsize(mp, sbp); } /* @@ -988,6 +1181,8 @@ xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; @@ -996,9 +1191,14 @@ xfs_sb_mount_common( mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - xfs_mount_sb_set_rextsize(mp, sbp); + mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG; + mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG; + + ags->blocks = mp->m_sb.sb_agblocks; + ags->blklog = mp->m_sb.sb_agblklog; + ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog); + + xfs_sb_mount_rextsize(mp, sbp); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false); @@ -1045,11 +1245,6 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. - * - * Do not update sb_frextents here because it is not part of the lazy - * sb counters, despite having a percpu counter. It is always kept - * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() - * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); @@ -1060,6 +1255,16 @@ xfs_log_sb( percpu_counter_sum_positive(&mp->m_fdblocks); } + /* + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This counter can go negative due to the way + * we handle nearly-lockless reservations, so we must use the _positive + * variant here to avoid writing out nonsense frextents. + */ + if (xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = + percpu_counter_sum_positive(&mp->m_frextents); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); @@ -1109,18 +1314,17 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno = 1; + struct xfs_perag *pag = NULL; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. 
*/ - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, 1))) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -1132,7 +1336,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - pag->pag_agno); + pag_agno(pag)); if (!saved_error) saved_error = error; continue; @@ -1146,26 +1350,22 @@ xfs_update_secondary_sbs( xfs_buf_relse(bp); /* don't hold too many buffers at once */ - if (agno % 16) + if (pag_agno(pag) % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, pag->pag_agno); + error, pag_agno(pag)); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); - if (error) { - xfs_warn(mp, - "write error %d updating a secondary superblock near ag %d", - error, agno); - } - + if (error) + xfs_warn(mp, "error %d writing secondary superblocks", error); return saved_error ? saved_error : error; } @@ -1175,10 +1375,12 @@ xfs_update_secondary_sbs( */ int xfs_sync_sb_buf( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool update_rtsb) { struct xfs_trans *tp; struct xfs_buf *bp; + struct xfs_buf *rtsb_bp = NULL; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); @@ -1188,6 +1390,11 @@ xfs_sync_sb_buf( bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); + if (update_rtsb) { + rtsb_bp = xfs_log_rtsb(tp, bp); + if (rtsb_bp) + xfs_trans_bhold(tp, rtsb_bp); + } xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -1196,7 +1403,11 @@ xfs_sync_sb_buf( * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); + if (!error && rtsb_bp) + error = xfs_bwrite(rtsb_bp); out: + if (rtsb_bp) + xfs_buf_relse(rtsb_bp); xfs_buf_relse(bp); return error; } @@ -1283,6 +1494,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; if (xfs_has_exchange_range(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; + if (xfs_has_metadir(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1298,6 +1511,11 @@ xfs_fs_geometry( return; geo->version = XFS_FSOP_GEOM_VERSION_V5; + + if (xfs_has_rtgroups(mp)) { + geo->rgcount = sbp->sb_rgcount; + geo->rgextents = sbp->sb_rgextents; + } } /* Read a secondary superblock. 
*/ diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 885c83755991..34d0dd374e9b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -15,10 +15,11 @@ struct xfs_perag; extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); -extern int xfs_sync_sb_buf(struct xfs_mount *mp); +extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp); void xfs_mount_sb_set_rextsize(struct xfs_mount *mp, - struct xfs_sb *sbp); + struct xfs_sb *sbp, xfs_agblock_t rextsize); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); @@ -43,5 +44,6 @@ bool xfs_validate_stripe_geometry(struct xfs_mount *mp, bool xfs_validate_rt_geometry(struct xfs_sb *sbp); uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 33b84a3a83ff..e7efdb9ceaf3 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -38,7 +38,10 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops; +extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; +extern const struct xfs_buf_ops xfs_rtsb_buf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; @@ -157,6 +160,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_SB_RBLOCKS 0x00000800 #define XFS_TRANS_SB_REXTENTS 0x00001000 #define XFS_TRANS_SB_REXTSLOG 0x00002000 +#define XFS_TRANS_SB_RGCOUNT 0x00004000 /* * Here we centralize the specification of XFS meta-data buffer reference count diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f228127a88ff..fb47a76ead18 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -92,8 +92,10 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; + /* no verification of non-crc buffers */ if (!xfs_has_crc(mp)) - return __this_address; + return NULL; + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 3c40f37e82c7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c 
b/fs/xfs/libxfs/xfs_trans_resv.c index 1a7f95bcf069..bab402340b5d 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -224,7 +224,7 @@ xfs_rtalloc_block_count( xfs_rtxlen_t rtxlen; rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); + rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); return (rtbmp_blocks + 1) * num_ops; } diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index c299b16c9365..1faf04204c5d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -12,6 +12,8 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* @@ -111,7 +113,7 @@ xfs_verify_ino( /* Is this an internal inode number? */ inline bool -xfs_internal_inum( +xfs_is_sb_inum( struct xfs_mount *mp, xfs_ino_t ino) { @@ -129,24 +131,42 @@ xfs_verify_dir_ino( struct xfs_mount *mp, xfs_ino_t ino) { - if (xfs_internal_inum(mp, ino)) + if (xfs_is_sb_inum(mp, ino)) return false; return xfs_verify_ino(mp, ino); } /* - * Verify that an realtime block number pointer doesn't point off the - * end of the realtime device. + * Verify that a realtime block number pointer neither points outside the + * allocatable areas of the rtgroup nor off the end of the realtime + * device. */ inline bool xfs_verify_rtbno( struct xfs_mount *mp, xfs_rtblock_t rtbno) { + if (xfs_has_rtgroups(mp)) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno); + + if (rgno >= mp->m_sb.sb_rgcount) + return false; + if (rtx >= xfs_rtgroup_extents(mp, rgno)) + return false; + if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0) + return false; + return true; + } + return rtbno < mp->m_sb.sb_rblocks; } -/* Verify that a realtime device extent is fully contained inside the volume. */ +/* + * Verify that an allocated realtime device extent neither points outside + * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the + * end of the realtime device. + */ bool xfs_verify_rtbext( struct xfs_mount *mp, @@ -159,7 +179,14 @@ xfs_verify_rtbext( if (!xfs_verify_rtbno(mp, rtbno)) return false; - return xfs_verify_rtbno(mp, rtbno + len - 1); + if (!xfs_verify_rtbno(mp, rtbno + len - 1)) + return false; + + if (xfs_has_rtgroups(mp) && + xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1)) + return false; + + return true; } /* Calculate the range of valid icount values. */ @@ -170,13 +197,12 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a8cd44d03ef6..bf33c2b1e43e 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -9,10 +9,12 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. 
group */ +typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ +typedef uint32_t xfs_rgnumber_t; /* realtime group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ @@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t; #define NULLFILEOFF ((xfs_fileoff_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLRGBLOCK ((xfs_rgblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLRGNUMBER ((xfs_rgnumber_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -212,6 +216,16 @@ enum xbtree_recpacking { XBTREE_RECPACKING_FULL, }; +enum xfs_group_type { + XG_TYPE_AG, + XG_TYPE_RTG, + XG_TYPE_MAX, +} __packed; + +#define XG_TYPE_STRINGS \ + { XG_TYPE_AG, "ag" }, \ + { XG_TYPE_RTG, "rtg" } + /* * Type verifier functions */ @@ -222,7 +236,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); -bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); +bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index f8e5b67128d2..9f8c312dfd3c 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -60,6 +60,32 @@ xchk_superblock_xref( } /* + * Calculate the ondisk superblock size in bytes given the feature set of the + * mounted filesystem (aka the primary sb). This is subtly different from + * the logic in xfs_repair, which computes the size of a secondary sb given the + * featureset listed in the secondary sb. + */ +STATIC size_t +xchk_superblock_ondisk_size( + struct xfs_mount *mp) +{ + if (xfs_has_metadir(mp)) + return offsetofend(struct xfs_dsb, sb_pad); + if (xfs_has_metauuid(mp)) + return offsetofend(struct xfs_dsb, sb_meta_uuid); + if (xfs_has_crc(mp)) + return offsetofend(struct xfs_dsb, sb_lsn); + if (xfs_sb_version_hasmorebits(&mp->m_sb)) + return offsetofend(struct xfs_dsb, sb_bad_features2); + if (xfs_has_logv2(mp)) + return offsetofend(struct xfs_dsb, sb_logsunit); + if (xfs_has_sector(mp)) + return offsetofend(struct xfs_dsb, sb_logsectsize); + /* only support dirv2 or more recent */ + return offsetofend(struct xfs_dsb, sb_dirblklog); +} + +/* * Scrub the filesystem superblock. * * Note: We do /not/ attempt to check AG 0's superblock.
Mount is @@ -75,6 +101,7 @@ xchk_superblock( struct xfs_buf *bp; struct xfs_dsb *sb; struct xfs_perag *pag; + size_t sblen; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -144,11 +171,19 @@ xchk_superblock( if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino)) xchk_block_set_preen(sc, bp); - if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(sc->mp)) { + if (sb->sb_rbmino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_rsumino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino)) + xchk_block_set_preen(sc, bp); + } if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize)) xchk_block_set_corrupt(sc, bp); @@ -224,11 +259,19 @@ xchk_superblock( * sb_icount, sb_ifree, sb_fdblocks, sb_frexents */ - if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_uquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); - if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) - xchk_block_set_preen(sc, bp); + if (sb->sb_gquotino != cpu_to_be64(0)) + xchk_block_set_preen(sc, bp); + } else { + if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino)) + xchk_block_set_preen(sc, bp); + } /* * Skip the quota flags since repair will force quotacheck. @@ -337,8 +380,13 @@ xchk_superblock( if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align)) xchk_block_set_corrupt(sc, bp); - if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) - xchk_block_set_preen(sc, bp); + if (xfs_has_metadir(mp)) { + if (sb->sb_pquotino != cpu_to_be64(0)) + xchk_block_set_corrupt(sc, bp); + } else { + if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino)) + xchk_block_set_preen(sc, bp); + } /* Don't care about sb_lsn */ } @@ -349,9 +397,26 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); } + if (xfs_has_metadir(mp)) { + if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino)) + xchk_block_set_preen(sc, bp); + + if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents)) + xchk_block_set_corrupt(sc, bp); + + if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog) + xchk_block_set_corrupt(sc, bp); + + if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad))) + xchk_block_set_corrupt(sc, bp); + } + /* Everything else must be zero. 
*/ - if (memchr_inv(sb + 1, 0, - BBTOB(bp->b_length) - sizeof(struct xfs_dsb))) + sblen = xchk_superblock_ondisk_size(mp); + if (memchr_inv((char *)sb + sblen, 0, BBTOB(bp->b_length) - sblen)) xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); @@ -434,7 +499,7 @@ xchk_agf_xref_btreeblks( { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; - xfs_agblock_t blocks; + xfs_filblks_t blocks; xfs_agblock_t btreeblks; int error; @@ -483,7 +548,7 @@ xchk_agf_xref_refcblks( struct xfs_scrub *sc) { struct xfs_agf *agf = sc->sa.agf_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; if (!sc->sa.refc_cur) @@ -552,7 +617,7 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ @@ -816,7 +881,7 @@ xchk_agi_xref_fiblocks( struct xfs_scrub *sc) { struct xfs_agi *agi = sc->sa.agi_bp->b_addr; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error = 0; if (!xfs_has_inobtcounts(sc->mp)) @@ -932,7 +997,7 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != pag->block_count) + if (eoag != pag_group(pag)->xg_block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 2f98d90d7fd6..b45d2b32051a 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -208,8 +208,8 @@ xrep_agf_init_header( memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(pag->pag_agno); - agf->agf_length = cpu_to_be32(pag->block_count); + agf->agf_seqno = cpu_to_be32(pag_agno(pag)); + agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -256,7 +256,7 @@ xrep_agf_calc_from_btrees( struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; - xfs_agblock_t blocks; + xfs_filblks_t blocks; int error; /* Update the AGF counters from the bnobt. */ @@ -384,7 +384,7 @@ xrep_agf( * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); if (error) @@ -687,7 +687,7 @@ xrep_agfl_init_header( agfl = XFS_BUF_TO_AGFL(agfl_bp); memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag)); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); /* @@ -741,7 +741,7 @@ xrep_agfl( * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. 
*/ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); if (error) @@ -897,8 +897,8 @@ xrep_agi_init_header( memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(pag->pag_agno); - agi->agi_length = cpu_to_be32(pag->block_count); + agi->agi_seqno = cpu_to_be32(pag_agno(pag)); + agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) @@ -946,7 +946,7 @@ xrep_agi_calc_from_btrees( if (error) goto err; if (xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; error = xfs_btree_count_blocks(cur, &blocks); if (error) @@ -959,7 +959,7 @@ xrep_agi_calc_from_btrees( agi->agi_freecount = cpu_to_be32(freecount); if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { - xfs_agblock_t blocks; + xfs_filblks_t blocks; cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); @@ -1038,12 +1038,10 @@ xrep_iunlink_reload_next( { struct xfs_scrub *sc = ragi->sc; struct xfs_inode *ip; - xfs_ino_t ino; xfs_agino_t ret = NULLAGINO; int error; - ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino); - error = xchk_iget(ragi->sc, ino, &ip); + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip); if (error) return ret; @@ -1114,9 +1112,9 @@ xrep_iunlink_igrab( struct xfs_perag *pag, struct xfs_inode *ip) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) return false; if (!xfs_inode_on_unlinked_list(ip)) @@ -1140,7 +1138,7 @@ xrep_iunlink_visit( unsigned int bucket; int error; - ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno); + ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag)); ASSERT(xfs_inode_on_unlinked_list(ip)); agino = XFS_INO_TO_AGINO(mp, ip->i_ino); @@ -1171,7 +1169,7 @@ xrep_iunlink_mark_incore( struct xrep_agi *ragi) { struct xfs_perag *pag = ragi->sc->sa.pag; - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); uint32_t first_index = 0; bool done = false; unsigned int nr_found = 0; @@ -1211,7 +1209,7 @@ xrep_iunlink_mark_incore( * us to see this inode, so another lookup from the * same index will not find it again. */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) @@ -1278,9 +1276,7 @@ xrep_iunlink_mark_ondisk_rec( * on because we haven't actually scrubbed the inobt or the * inodes yet. */ - error = xchk_iget(ragi->sc, - XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno, - agino), + error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip); if (error) continue; @@ -1539,15 +1535,13 @@ xrep_iunlink_relink_next( ip = xfs_iunlink_lookup(pag, agino); if (!ip) { - xfs_ino_t ino; xfs_agino_t prev_agino; /* * No inode exists in cache. Load it off the disk so that we * can reinsert it into the incore unlinked list. 
*/ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); - error = xchk_iget(sc, ino, &ip); + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); if (error) return -EFSCORRUPTED; @@ -1601,15 +1595,13 @@ xrep_iunlink_relink_prev( ip = xfs_iunlink_lookup(pag, agino); if (!ip) { - xfs_ino_t ino; xfs_agino_t next_agino; /* * No inode exists in cache. Load it off the disk so that we * can reinsert it into the incore unlinked list. */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); - error = xchk_iget(sc, ino, &ip); + error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip); if (error) return -EFSCORRUPTED; @@ -1769,7 +1761,7 @@ xrep_agi( * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AG_DADDR(mp, pag_agno(sc->sa.pag), XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL); if (error) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index d1b8a4997dd2..8b282138097f 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -139,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_alloc_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 30295898cc8a..0433363a90b6 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -132,17 +132,16 @@ int xrep_setup_ag_allocbt( struct xfs_scrub *sc) { + struct xfs_group *xg = pag_group(sc->sa.pag); unsigned int busy_gen; /* * Make sure the busy extent list is clear because we can't put extents * on there twice. */ - busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); - if (xfs_extent_busy_list_empty(sc->sa.pag)) + if (xfs_extent_busy_list_empty(xg, &busy_gen)) return 0; - - return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); + return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); } /* Check for any obvious conflicts in the free extent. */ @@ -210,7 +209,7 @@ xrep_abt_stash( if (error) return error; - trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + trace_xrep_abt_found(sc->sa.pag, &arec); error = xfarray_append(ra->free_records, &arec); if (error) @@ -484,8 +483,8 @@ xrep_abt_reserve_space( ASSERT(arec.ar_blockcount <= UINT_MAX); len = min_t(unsigned int, arec.ar_blockcount, desired); - trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, - arec.ar_startblock, len, XFS_RMAP_OWN_AG); + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock, + len, XFS_RMAP_OWN_AG); error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, arec.ar_startblock, len); @@ -543,7 +542,7 @@ xrep_abt_dispose_one( /* Add a deferred rmap for each extent we used. 
*/ if (resv->used > 0) - xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno, resv->used, XFS_RMAP_OWN_AG); /* @@ -554,8 +553,8 @@ xrep_abt_dispose_one( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, ra->new_bnobt.oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + ra->new_bnobt.oinfo.oi_owner); error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); @@ -849,6 +848,7 @@ xrep_allocbt( { struct xrep_abt *ra; struct xfs_mount *mp = sc->mp; + unsigned int busy_gen; char *descr; int error; @@ -869,7 +869,7 @@ xrep_allocbt( * on there twice. In theory we cleared this before we started, but * let's not risk the filesystem. */ - if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) { error = -EDEADLOCK; goto out_ra; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 5ab2ac53c920..7e00312225ed 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,6 +19,7 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_rtgroup.h" #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -314,8 +315,20 @@ xchk_bmap_rt_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + int error; + + error = xchk_rtgroup_init_existing(info->sc, + xfs_rtb_to_rgno(ip->i_mount, irec->br_startblock), + &info->sc->sr); + if (!xchk_fblock_process_error(info->sc, info->whichfork, + irec->br_startoff, &error)) + return; + + xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL); xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, irec->br_blockcount); + + xchk_rtgroup_free(info->sc, &info->sc->sr); } /* Cross-reference a single datadev extent record. */ @@ -600,8 +613,8 @@ xchk_bmap_check_rmap( if (irec.br_startoff != check_rec.rm_offset) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); - if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.pag->pag_agno, + if (irec.br_startblock != + xfs_agbno_to_fsb(to_perag(cur->bc_group), check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, check_rec.rm_offset); @@ -761,11 +774,10 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error; - for_each_perag(sc->mp, agno, pag) { + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { @@ -822,9 +834,12 @@ xchk_bmap_iext_mapping( /* Are these two mappings contiguous with each other? */ static inline bool xchk_are_bmaps_contiguous( + const struct xchk_bmap_info *info, const struct xfs_bmbt_irec *b1, const struct xfs_bmbt_irec *b2) { + struct xfs_mount *mp = info->sc->mp; + /* Don't try to combine unallocated mappings. */ if (!xfs_bmap_is_real_extent(b1)) return false; @@ -838,6 +853,17 @@ xchk_are_bmaps_contiguous( return false; if (b1->br_state != b2->br_state) return false; + + /* + * Don't combine bmaps that would cross rtgroup boundaries. This is a + * valid state, but if combined they will fail rtb extent checks. 
+ */ + if (info->is_rt && xfs_has_rtgroups(mp)) { + if (xfs_rtb_to_rgno(mp, b1->br_startblock) != + xfs_rtb_to_rgno(mp, b2->br_startblock)) + return false; + } + return true; } @@ -875,7 +901,7 @@ xchk_bmap_iext_iter( * that we just read, if possible. */ while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { - if (!xchk_are_bmaps_contiguous(irec, &got)) + if (!xchk_are_bmaps_contiguous(info, irec, &got)) break; if (!xchk_bmap_iext_mapping(info, &got)) { diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index 49dc38acc66b..7c4955482641 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -196,7 +196,7 @@ xrep_bmap_check_fork_rmap( return -EFSCORRUPTED; /* Check that this is within the AG. */ - if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + if (!xfs_verify_agbext(to_perag(cur->bc_group), rec->rm_startblock, rec->rm_blockcount)) return -EFSCORRUPTED; @@ -237,7 +237,6 @@ xrep_bmap_walk_rmap( void *priv) { struct xrep_bmap *rb = priv; - struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t fsbno; int error = 0; @@ -269,8 +268,7 @@ xrep_bmap_walk_rmap( if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) return -EFSCORRUPTED; - fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); + fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), rec->rm_startblock); if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { rb->old_bmbt_block_count += rec->rm_blockcount; @@ -409,12 +407,11 @@ xrep_bmap_find_mappings( struct xrep_bmap *rb) { struct xfs_scrub *sc = rb->sc; - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error = 0; /* Iterate the rmaps for extents. */ - for_each_perag(sc->mp, agno, pag) { + while ((pag = xfs_perag_next(sc->mp, pag))) { error = xrep_bmap_scan_ag(rb, pag); if (error) { xfs_perag_rele(pag); @@ -801,7 +798,7 @@ xrep_bmap( { struct xrep_bmap *rb; char *descr; - unsigned int max_bmbt_recs; + xfs_extnum_t max_bmbt_recs; bool large_extcount; int error = 0; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 22f5f1a9d3f0..5cbd94b56582 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -34,11 +34,13 @@ #include "xfs_quota.h" #include "xfs_exchmaps.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/tempfile.h" /* Common code for the metadata scrubbers. */ @@ -121,6 +123,17 @@ xchk_process_error( } bool +xchk_process_rt_error( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, + int *error) +{ + return __xchk_process_error(sc, rgno, rgbno, error, + XFS_SCRUB_OFLAG_CORRUPT, __return_address); +} + +bool xchk_xref_process_error( struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -513,7 +526,7 @@ xchk_perag_drain_and_lock( * Obviously, this should be slanted against scrub and in favor * of runtime threads. 
*/ - if (!xfs_perag_intent_busy(sa->pag)) + if (!xfs_group_intent_busy(pag_group(sa->pag))) return 0; if (sa->agf_bp) { @@ -528,7 +541,7 @@ xchk_perag_drain_and_lock( if (!(sc->flags & XCHK_FSGATES_DRAIN)) return -ECHRNG; - error = xfs_perag_intent_drain(sa->pag); + error = xfs_group_intent_drain(pag_group(sa->pag)); if (error == -ERESTARTSYS) error = -EINTR; } while (!error); @@ -683,6 +696,72 @@ xchk_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* + * For scrubbing a realtime group, grab all the in-core resources we'll need to + * check the metadata, which means taking the ILOCK of the realtime group's + * metadata inodes. Callers must not join these inodes to the transaction with + * non-zero lockflags or concurrency problems will result. The @rtglock_flags + * argument takes XFS_RTGLOCK_* flags. + */ +int +xchk_rtgroup_init( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg == NULL); + ASSERT(sr->rtlock_flags == 0); + + sr->rtg = xfs_rtgroup_get(sc->mp, rgno); + if (!sr->rtg) + return -ENOENT; + return 0; +} + +void +xchk_rtgroup_lock( + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + xfs_rtgroup_lock(sr->rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; +} + +/* + * Unlock the realtime group. This must be done /after/ committing (or + * cancelling) the scrub transaction. + */ +static void +xchk_rtgroup_unlock( + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + if (sr->rtlock_flags) { + xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags); + sr->rtlock_flags = 0; + } +} + +/* + * Unlock the realtime group and release its resources. This must be done + * /after/ committing (or cancelling) the scrub transaction. + */ +void +xchk_rtgroup_free( + struct xfs_scrub *sc, + struct xchk_rt *sr) +{ + ASSERT(sr->rtg != NULL); + + xchk_rtgroup_unlock(sr); + + xfs_rtgroup_put(sr->rtg); + sr->rtg = NULL; +} +#endif /* CONFIG_XFS_RT */ + /* Per-scrubber setup functions */ void @@ -947,9 +1026,15 @@ xchk_iget_for_scrubbing( if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) return xchk_install_live_inode(sc, ip_in); - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. + */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -1084,6 +1169,10 @@ xchk_setup_inode_contents( if (error) return error; + error = xrep_tempfile_adjust_directory_tree(sc); + if (error) + return error; + /* Lock the inode so the VFS cannot touch this file. */ xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1239,12 +1328,6 @@ xchk_metadata_inode_forks( return 0; } - /* They also should never have extended attributes. */ - if (xfs_inode_hasattr(sc->ip)) { - xchk_ino_set_corrupt(sc, sc->ip->i_ino); - return 0; - } - /* Invoke the data fork scrubber. */ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) @@ -1261,6 +1344,21 @@ xchk_metadata_inode_forks( xchk_ino_set_corrupt(sc, sc->ip->i_ino); } + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. 
+ */ + if (xfs_inode_hasattr(sc->ip)) { + if (!xfs_has_metadir(sc->mp)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + } + return 0; } @@ -1336,7 +1434,7 @@ xchk_inode_is_allocated( } /* reject inode numbers outside existing AGs */ - ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + ino = xfs_agino_to_ino(pag, agino); if (!xfs_verify_ino(mp, ino)) return -EINVAL; @@ -1446,3 +1544,32 @@ out_rcu: rcu_read_unlock(); return error; } + +/* Is this inode a root directory for either tree? */ +bool +xchk_inode_is_dirtree_root(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + return ip == mp->m_rootip || + (xfs_has_metadir(mp) && ip == mp->m_metadirip); +} + +/* Does the superblock point down to this inode? */ +bool +xchk_inode_is_sb_rooted(const struct xfs_inode *ip) +{ + return xchk_inode_is_dirtree_root(ip) || + xfs_is_sb_inum(ip->i_mount, ip->i_ino); +} + +/* What is the root directory inumber for this inode? */ +xfs_ino_t +xchk_inode_rootdir_inum(const struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (xfs_is_metadir_inode(ip)) + return mp->m_metadirip->i_ino; + return mp->m_rootip->i_ino; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 47148cc4a833..9ff3cafd8679 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -12,6 +12,8 @@ void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); +bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + xfs_rgblock_t rgbno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset, int *error); @@ -73,12 +75,15 @@ int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); int xchk_setup_dirtree(struct xfs_scrub *sc); +int xchk_setup_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_setup_rtbitmap(struct xfs_scrub *sc); int xchk_setup_rtsummary(struct xfs_scrub *sc); +int xchk_setup_rgsuperblock(struct xfs_scrub *sc); #else # define xchk_setup_rtbitmap xchk_setup_nothing # define xchk_setup_rtsummary xchk_setup_nothing +# define xchk_setup_rgsuperblock xchk_setup_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); @@ -117,6 +122,34 @@ xchk_ag_init_existing( return error == -ENOENT ? -EFSCORRUPTED : error; } +#ifdef CONFIG_XFS_RT + +/* All the locks we need to check an rtgroup. */ +#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP) + +int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, + struct xchk_rt *sr); + +static inline int +xchk_rtgroup_init_existing( + struct xfs_scrub *sc, + xfs_rgnumber_t rgno, + struct xchk_rt *sr) +{ + int error = xchk_rtgroup_init(sc, rgno, sr); + + return error == -ENOENT ? 
-EFSCORRUPTED : error; +} + +void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags); +void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); +#else +# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) +# define xchk_rtgroup_lock(sc, lockflags) do { } while (0) +# define xchk_rtgroup_free(sc, sr) do { } while (0) +#endif /* CONFIG_XFS_RT */ + int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); @@ -216,7 +249,8 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_ag_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ (sc)->mp->m_super->s_id, \ - (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + (sc)->sa.pag ? \ + pag_agno((sc)->sa.pag) : (sc)->sm->sm_agno, \ ##__VA_ARGS__) #define xchk_xfile_ino_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ @@ -241,4 +275,8 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, bool *inuse); +bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip); +bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip); +xfs_ino_t xchk_inode_rootdir_inum(const struct xfs_inode *ip); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 4de3f0f40f48..5b6194cef3e5 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -137,7 +137,6 @@ xrep_cow_mark_shared_staging( { struct xrep_cow *xc = priv; struct xfs_refcount_irec rrec; - xfs_fsblock_t fsbno; if (!xfs_refcount_check_domain(rec) || rec->rc_domain != XFS_REFC_DOMAIN_SHARED) @@ -145,9 +144,10 @@ xrep_cow_mark_shared_staging( xrep_cow_trim_refcount(xc, &rrec, rec); - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - rrec.rc_startblock); - return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); + return xrep_cow_mark_file_range(xc, + xfs_agbno_to_fsb(to_perag(cur->bc_group), + rrec.rc_startblock), + rrec.rc_blockcount); } /* @@ -177,9 +177,9 @@ xrep_cow_mark_missing_staging( if (xc->next_bno >= rrec.rc_startblock) goto next; + error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, - xc->next_bno), + xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno), rrec.rc_startblock - xc->next_bno); if (error) return error; @@ -200,7 +200,6 @@ xrep_cow_mark_missing_staging_rmap( void *priv) { struct xrep_cow *xc = priv; - xfs_fsblock_t fsbno; xfs_agblock_t rec_bno; xfs_extlen_t rec_len; unsigned int adj; @@ -222,8 +221,9 @@ xrep_cow_mark_missing_staging_rmap( rec_len -= adj; } - fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno); - return xrep_cow_mark_file_range(xc, fsbno, rec_len); + return xrep_cow_mark_file_range(xc, + xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno), + rec_len); } /* @@ -275,8 +275,7 @@ xrep_cow_find_bad( if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { error = xrep_cow_mark_file_range(xc, - XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, - xc->next_bno), + xfs_agbno_to_fsb(pag, xc->next_bno), xc->irec_startbno + xc->irec.br_blockcount - xc->next_bno); if (error) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index bf9199e8df63..c877bde71e62 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -100,6 +100,14 @@ xchk_dir_check_ftype( if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != 
ftype) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + /* + * Metadata and regular inodes cannot cross trees. This property + * cannot change without a full inode free and realloc cycle, so it's + * safe to check this without holding locks. + */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* @@ -253,7 +261,7 @@ xchk_dir_actor( * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c index 64679fe08446..249313882108 100644 --- a/fs/xfs/scrub/dir_repair.c +++ b/fs/xfs/scrub/dir_repair.c @@ -415,6 +415,12 @@ xrep_dir_salvage_entry( if (error) return 0; + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) { + xchk_irele(sc, ip); + return 0; + } + xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); xchk_irele(sc, ip); @@ -1270,7 +1276,7 @@ xrep_dir_scan_dirtree( int error; /* Roots of directory trees are their own parents. */ - if (sc->ip == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(sc->ip)) xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino); /* @@ -1632,6 +1638,7 @@ xrep_dir_swap( struct xrep_dir *rd) { struct xfs_scrub *sc = rd->sc; + xfs_ino_t ino; bool ip_local, temp_local; int error = 0; @@ -1649,14 +1656,17 @@ xrep_dir_swap( /* * Reset the temporary directory's '..' entry to point to the parent - * that we found. The temporary directory was created with the root - * directory as the parent, so we can skip this if repairing a - * subdirectory of the root. + * that we found. The dirent replace code asserts if the dirent + * already points at the new inumber, so we look it up here. * * It's also possible that this replacement could also expand a sf * tempdir into block format. */ - if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) { + error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino); + if (error) + return error; + + if (rd->pscan.parent_ino != ino) { error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot, rd->pscan.parent_ino, rd->tx.req.resblks); if (error) diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c index bde58fb561ea..3a9cdf8738b6 100644 --- a/fs/xfs/scrub/dirtree.c +++ b/fs/xfs/scrub/dirtree.c @@ -362,7 +362,8 @@ xchk_dirpath_set_outcome( STATIC int xchk_dirpath_step_up( struct xchk_dirtree *dl, - struct xchk_dirpath *path) + struct xchk_dirpath *path, + bool is_metadir) { struct xfs_scrub *sc = dl->sc; struct xfs_inode *dp; @@ -435,6 +436,14 @@ xchk_dirpath_step_up( goto out_scanlock; } + /* Parent must be in the same directory tree. */ + if (is_metadir != xfs_is_metadir_inode(dp)) { + trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr, + path->nr_steps, &dl->xname, &dl->pptr_rec); + error = -EFSCORRUPTED; + goto out_scanlock; + } + /* * If the extended attributes look as though they has been zapped by * the inode record repair code, we cannot scan for parent pointers. @@ -508,6 +517,7 @@ xchk_dirpath_walk_upwards( struct xchk_dirpath *path) { struct xfs_scrub *sc = dl->sc; + bool is_metadir; int error; ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL); @@ -538,6 +548,7 @@ xchk_dirpath_walk_upwards( * ILOCK state is no longer tracked in the scrub context. Hence we * must drop @sc->ip's ILOCK during the walk. 
*/ + is_metadir = xfs_is_metadir_inode(sc->ip); mutex_unlock(&dl->lock); xchk_iunlock(sc, XFS_ILOCK_EXCL); @@ -547,7 +558,7 @@ xchk_dirpath_walk_upwards( * If we see any kind of error here (including corruptions), the parent * pointer of @sc->ip is corrupt. Stop the whole scan. */ - error = xchk_dirpath_step_up(dl, path); + error = xchk_dirpath_step_up(dl, path, is_metadir); if (error) { xchk_ilock(sc, XFS_ILOCK_EXCL); mutex_lock(&dl->lock); @@ -560,7 +571,7 @@ xchk_dirpath_walk_upwards( * *somewhere* in the path, but we don't need to stop scanning. */ while (!error && path->outcome == XCHK_DIRPATH_SCANNING) - error = xchk_dirpath_step_up(dl, path); + error = xchk_dirpath_step_up(dl, path, is_metadir); /* Retake the locks we had, mark paths, etc. */ xchk_ilock(sc, XFS_ILOCK_EXCL); @@ -917,7 +928,7 @@ xchk_dirtree( * scan, because the hook doesn't detach until after sc->ip gets * released during teardown. */ - dl->root_ino = sc->mp->m_rootip->i_ino; + dl->root_ino = xchk_inode_rootdir_inum(sc->ip); dl->scan_ino = sc->ip->i_ino; trace_xchk_dirtree_start(sc->ip, sc->sm, 0); @@ -983,3 +994,16 @@ out: trace_xchk_dirtree_done(sc->ip, sc->sm, error); return error; } + +/* Does the directory targeted by this scrub have no parents? */ +bool +xchk_dirtree_parentless(const struct xchk_dirtree *dl) +{ + struct xfs_scrub *sc = dl->sc; + + if (xchk_inode_is_dirtree_root(sc->ip)) + return true; + if (VFS_I(sc->ip)->i_nlink == 0) + return true; + return false; +} diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h index 1e1686365c61..9e5d95492717 100644 --- a/fs/xfs/scrub/dirtree.h +++ b/fs/xfs/scrub/dirtree.h @@ -156,17 +156,7 @@ struct xchk_dirtree { #define xchk_dirtree_for_each_path(dl, path) \ list_for_each_entry((path), &(dl)->path_list, list) -static inline bool -xchk_dirtree_parentless(const struct xchk_dirtree *dl) -{ - struct xfs_scrub *sc = dl->sc; - - if (sc->ip == sc->mp->m_rootip) - return true; - if (VFS_I(sc->ip)->i_nlink == 0) - return true; - return false; -} +bool xchk_dirtree_parentless(const struct xchk_dirtree *dl); int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl); int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip, diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c index 01766041ba2c..84487072b6dd 100644 --- a/fs/xfs/scrub/findparent.c +++ b/fs/xfs/scrub/findparent.c @@ -172,6 +172,10 @@ xrep_findparent_walk_directory( */ lock_mode = xfs_ilock_data_map_shared(dp); + /* Don't mix metadata and regular directory trees. */ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) + goto out_unlock; + /* * If this directory is known to be sick, we cannot scan it reliably * and must abort. @@ -362,15 +366,24 @@ xrep_findparent_confirm( }; int error; - /* - * The root directory always points to itself. Unlinked dirs can point - * anywhere, so we point them at the root dir too. - */ - if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) { + /* The root directory always points to itself. */ + if (sc->ip == sc->mp->m_rootip) { *parent_ino = sc->mp->m_sb.sb_rootino; return 0; } + /* The metadata root directory always points to itself. */ + if (sc->ip == sc->mp->m_metadirip) { + *parent_ino = sc->mp->m_sb.sb_metadirino; + return 0; + } + + /* Unlinked dirs can point anywhere; point them up to the root dir. */ + if (VFS_I(sc->ip)->i_nlink == 0) { + *parent_ino = xchk_inode_rootdir_inum(sc->ip); + return 0; + } + /* Reject garbage parent inode numbers and self-referential parents.
*/ if (*parent_ino == NULLFSINO) return 0; @@ -412,8 +425,11 @@ xrep_findparent_self_reference( if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino) return sc->mp->m_sb.sb_rootino; + if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino) + return sc->mp->m_sb.sb_metadirino; + if (VFS_I(sc->ip)->i_nlink == 0) - return sc->mp->m_sb.sb_rootino; + return xchk_inode_rootdir_inum(sc->ip); return NULLFSINO; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 1d3e98346933..ca23cf4db6c5 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -19,6 +19,7 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_icache.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -74,10 +75,9 @@ xchk_fscount_warmup( struct xfs_buf *agi_bp = NULL; struct xfs_buf *agf_bp = NULL; struct xfs_perag *pag = NULL; - xfs_agnumber_t agno; int error = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; if (xfs_perag_initialised_agi(pag) && @@ -261,7 +261,7 @@ xchk_fscount_btreeblks( struct xchk_fscounters *fsc, xfs_agnumber_t agno) { - xfs_extlen_t blocks; + xfs_filblks_t blocks; int error; error = xchk_ag_init_existing(sc, agno, &sc->sa); @@ -295,9 +295,8 @@ xchk_fscount_aggregate_agcounts( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; uint64_t delayed; - xfs_agnumber_t agno; int tries = 8; int error = 0; @@ -306,7 +305,7 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { if (xchk_should_terminate(sc, &error)) break; @@ -327,7 +326,7 @@ retry: if (xfs_has_lazysbcount(sc->mp)) { fsc->fdblocks += pag->pagf_btreeblks; } else { - error = xchk_fscount_btreeblks(sc, fsc, agno); + error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag)); if (error) break; } @@ -388,7 +387,7 @@ retry: #ifdef CONFIG_XFS_RT STATIC int xchk_fscount_add_frextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -409,6 +408,7 @@ xchk_fscount_count_frextents( struct xchk_fscounters *fsc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = NULL; int error; fsc->frextents = 0; @@ -416,19 +416,20 @@ xchk_fscount_count_frextents( if (!xfs_has_realtime(mp)) return 0; - xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_all(sc->mp, sc->tp, - xchk_fscount_add_frextent, fsc); - if (error) { - xchk_set_incomplete(sc); - goto out_unlock; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, sc->tp, + xchk_fscount_add_frextent, fsc); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xchk_set_incomplete(sc); + xfs_rtgroup_rele(rtg); + return error; + } } fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents); - -out_unlock: - xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP); - return error; + return 0; } #else STATIC int diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index 469bf645dbea..cda13447a373 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -68,15 +68,16 @@ xrep_fscounters( /* * Online repair is only supported on v5 file systems, which require - * lazy sb counters and thus no update of sb_fdblocks here. 
But as of - * now we don't support lazy counting sb_frextents yet, and thus need - * to also update it directly here. And for that we need to keep + * lazy sb counters and thus no update of sb_fdblocks here. But + * sb_frextents only uses a lazy counter with rtgroups, and thus needs + * to be updated directly here otherwise. And for that we need to keep * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ percpu_counter_set(&mp->m_frextents, fsc->frextents - fsc->frextents_delayed); - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index b712a8bd34f5..ccc6ca5934ca 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -12,6 +12,7 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_health.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/health.h" #include "scrub/common.h" @@ -70,10 +71,11 @@ /* Map our scrub type to a sick mask and a set of health update functions. */ enum xchk_health_group { - XHG_FS = 1, - XHG_RT, + XHG_NONE = 1, + XHG_FS, XHG_AG, XHG_INO, + XHG_RTGROUP, }; struct xchk_health_map { @@ -82,6 +84,7 @@ struct xchk_health_map { }; static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_PROBE] = { XHG_NONE, 0 }, [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, @@ -100,8 +103,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, - [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, - [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RTGROUP, XFS_SICK_RG_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RTGROUP, XFS_SICK_RG_SUMMARY }, [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, @@ -109,6 +112,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, + [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, + [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, }; /* Return the health status mask for this scrub type. 
*/ @@ -130,7 +135,7 @@ xchk_mark_healthy_if_clean( { if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT))) - sc->sick_mask |= mask; + sc->healthy_mask |= mask; } /* @@ -160,13 +165,14 @@ STATIC void xchk_mark_all_healthy( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); - xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); - for_each_perag(mp, agno, pag) - xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); + while ((pag = xfs_perag_next(mp, pag))) + xfs_group_mark_healthy(pag_group(pag), XFS_SICK_AG_INDIRECT); + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_group_mark_healthy(rtg_group(rtg), XFS_SICK_RG_INDIRECT); } /* @@ -184,6 +190,8 @@ xchk_update_health( struct xfs_scrub *sc) { struct xfs_perag *pag; + struct xfs_rtgroup *rtg; + unsigned int mask = sc->sick_mask; bool bad; /* @@ -198,49 +206,57 @@ xchk_update_health( return; } - if (!sc->sick_mask) - return; - bad = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | XFS_SCRUB_OFLAG_XCORRUPT)); + if (!bad) + mask |= sc->healthy_mask; switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_NONE: + break; case XHG_AG: + if (!mask) + return; pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_corrupt(pag, sc->sick_mask); + xfs_group_mark_corrupt(pag_group(pag), mask); else - xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_group_mark_healthy(pag_group(pag), mask); xfs_perag_put(pag); break; case XHG_INO: if (!sc->ip) return; - if (bad) { - unsigned int mask = sc->sick_mask; - - /* - * If we're coming in for repairs then we don't want - * sickness flags to propagate to the incore health - * status if the inode gets inactivated before we can - * fix it. - */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - mask |= XFS_SICK_INO_FORGET; + /* + * If we're coming in for repairs then we don't want sickness + * flags to propagate to the incore health status if the inode + * gets inactivated before we can fix it. 
+ */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + if (!mask) + return; + if (bad) xfs_inode_mark_corrupt(sc->ip, mask); - } else - xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, mask); break; case XHG_FS: + if (!mask) + return; if (bad) - xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, mask); else - xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + xfs_fs_mark_healthy(sc->mp, mask); break; - case XHG_RT: + case XHG_RTGROUP: + if (!mask) + return; + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); + xfs_group_mark_corrupt(rtg_group(rtg), mask); else - xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + xfs_group_mark_healthy(rtg_group(rtg), mask); + xfs_rtgroup_put(rtg); break; default: ASSERT(0); @@ -277,7 +293,7 @@ xchk_ag_btree_del_cursor_if_sick( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { + if (xfs_group_has_sickness((*curp)->bc_group, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); *curp = NULL; @@ -294,9 +310,8 @@ xchk_health_record( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; - + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; unsigned int sick; unsigned int checked; @@ -304,15 +319,17 @@ xchk_health_record( if (sick & XFS_SICK_FS_PRIMARY) xchk_set_corrupt(sc); - xfs_rt_measure_sickness(mp, &sick, &checked); - if (sick & XFS_SICK_RT_PRIMARY) - xchk_set_corrupt(sc); - - for_each_perag(mp, agno, pag) { - xfs_ag_measure_sickness(pag, &sick, &checked); + while ((pag = xfs_perag_next(mp, pag))) { + xfs_group_measure_sickness(pag_group(pag), &sick, &checked); if (sick & XFS_SICK_AG_PRIMARY) xchk_set_corrupt(sc); } + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + if (sick & XFS_SICK_RG_PRIMARY) + xchk_set_corrupt(sc); + } + return 0; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 750d7b0cd25a..4dc7c83dc08a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -258,7 +258,7 @@ xchk_iallocbt_chunk( { struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xfs_perag *pag = to_perag(bs->cur->bc_group); xfs_agblock_t agbno; xfs_extlen_t len; @@ -303,7 +303,6 @@ xchk_iallocbt_check_cluster_ifree( unsigned int irec_ino, struct xfs_dinode *dip) { - struct xfs_mount *mp = bs->cur->bc_mp; xfs_ino_t fsino; xfs_agino_t agino; bool irec_free; @@ -319,7 +318,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. */ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); + fsino = xfs_agino_to_ino(to_perag(bs->cur->bc_group), agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -368,7 +367,6 @@ xchk_iallocbt_check_cluster( struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -396,7 +394,7 @@ xchk_iallocbt_check_cluster( * ir_startino can be large enough to make im_boffset nonzero. 
*/ ir_holemask = (irec->ir_holemask & cluster_mask); - imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_blkno = xfs_agbno_to_daddr(to_perag(bs->cur->bc_group), agbno); imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) << mp->m_sb.sb_inodelog; @@ -407,9 +405,9 @@ xchk_iallocbt_check_cluster( return 0; } - trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, - imap.im_blkno, imap.im_len, cluster_base, nr_inodes, - cluster_mask, ir_holemask, + trace_xchk_iallocbt_check_cluster(to_perag(bs->cur->bc_group), + irec->ir_startino, imap.im_blkno, imap.im_len, + cluster_base, nr_inodes, cluster_mask, ir_holemask, XFS_INO_TO_OFFSET(mp, irec->ir_startino + cluster_base)); @@ -585,7 +583,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_inobt_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -652,8 +650,8 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc) { xfs_filblks_t blocks; - xfs_extlen_t inobt_blocks = 0; - xfs_extlen_t finobt_blocks = 0; + xfs_filblks_t inobt_blocks = 0; + xfs_filblks_t finobt_blocks = 0; int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index a00ec7ae1792..14e48d3f1912 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -146,15 +146,12 @@ xrep_ibt_check_ifree( struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp; struct xfs_dinode *dip; - xfs_ino_t fsino; xfs_agino_t agino; - xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno; unsigned int cluster_buf_base; unsigned int offset; int error; agino = cluster_ag_base + cluster_index; - fsino = XFS_AGINO_TO_INO(mp, agno, agino); /* Inode uncached or half assembled, read disk buffer */ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base); @@ -165,7 +162,8 @@ xrep_ibt_check_ifree( if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) return -EFSCORRUPTED; - if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino) + if (dip->di_version >= 3 && + be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino)) return -EFSCORRUPTED; /* Will the in-core inode tell us if it's in use? */ @@ -194,7 +192,7 @@ xrep_ibt_stash( if (ri->rie.ir_freecount > 0) ri->finobt_recs++; - trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie); error = xfarray_append(ri->inode_records, &ri->rie); if (error) @@ -307,7 +305,7 @@ xrep_ibt_process_cluster( * inobt because imap_to_bp directly maps the buffer without touching * either inode btree. 
*/ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno); imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); imap.im_boffset = 0; error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); @@ -423,9 +421,7 @@ xrep_ibt_record_inode_blocks( if (error) return error; - trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, - rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, - rec->rm_offset, rec->rm_flags); + trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec); /* * Record the free/hole masks for each inode cluster that could be @@ -634,7 +630,6 @@ xrep_ibt_build_new_trees( struct xfs_scrub *sc = ri->sc; struct xfs_btree_cur *ino_cur; struct xfs_btree_cur *fino_cur = NULL; - xfs_fsblock_t fsbno; bool need_finobt; int error; @@ -656,9 +651,8 @@ xrep_ibt_build_new_trees( * * Start by setting up the inobt staging cursor. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_IBT_BLOCK(sc->mp)), - xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, + xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)), XFS_AG_RESV_NONE); ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; @@ -677,10 +671,9 @@ xrep_ibt_build_new_trees( if (sc->mp->m_finobt_nores) resv = XFS_AG_RESV_NONE; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_FIBT_BLOCK(sc->mp)), xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, - fsbno, resv); + xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)), + resv); ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; @@ -821,7 +814,7 @@ xrep_iallocbt( sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; /* Set up enough storage to handle an AG with nothing but inodes. */ - xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino); last_agino /= XFS_INODES_PER_CHUNK; descr = xchk_xfile_ag_descr(sc, "inode index records"); error = xfarray_create(descr, last_agino, diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index d32716fb2fec..25ee66e7649d 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -60,6 +60,22 @@ xchk_install_handle_iscrub( if (error) return error; + /* + * Don't allow scrubbing by handle of any non-directory inode records + * in the metadata directory tree. We don't know if any of the scans + * launched by this scrubber will end up indirectly trying to lock this + * file. + * + * Scrubbers of inode-rooted metadata files (e.g. quota files) will + * attach all the resources needed to scrub the inode and call + * xchk_inode directly. Userspace cannot call this directly. + */ + if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) { + xchk_irele(sc, ip); + sc->ip = NULL; + return -ENOENT; + } + return xchk_prepare_iscrub(sc); } @@ -94,9 +110,15 @@ xchk_setup_inode( return xchk_prepare_iscrub(sc); } - /* Reject internal metadata files and obviously bad inode numbers. */ - if (xfs_internal_inum(mp, sc->sm->sm_ino)) + /* + * On pre-metadir filesystems, reject internal metadata files. For + * metadir filesystems, limited scrubbing of any file in the metadata + * directory tree by handle is allowed, because that is the only way to + * validate the lack of parent pointers in the sb-root metadata inodes. 
+ */ + if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino)) return -ENOENT; + /* Reject obviously bad inode numbers. */ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; @@ -421,8 +443,13 @@ xchk_dinode( break; case 2: case 3: - if (dip->di_onlink != 0) - xchk_ino_set_corrupt(sc, ino); + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + xchk_ino_set_corrupt(sc, ino); + } else { + if (dip->di_metatype != 0) + xchk_ino_set_corrupt(sc, ino); + } if (dip->di_mode == 0 && sc->ip) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 3e45b9b72312..5a58ddd27bd2 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -521,10 +521,17 @@ STATIC void xrep_dinode_nlinks( struct xfs_dinode *dip) { - if (dip->di_version > 1) - dip->di_onlink = 0; - else + if (dip->di_version < 2) { dip->di_nlink = 0; + return; + } + + if (xfs_dinode_is_metadir(dip)) { + if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) + dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN); + } else { + dip->di_metatype = 0; + } } /* Fix any conflicting flags that the verifiers complain about. */ @@ -565,6 +572,16 @@ xrep_dinode_flags( dip->di_nrext64_pad = 0; else if (dip->di_version >= 3) dip->di_v3_pad = 0; + + if (flags2 & XFS_DIFLAG2_METADATA) { + xfs_failaddr_t fa; + + fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags, + flags2); + if (fa) + flags2 &= ~XFS_DIFLAG2_METADATA; + } + dip->di_flags = cpu_to_be16(flags); dip->di_flags2 = cpu_to_be64(flags2); } @@ -761,14 +778,13 @@ STATIC int xrep_dinode_count_rmaps( struct xrep_inode *ri) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error; if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) return -EOPNOTSUPP; - for_each_perag(ri->sc->mp, agno, pag) { + while ((pag = xfs_perag_next(ri->sc->mp, pag))) { error = xrep_dinode_count_ag_rmaps(ri, pag); if (error) { xfs_perag_rele(pag); @@ -1755,15 +1771,8 @@ xrep_inode_pptr( if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) return 0; - /* The root directory doesn't have a parent pointer. */ - if (ip == mp->m_rootip) - return 0; - - /* - * Metadata inodes are rooted in the superblock and do not have any - * parents. - */ - if (xfs_is_metadata_inode(ip)) + /* Children of the superblock do not have parent pointers. */ + if (xchk_inode_is_sb_rooted(ip)) return 0; /* Inode already has an attr fork; no further work possible here. */ diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index cf9d983667ce..84f117667ca2 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -67,7 +67,7 @@ xchk_iscan_mask_skipino( xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); - if (pag->pag_agno != skip_agno) + if (pag_agno(pag) != skip_agno) return; if (skip_agino < rec->ir_startino) return; @@ -95,7 +95,7 @@ xchk_iscan_find_next( struct xfs_btree_cur *cur; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = sc->tp; - xfs_agnumber_t agno = pag->pag_agno; + xfs_agnumber_t agno = pag_agno(pag); xfs_agino_t lastino = NULLAGINO; xfs_agino_t first, last; xfs_agino_t agino = *cursor; diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c new file mode 100644 index 000000000000..c678cba1ffc3 --- /dev/null +++ b/fs/xfs/scrub/metapath.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. 
All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_metafile.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dir2.h" +#include "xfs_parent.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_attr.h" +#include "xfs_rtgroup.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" +#include "scrub/repair.h" + +/* + * Metadata Directory Tree Paths + * ============================= + * + * A filesystem with metadir enabled expects to find metadata structures + * attached to files that are accessible by walking a path down the metadata + * directory tree. Given the metadir path and the incore inode storing the + * metadata, this scrubber ensures that the ondisk metadir path points to the + * ondisk inode represented by the incore inode. + */ + +struct xchk_metapath { + struct xfs_scrub *sc; + + /* Name for lookup */ + struct xfs_name xname; + + /* Directory update for repairs */ + struct xfs_dir_update du; + + /* Path down to this metadata file from the parent directory */ + const char *path; + + /* Directory parent of the metadata file. */ + struct xfs_inode *dp; + + /* Locks held on dp */ + unsigned int dp_ilock_flags; + + /* Transaction block reservations */ + unsigned int link_resblks; + unsigned int unlink_resblks; + + /* Parent pointer updates */ + struct xfs_parent_args link_ppargs; + struct xfs_parent_args unlink_ppargs; + + /* Scratchpads for removing links */ + struct xfs_da_args pptr_args; +}; + +/* Release resources tracked in the buffer. */ +static inline void +xchk_metapath_cleanup( + void *buf) +{ + struct xchk_metapath *mpath = buf; + + if (mpath->dp_ilock_flags) + xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); + kfree(mpath->path); +} + +/* Set up a metadir path scan. @path must be dynamically allocated. */ +static inline int +xchk_setup_metapath_scan( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const char *path, + struct xfs_inode *ip) +{ + struct xchk_metapath *mpath; + int error; + + if (!path) + return -ENOMEM; + + error = xchk_install_live_inode(sc, ip); + if (error) { + kfree(path); + return error; + } + + mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); + if (!mpath) { + kfree(path); + return -ENOMEM; + } + + mpath->sc = sc; + sc->buf = mpath; + sc->buf_cleanup = xchk_metapath_cleanup; + + mpath->dp = dp; + mpath->path = path; /* path is now owned by mpath */ + + mpath->xname.name = mpath->path; + mpath->xname.len = strlen(mpath->path); + mpath->xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + + return 0; +} + +#ifdef CONFIG_XFS_RT +/* Scan the /rtgroups directory itself. */ +static int +xchk_setup_metapath_rtdir( + struct xfs_scrub *sc) +{ + if (!sc->mp->m_rtdirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); +} + +/* Scan a rtgroup inode under the /rtgroups directory. 
*/ +static int +xchk_setup_metapath_rtginode( + struct xfs_scrub *sc, + enum xfs_rtg_inodes type) +{ + struct xfs_rtgroup *rtg; + struct xfs_inode *ip; + int error; + + rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno); + if (!rtg) + return -ENOENT; + + ip = rtg->rtg_inodes[type]; + if (!ip) { + error = -ENOENT; + goto out_put_rtg; + } + + error = xchk_setup_metapath_scan(sc, sc->mp->m_rtdirip, + xfs_rtginode_path(rtg_rgno(rtg), type), ip); + +out_put_rtg: + xfs_rtgroup_put(rtg); + return error; +} +#else +# define xchk_setup_metapath_rtdir(...) (-ENOENT) +# define xchk_setup_metapath_rtginode(...) (-ENOENT) +#endif /* CONFIG_XFS_RT */ + +#ifdef CONFIG_XFS_QUOTA +/* Scan the /quota directory itself. */ +static int +xchk_setup_metapath_quotadir( + struct xfs_scrub *sc) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + + if (!qi || !qi->qi_dirip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, + kstrdup("quota", GFP_KERNEL), qi->qi_dirip); +} + +/* Scan a quota inode under the /quota directory. */ +static int +xchk_setup_metapath_dqinode( + struct xfs_scrub *sc, + xfs_dqtype_t type) +{ + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_inode *ip = NULL; + + if (!qi) + return -ENOENT; + + switch (type) { + case XFS_DQTYPE_USER: + ip = qi->qi_uquotaip; + break; + case XFS_DQTYPE_GROUP: + ip = qi->qi_gquotaip; + break; + case XFS_DQTYPE_PROJ: + ip = qi->qi_pquotaip; + break; + default: + ASSERT(0); + return -EINVAL; + } + if (!ip) + return -ENOENT; + + return xchk_setup_metapath_scan(sc, qi->qi_dirip, + kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); +} +#else +# define xchk_setup_metapath_quotadir(...) (-ENOENT) +# define xchk_setup_metapath_dqinode(...) (-ENOENT) +#endif /* CONFIG_XFS_QUOTA */ + +int +xchk_setup_metapath( + struct xfs_scrub *sc) +{ + if (!xfs_has_metadir(sc->mp)) + return -ENOENT; + if (sc->sm->sm_gen) + return -EINVAL; + + switch (sc->sm->sm_ino) { + case XFS_SCRUB_METAPATH_PROBE: + /* Just probing, nothing else to do. */ + if (sc->sm->sm_agno) + return -EINVAL; + return 0; + case XFS_SCRUB_METAPATH_RTDIR: + return xchk_setup_metapath_rtdir(sc); + case XFS_SCRUB_METAPATH_RTBITMAP: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_BITMAP); + case XFS_SCRUB_METAPATH_RTSUMMARY: + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_SUMMARY); + case XFS_SCRUB_METAPATH_QUOTADIR: + return xchk_setup_metapath_quotadir(sc); + case XFS_SCRUB_METAPATH_USRQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_USER); + case XFS_SCRUB_METAPATH_GRPQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP); + case XFS_SCRUB_METAPATH_PRJQUOTA: + return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); + default: + return -ENOENT; + } +} + +/* + * Take the ILOCK on the metadata directory parent and child. We do not know + * that the metadata directory is not corrupt, so we lock the parent and try + * to lock the child. Returns 0 if successful, or -EINTR to abort the scrub. + */ +STATIC int +xchk_metapath_ilock_both( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) { + mpath->dp_ilock_flags |= XFS_ILOCK_EXCL; + return 0; + } + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* Unlock parent and child inodes. 
*/ +static inline void +xchk_metapath_iunlock( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + xchk_iunlock(sc, XFS_ILOCK_EXCL); + + mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); +} + +int +xchk_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + xfs_ino_t ino = NULLFSINO; + int error; + + /* Just probing, nothing else to do. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) + goto out_cancel; + + /* Make sure the parent dir has a dirent pointing to this file. */ + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* No directory entry at all */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + error = 0; + goto out_ilock; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + goto out_ilock; + if (ino != sc->ip->i_ino) { + /* Pointing to wrong inode */ + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + } + +out_ilock: + xchk_metapath_iunlock(mpath); +out_cancel: + xchk_trans_cancel(sc); + return error; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Create the dirent represented by the final component of the path. */ +STATIC int +xrep_metapath_link( + struct xchk_metapath *mpath) +{ + struct xfs_scrub *sc = mpath->sc; + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = sc->ip; + + if (xfs_has_parent(sc->mp)) + mpath->du.ppargs = &mpath->link_ppargs; + else + mpath->du.ppargs = NULL; + + trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino); + + return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du); +} + +/* Remove the dirent at the final component of the path. */ +STATIC int +xrep_metapath_unlink( + struct xchk_metapath *mpath, + xfs_ino_t ino, + struct xfs_inode *ip) +{ + struct xfs_parent_rec rec; + struct xfs_scrub *sc = mpath->sc; + struct xfs_mount *mp = sc->mp; + int error; + + trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino); + + if (!ip) { + /* The child inode isn't allocated. Junk the dirent. */ + xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE); + return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname, + ino, mpath->unlink_resblks); + } + + mpath->du.dp = mpath->dp; + mpath->du.name = &mpath->xname; + mpath->du.ip = ip; + mpath->du.ppargs = NULL; + + /* Figure out if we're removing a parent pointer too. */ + if (xfs_has_parent(mp)) { + xfs_inode_to_parent_rec(&rec, ip); + error = xfs_parent_lookup(sc->tp, ip, &mpath->xname, &rec, + &mpath->pptr_args); + switch (error) { + case -ENOATTR: + break; + case 0: + mpath->du.ppargs = &mpath->unlink_ppargs; + break; + default: + return error; + } + } + + return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du); +} + +/* + * Try to create a dirent in @mpath->dp with the name @mpath->xname that points + * to @sc->ip. Returns: + * + * -EEXIST and an @alleged_child if the dirent that points to the wrong inode; + * 0 if there is now a dirent pointing to @sc->ip; or + * A negative errno on error. 
+ */ +STATIC int +xrep_metapath_try_link( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + xfs_ino_t ino; + int error; + + /* Allocate transaction, lock inodes, join to transaction. */ + error = xchk_trans_alloc(sc, mpath->link_resblks); + if (error) + return error; + + error = xchk_metapath_ilock_both(mpath); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory. Create an entry + * pointing to @sc->ip. + */ + error = xrep_metapath_link(mpath); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + xchk_metapath_iunlock(mpath); + return error; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = 0; + goto out_cancel; + } + + /* + * The dirent points elsewhere; pass that back so that the caller + * can try to remove the dirent. + */ + *alleged_child = ino; + error = -EEXIST; + +out_cancel: + xchk_trans_cancel(sc); + xchk_metapath_iunlock(mpath); + return error; +} + +/* + * Take the ILOCK on the metadata directory parent and a bad child, if one is + * supplied. We do not know that the metadata directory is not corrupt, so we + * lock the parent and try to lock the child. Returns 0 if successful, or + * -EINTR to abort the repair. The lock state of @dp is not recorded in @mpath. + */ +STATIC int +xchk_metapath_ilock_parent_and_child( + struct xchk_metapath *mpath, + struct xfs_inode *ip) +{ + struct xfs_scrub *sc = mpath->sc; + int error = 0; + + while (true) { + xfs_ilock(mpath->dp, XFS_ILOCK_EXCL); + if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return 0; + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + + if (xchk_should_terminate(sc, &error)) + return error; + + delay(1); + } + + ASSERT(0); + return -EINTR; +} + +/* + * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points + * to @alleged_child. Returns: + * + * 0 if there is no longer a dirent; + * -EEXIST if the dirent points to @sc->ip; + * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or + * A negative errno for any other error. + */ +STATIC int +xrep_metapath_try_unlink( + struct xchk_metapath *mpath, + xfs_ino_t *alleged_child) +{ + struct xfs_scrub *sc = mpath->sc; + struct xfs_inode *ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(*alleged_child != sc->ip->i_ino); + + trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp, + *alleged_child); + + /* + * Allocate transaction, grab the alleged child inode, lock inodes, + * join to transaction. + */ + error = xchk_trans_alloc(sc, mpath->unlink_resblks); + if (error) + return error; + + error = xchk_iget(sc, *alleged_child, &ip); + if (error == -EINVAL || error == -ENOENT) { + /* inode number is bogus, junk the dirent */ + error = 0; + } + if (error) { + xchk_trans_cancel(sc); + return error; + } + + error = xchk_metapath_ilock_parent_and_child(mpath, ip); + if (error) { + xchk_trans_cancel(sc); + return error; + } + xfs_trans_ijoin(sc->tp, mpath->dp, 0); + if (ip) + xfs_trans_ijoin(sc->tp, ip, 0); + + error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino); + trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino); + if (error == -ENOENT) { + /* + * There is no dirent in the directory anymore. 
We're ready to + * try the link operation again. + */ + error = 0; + goto out_cancel; + } + if (error) + goto out_cancel; + + if (ino == sc->ip->i_ino) { + /* The dirent already points to @sc->ip; we're done. */ + error = -EEXIST; + goto out_cancel; + } + + /* + * The dirent does not point to the alleged child. Update the caller + * and signal that we want to be called again. + */ + if (ino != *alleged_child) { + *alleged_child = ino; + error = -EAGAIN; + goto out_cancel; + } + + /* Remove the link to the child. */ + error = xrep_metapath_unlink(mpath, ino, ip); + if (error) + goto out_cancel; + + error = xrep_trans_commit(sc); + goto out_unlock; + +out_cancel: + xchk_trans_cancel(sc); +out_unlock: + xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xchk_irele(sc, ip); + } + return error; +} + +/* + * Make sure the metadata directory path points to the child being examined. + * + * Repair needs to be able to create a directory structure, create its own + * transactions, and take ILOCKs. This function /must/ be called after all + * other repairs have completed. + */ +int +xrep_metapath( + struct xfs_scrub *sc) +{ + struct xchk_metapath *mpath = sc->buf; + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Just probing, nothing to repair. */ + if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE) + return 0; + + /* Parent required to do anything else. */ + if (mpath->dp == NULL) + return -EFSCORRUPTED; + + /* + * Make sure the child file actually has an attr fork to receive a new + * parent pointer if the fs has parent pointers. + */ + if (xfs_has_parent(mp)) { + error = xfs_attr_add_fork(sc->ip, + sizeof(struct xfs_attr_sf_hdr), 1); + if (error) + return error; + } + + /* Compute block reservation required to unlink and link a file. */ + mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN); + mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN); + + do { + xfs_ino_t alleged_child; + + /* Re-establish the link, or tell us which inode to remove. */ + error = xrep_metapath_try_link(mpath, &alleged_child); + if (!error) + return 0; + if (error != -EEXIST) + return error; + + /* + * Remove an incorrect link to an alleged child, or tell us + * which inode to remove. + */ + do { + error = xrep_metapath_try_unlink(mpath, &alleged_child); + } while (error == -EAGAIN); + if (error == -EEXIST) { + /* Link established; we're done. 
*/ + error = 0; + break; + } + } while (!error); + + return error; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 2aa14b7ab630..70af27d98734 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -58,7 +58,7 @@ xrep_newbt_estimate_slack( if (sc->ops->type == ST_PERAG) { free = sc->sa.pag->pagf_freeblks; - sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { free = percpu_counter_sum(&sc->mp->m_fdblocks); sz = sc->mp->m_sb.sb_dblocks; @@ -186,11 +186,10 @@ xrep_newbt_add_extent( xfs_agblock_t agbno, xfs_extlen_t len) { - struct xfs_mount *mp = xnr->sc->mp; struct xfs_alloc_arg args = { .tp = NULL, /* no autoreap */ .oinfo = xnr->oinfo, - .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .fsbno = xfs_agbno_to_fsb(pag, agbno), .len = len, .resv = xnr->resv, }; @@ -206,12 +205,12 @@ xrep_newbt_validate_ag_alloc_hint( struct xfs_scrub *sc = xnr->sc; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); - if (agno == sc->sa.pag->pag_agno && + if (agno == pag_agno(sc->sa.pag) && xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) return; - xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, - XFS_AGFL_BLOCK(sc->mp) + 1); + xnr->alloc_hint = + xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1); } /* Allocate disk space for a new per-AG btree. */ @@ -251,16 +250,15 @@ xrep_newbt_alloc_ag_blocks( return -ENOSPC; agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + if (agno != pag_agno(sc->sa.pag)) { + ASSERT(agno == pag_agno(sc->sa.pag)); + return -EFSCORRUPTED; + } - trace_xrep_newbt_alloc_ag_blocks(mp, agno, + trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, xnr->oinfo.oi_owner); - if (agno != sc->sa.pag->pag_agno) { - ASSERT(agno == sc->sa.pag->pag_agno); - return -EFSCORRUPTED; - } - error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); if (error) return error; @@ -326,16 +324,16 @@ xrep_newbt_alloc_file_blocks( agno = XFS_FSB_TO_AGNO(mp, args.fsbno); - trace_xrep_newbt_alloc_file_blocks(mp, agno, - XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, - xnr->oinfo.oi_owner); - pag = xfs_perag_get(mp, agno); if (!pag) { ASSERT(0); return -EFSCORRUPTED; } + trace_xrep_newbt_alloc_file_blocks(pag, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + error = xrep_newbt_add_blocks(xnr, pag, &args); xfs_perag_put(pag); if (error) @@ -376,7 +374,6 @@ xrep_newbt_free_extent( struct xfs_scrub *sc = xnr->sc; xfs_agblock_t free_agbno = resv->agbno; xfs_extlen_t free_aglen = resv->len; - xfs_fsblock_t fsbno; int error; if (!btree_committed || resv->used == 0) { @@ -385,8 +382,8 @@ xrep_newbt_free_extent( * space reservation, let the existing EFI free the entire * space extent. */ - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, - free_agbno, free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); return 1; } @@ -403,8 +400,8 @@ xrep_newbt_free_extent( if (free_aglen == 0) return 0; - trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, - free_aglen, xnr->oinfo.oi_owner); + trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, + xnr->oinfo.oi_owner); ASSERT(xnr->resv != XFS_AG_RESV_AGFL); ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); @@ -413,9 +410,9 @@ xrep_newbt_free_extent( * Use EFIs to free the reservations. 
This reduces the chance * that we leak blocks if the system goes down. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); - error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, - xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); + error = xfs_free_extent_later(sc->tp, + xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen, + &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD); if (error) return error; @@ -516,7 +513,6 @@ xrep_newbt_claim_block( union xfs_btree_ptr *ptr) { struct xrep_newbt_resv *resv; - struct xfs_mount *mp = cur->bc_mp; xfs_agblock_t agbno; /* @@ -541,12 +537,10 @@ xrep_newbt_claim_block( if (resv->used == resv->len) list_move_tail(&resv->list, &xnr->resv_list); - trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, - xnr->oinfo.oi_owner); + trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, - agbno)); + ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno)); else ptr->s = cpu_to_be32(agbno); diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 80aee30886c4..4a47d0aabf73 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -279,7 +279,7 @@ xchk_nlinks_collect_dirent( * determine the backref count. */ if (dotdot) { - if (dp == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(dp)) error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); else if (!xfs_has_parent(sc->mp)) error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); @@ -735,7 +735,7 @@ xchk_nlinks_compare_inode( } } - if (ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(ip)) { /* * For the root of a directory tree, both the '.' and '..' * entries should point to the root directory. The dotdot diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index b3e707f47b7b..4ebdee095428 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -60,11 +60,9 @@ xrep_nlinks_is_orphaned( unsigned int actual_nlink, const struct xchk_nlink *obs) { - struct xfs_mount *mp = ip->i_mount; - if (obs->parents != 0) return false; - if (ip == mp->m_rootip || ip == sc->orphanage) + if (xchk_inode_is_dirtree_root(ip) || ip == sc->orphanage) return false; return actual_nlink != 0; } diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 7148d8362db8..c287c755f2c5 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -295,7 +295,9 @@ xrep_orphanage_can_adopt( return false; if (sc->ip == sc->orphanage) return false; - if (xfs_internal_inum(sc->mp, sc->ip->i_ino)) + if (xchk_inode_is_sb_rooted(sc->ip)) + return false; + if (xfs_is_internal_inode(sc->ip)) return false; return true; } diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 91e7b51ce068..3b692c4acc1e 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -132,6 +132,14 @@ xchk_parent_validate( return 0; } + /* Is this the metadata root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_metadirip) { + if (sc->ip->i_ino != mp->m_sb.sb_metadirino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -185,6 +193,12 @@ xchk_parent_validate( goto out_unlock; } + /* Metadata and regular inodes cannot cross trees. 
*/ + if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -300,7 +314,7 @@ xchk_parent_pptr_and_dotdot( } /* Is this the root dir? Then '..' must point to itself. */ - if (sc->ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(sc->ip)) { if (sc->ip->i_ino != pp->parent_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; @@ -711,7 +725,7 @@ xchk_parent_count_pptrs( } if (S_ISDIR(VFS_I(sc->ip)->i_mode)) { - if (sc->ip == sc->mp->m_rootip) + if (xchk_inode_is_dirtree_root(sc->ip)) pp->pptrs_found++; if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0) @@ -720,6 +734,14 @@ xchk_parent_count_pptrs( pp->pptrs_found == 0) xchk_ino_set_corrupt(sc, sc->ip->i_ino); } else { + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Pretend that we found a parent pointer attr. + */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + pp->pptrs_found++; + if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found) xchk_ino_set_corrupt(sc, sc->ip->i_ino); } @@ -885,10 +907,9 @@ bool xchk_pptr_looks_zapped( struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); - ASSERT(xfs_has_parent(mp)); + ASSERT(xfs_has_parent(ip->i_mount)); /* * Temporary files that cannot be linked into the directory tree do not @@ -902,15 +923,15 @@ xchk_pptr_looks_zapped( * of a parent pointer scan is always the empty set. It's safe to scan * them even if the attr fork was zapped. */ - if (ip == mp->m_rootip) + if (xchk_inode_is_dirtree_root(ip)) return false; /* - * Metadata inodes are all rooted in the superblock and do not have - * any parents. Hence the attr fork will not be initialized, but - * there are no parent pointers that might have been zapped. + * Metadata inodes that are rooted in the superblock do not have any + * parents. Hence the attr fork will not be initialized, but there are + * no parent pointers that might have been zapped. */ - if (xfs_is_metadata_inode(ip)) + if (xchk_inode_is_sb_rooted(ip)) return false; /* diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c index 7b42b7f65a0b..31bfe10be22a 100644 --- a/fs/xfs/scrub/parent_repair.c +++ b/fs/xfs/scrub/parent_repair.c @@ -1334,7 +1334,7 @@ xrep_parent_rebuild_pptrs( * so that we can decide if we're moving this file to the orphanage. * For this purpose, root directories are their own parents. */ - if (sc->ip == sc->mp->m_rootip) { + if (xchk_inode_is_dirtree_root(sc->ip)) { xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino); } else { error = xrep_parent_lookup_pptrs(sc, &parent_ino); @@ -1354,21 +1354,40 @@ STATIC int xrep_parent_rebuild_tree( struct xrep_parent *rp) { + struct xfs_scrub *sc = rp->sc; + bool try_adoption; int error; - if (xfs_has_parent(rp->sc->mp)) { + if (xfs_has_parent(sc->mp)) { error = xrep_parent_rebuild_pptrs(rp); if (error) return error; } - if (rp->pscan.parent_ino == NULLFSINO) { - if (xrep_orphanage_can_adopt(rp->sc)) + /* + * Any file with no parent could be adopted. This check happens after + * rebuilding the parent pointer structure because we might have cycled + * the ILOCK during that process. 
+ */ + try_adoption = rp->pscan.parent_ino == NULLFSINO; + + /* + * Starting with metadir, we allow checking of parent pointers + * of non-directory files that are children of the superblock. + * Lack of parent is ok here. + */ + if (try_adoption && xfs_has_metadir(sc->mp) && + xchk_inode_is_sb_rooted(sc->ip)) + try_adoption = false; + + if (try_adoption) { + if (xrep_orphanage_can_adopt(sc)) return xrep_parent_move_to_orphanage(rp); return -EFSCORRUPTED; + } - if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode)) + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) return xrep_parent_reset_dotdot(rp); return 0; @@ -1422,6 +1441,14 @@ xrep_parent_set_nondir_nlink( if (error) return error; + /* + * Starting with metadir, we allow checking of parent pointers of + * non-directory files that are children of the superblock. Pretend + * that we found a parent pointer attr. + */ + if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip)) + rp->parents++; + if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) { xfs_trans_ijoin(sc->tp, sc->ip, 0); joined = true; diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index c77eb2de8df7..dc4033b91e44 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -398,10 +398,13 @@ xqcheck_collect_inode( bool isreg = S_ISREG(VFS_I(ip)->i_mode); int error = 0; - if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + if (xfs_is_metadir_inode(ip) || + xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { /* * Quota files are never counted towards quota, so we do not - * need to take the lock. + * need to take the lock. Files do not switch between the + * metadata and regular directory trees without a reallocation, + * so we do not need to ILOCK them either. */ xchk_iscan_mark_visited(&xqc->iscan, ip); return 0; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 53697f3c5e1b..08230952053b 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -137,7 +137,7 @@ xreap_put_freelist( agfl_bp, agbno, 0); if (error) return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; @@ -263,7 +263,6 @@ xreap_agextent_binval( struct xfs_scrub *sc = rs->sc; struct xfs_perag *pag = sc->sa.pag; struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; xfs_agblock_t agbno_next = agbno + *aglenp; xfs_agblock_t bno = agbno; @@ -284,7 +283,7 @@ xreap_agextent_binval( */ while (bno < agbno_next) { struct xrep_bufscan scan = { - .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .daddr = xfs_agbno_to_daddr(pag, bno), .max_sectors = xrep_bufscan_max_sectors(mp, agbno_next - bno), .daddr_step = XFS_FSB_TO_BB(mp, 1), @@ -391,7 +390,7 @@ xreap_agextent_iter( xfs_fsblock_t fsbno; int error = 0; - fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno); /* * If there are other rmappings, this block is cross linked and must @@ -780,7 +779,6 @@ xreap_bmapi_binval( xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; @@ -837,7 +835,7 @@ xreap_bmapi_binval( */ while (bno < agbno_next) { struct xrep_bufscan scan = { - .daddr = XFS_AGB_TO_DADDR(mp, agno, bno), + .daddr = xfs_agbno_to_daddr(pag, bno), .max_sectors = xrep_bufscan_max_sectors(mp, scan_blocks), .daddr_step = XFS_FSB_TO_BB(mp, 1), diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 
d0c7d4a29c0f..1c5e45cc6419 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -453,7 +453,8 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + if (xfs_refcount_check_irec(to_perag(bs->cur->bc_group), &irec) != + NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -490,7 +491,7 @@ xchk_refcount_xref_rmap( struct xfs_scrub *sc, xfs_filblks_t cow_blocks) { - xfs_extlen_t refcbt_blocks = 0; + xfs_filblks_t refcbt_blocks = 0; xfs_filblks_t blocks; int error; diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index a00d7ce7ae5b..4e572b81c986 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -215,7 +215,7 @@ xrep_refc_rmap_shareable( return false; /* Metadata in files are never shareable */ - if (xfs_internal_inum(mp, rmap->rm_owner)) + if (xfs_is_sb_inum(mp, rmap->rm_owner)) return false; /* Metadata and unwritten file blocks are not shareable. */ @@ -590,7 +590,6 @@ xrep_refc_build_new_tree( struct xfs_scrub *sc = rr->sc; struct xfs_btree_cur *refc_cur; struct xfs_perag *pag = sc->sa.pag; - xfs_fsblock_t fsbno; int error; error = xrep_refc_sort_records(rr); @@ -603,8 +602,8 @@ xrep_refc_build_new_tree( * to root the new btree while it's under construction and before we * attach it to the AG header. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); - xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, + xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)), XFS_AG_RESV_METADATA); rr->new_btree.bload.get_records = xrep_refc_get_records; rr->new_btree.bload.claim_block = xrep_refc_claim_block; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 67478294f11a..91c8bc055a4f 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_rtbitmap.h" #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" @@ -305,7 +306,7 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } else { @@ -325,16 +326,14 @@ xrep_calc_ag_resblks( /* If the block counts are impossible, make worst-case assumptions. 
*/ if (aglen == NULLAGBLOCK || - aglen != pag->block_count || + aglen != pag_group(pag)->xg_block_count || freelen >= aglen) { - aglen = pag->block_count; + aglen = pag_group(pag)->xg_block_count; freelen = aglen; usedlen = aglen; } - xfs_perag_put(pag); - trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, - freelen, usedlen); + trace_xrep_calc_ag_resblks(pag, icount, aglen, freelen, usedlen); /* * Figure out how many blocks we'd need worst case to rebuild @@ -372,8 +371,9 @@ xrep_calc_ag_resblks( rmapbt_sz = 0; } - trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, - inobt_sz, rmapbt_sz, refcbt_sz); + trace_xrep_calc_ag_resblks_btsize(pag, bnobt_sz, inobt_sz, rmapbt_sz, + refcbt_sz); + xfs_perag_put(pag); return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } @@ -414,7 +414,7 @@ xrep_fix_freelist( args.mp = sc->mp; args.tp = sc->tp; - args.agno = sc->sa.pag->pag_agno; + args.agno = pag_agno(sc->sa.pag); args.alignment = 1; args.pag = sc->sa.pag; @@ -483,7 +483,7 @@ xrep_findroot_block( int block_level; int error = 0; - daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); + daddr = xfs_agbno_to_daddr(ri->sc->sa.pag, agbno); /* * Blocks in the AGFL have stale contents that might just happen to @@ -612,7 +612,7 @@ xrep_findroot_block( else fab->root = NULLAGBLOCK; - trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, + trace_xrep_findroot_block(ri->sc->sa.pag, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); out: xfs_trans_brelse(ri->sc->tp, bp); @@ -953,6 +953,29 @@ xrep_ag_init( return 0; } +#ifdef CONFIG_XFS_RT +/* + * Given a reference to a rtgroup structure, lock rtgroup btree inodes and + * create btree cursors. Must only be called to repair a regular rt file. + */ +int +xrep_rtgroup_init( + struct xfs_scrub *sc, + struct xfs_rtgroup *rtg, + struct xchk_rt *sr, + unsigned int rtglock_flags) +{ + ASSERT(sr->rtg == NULL); + + xfs_rtgroup_lock(rtg, rtglock_flags); + sr->rtlock_flags = rtglock_flags; + + /* Grab our own passive reference from the caller's ref. */ + sr->rtg = xfs_rtgroup_hold(rtg); + return 0; +} +#endif /* CONFIG_XFS_RT */ + /* Reinitialize the per-AG block reservation for the AG we just fixed. */ int xrep_reset_perag_resv( @@ -973,7 +996,7 @@ xrep_reset_perag_resv( if (error == -ENOSPC) { xfs_err(sc->mp, "Insufficient free space to reset per-AG reservation for AG %u after repair.", - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); error = 0; } @@ -1083,10 +1106,17 @@ xrep_metadata_inode_forks( if (error) return error; - /* Make sure the attr fork looks ok before we delete it. */ - error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); - if (error) - return error; + /* + * Metadata files can only have extended attributes on metadir + * filesystems, either for parent pointers or for actual xattr data. + * For a non-metadir filesystem, make sure the attr fork looks ok + * before we delete it. + */ + if (xfs_inode_hasattr(sc->ip)) { + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + } /* Clear the reflink flag since metadata never shares. */ if (xfs_is_reflink_inode(sc->ip)) { @@ -1097,8 +1127,11 @@ xrep_metadata_inode_forks( return error; } - /* Clear the attr forks since metadata shouldn't have that. */ - if (xfs_inode_hasattr(sc->ip)) { + /* + * Metadata files on non-metadir filesystems cannot have attr forks, + * so clear them now. 
+ */ + if (xfs_inode_hasattr(sc->ip) && !xfs_has_metadir(sc->mp)) { if (!dirty) { dirty = true; xfs_trans_ijoin(sc->tp, sc->ip, 0); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 0e0dc2bf985c..b649da1a93eb 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,7 @@ #include "xfs_quota_defs.h" +struct xfs_rtgroup; struct xchk_stats_run; static inline int xrep_notsupported(struct xfs_scrub *sc) @@ -106,6 +107,12 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, struct xchk_ag *sa); +#ifdef CONFIG_XFS_RT +int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, + struct xchk_rt *sr, unsigned int rtglock_flags); +#else +# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) +#endif /* CONFIG_XFS_RT */ /* Metadata revalidators */ @@ -134,13 +141,16 @@ int xrep_directory(struct xfs_scrub *sc); int xrep_parent(struct xfs_scrub *sc); int xrep_symlink(struct xfs_scrub *sc); int xrep_dirtree(struct xfs_scrub *sc); +int xrep_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); int xrep_rtsummary(struct xfs_scrub *sc); +int xrep_rgsuperblock(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported # define xrep_rtsummary xrep_notsupported +# define xrep_rgsuperblock xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -208,6 +218,7 @@ xrep_setup_nothing( #define xrep_setup_parent xrep_setup_nothing #define xrep_setup_nlinks xrep_setup_nothing #define xrep_setup_dirtree xrep_setup_nothing +#define xrep_setup_metapath xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -243,6 +254,8 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x) #define xrep_parent xrep_notsupported #define xrep_symlink xrep_notsupported #define xrep_dirtree xrep_notsupported +#define xrep_metapath xrep_notsupported +#define xrep_rgsuperblock xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c new file mode 100644 index 000000000000..463b3573bb76 --- /dev/null +++ b/fs/xfs/scrub/rgsuper.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_rtgroup.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" + +/* Set us up with a transaction and an empty context. */ +int +xchk_setup_rgsuperblock( + struct xfs_scrub *sc) +{ + return xchk_trans_alloc(sc, 0); +} + +/* Cross-reference with the other rt metadata. */ +STATIC void +xchk_rgsuperblock_xref( + struct xfs_scrub *sc) +{ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1); +} + +int +xchk_rgsuperblock( + struct xfs_scrub *sc) +{ + xfs_rgnumber_t rgno = sc->sm->sm_agno; + int error; + + /* + * Only rtgroup 0 has a superblock. We may someday want to use higher + * rgno for other functions, similar to what we do with the primary + * super scrub function. 
+ */ + if (rgno != 0) + return -ENOENT; + + /* + * Grab an active reference to the rtgroup structure. If we can't get + * it, we're racing with something that's tearing down the group, so + * signal that the group no longer exists. Take the rtbitmap in shared + * mode so that the group can't change while we're doing things. + */ + error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED); + + /* + * Since we already validated the rt superblock at mount time, we don't + * need to check its contents again. All we need is to cross-reference. + */ + xchk_rgsuperblock_xref(sc); + return 0; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int +xrep_rgsuperblock( + struct xfs_scrub *sc) +{ + ASSERT(rtg_rgno(sc->sr.rtg) == 0); + + xfs_log_sb(sc->tp); + return 0; +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index ba5bbc3fb754..39e9ad7cd8ae 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -358,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { + xfs_rmap_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } @@ -410,7 +410,7 @@ xchk_rmapbt_walk_ag_metadata( goto out; /* OWN_LOG: Internal log */ - if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + if (xfs_ag_contains_log(mp, pag_agno(sc->sa.pag))) { error = xagb_bitmap_set(&cr->log_owned, XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), mp->m_sb.sb_logblocks); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index e8080eba37d2..a0a227d183d2 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -231,7 +231,7 @@ xrep_rmap_stash( if (xchk_iscan_aborted(&rr->iscan)) return -EFSCORRUPTED; - trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + trace_xrep_rmap_found(sc->sa.pag, &rmap); mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); @@ -344,7 +344,7 @@ xrep_rmap_visit_bmbt( int error; if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != - rf->rr->sc->sa.pag->pag_agno) + pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); @@ -391,7 +391,7 @@ xrep_rmap_visit_iroot_btree_block( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag)) return 0; agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -622,7 +622,7 @@ xrep_rmap_walk_inobt( return error; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL) return -EFSCORRUPTED; agino = irec.ir_startino; @@ -801,7 +801,7 @@ xrep_rmap_find_log_rmaps( { struct xfs_scrub *sc = rr->sc; - if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag))) return 0; return xrep_rmap_stash(rr, @@ -976,7 +976,7 @@ xrep_rmap_try_reserve( { struct xrep_rmap_agfl ra = { .bitmap = freesp_blocks, - .agno = rr->sc->sa.pag->pag_agno, + .agno = pag_agno(rr->sc->sa.pag), }; struct xfs_scrub *sc = rr->sc; struct xrep_newbt_resv *resv, *n; @@ -1272,7 +1272,6 @@ xrep_rmap_build_new_tree( struct xfs_perag *pag = sc->sa.pag; struct xfs_agf *agf = 
sc->sa.agf_bp->b_addr; struct xfs_btree_cur *rmap_cur; - xfs_fsblock_t fsbno; int error; /* @@ -1290,9 +1289,9 @@ xrep_rmap_build_new_tree( * rmapbt per-AG reservation, which we will adjust further after * committing the new btree. */ - fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, - fsbno, XFS_AG_RESV_RMAPBT); + xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)), + XFS_AG_RESV_RMAPBT); rr->new_btree.bload.get_records = xrep_rmap_get_records; rr->new_btree.bload.claim_block = xrep_rmap_claim_block; rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; @@ -1553,7 +1552,7 @@ xrep_rmapbt_live_update( if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) goto out_unlock; - trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + trace_xrep_rmap_live_update(rr->sc->sa.pag, action, p); error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); if (error) @@ -1597,7 +1596,7 @@ xrep_rmap_setup_scan( /* Set up in-memory rmap btree */ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, - sc->sa.pag->pag_agno); + pag_agno(sc->sa.pag)); if (error) goto out_mutex; @@ -1612,7 +1611,7 @@ xrep_rmap_setup_scan( */ ASSERT(sc->flags & XCHK_FSGATES_RMAP); xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); - error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook); if (error) goto out_iscan; return 0; @@ -1633,7 +1632,7 @@ xrep_rmap_teardown( struct xfs_scrub *sc = rr->sc; xchk_iscan_abort(&rr->iscan); - xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); + xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook); xchk_iscan_teardown(&rr->iscan); xfbtree_destroy(&rr->rmap_btree); mutex_destroy(&rr->lock); diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 46583517377f..376a36fd9a9c 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -35,6 +35,10 @@ xchk_setup_rtbitmap( return -ENOMEM; sc->buf = rtb; + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + if (xchk_could_repair(sc)) { error = xrep_setup_rtbitmap(sc, rtb); if (error) @@ -45,7 +49,8 @@ xchk_setup_rtbitmap( if (error) return error; - error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + error = xchk_install_live_inode(sc, + sc->sr.rtg->rtg_inodes[XFS_RTGI_BITMAP]); if (error) return error; @@ -53,18 +58,18 @@ xchk_setup_rtbitmap( if (error) return error; - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); - /* * Now that we've locked the rtbitmap, we can't race with growfsrt * trying to expand the bitmap or change the size of the rt volume. * Hence it is safe to compute and check the geometry values. */ + xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); if (mp->m_sb.sb_rblocks) { - rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); rtb->rextslog = xfs_compute_rextslog(rtb->rextents); - rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp); } + return 0; } @@ -73,7 +78,7 @@ xchk_setup_rtbitmap( /* Scrub a free extent record from the realtime bitmap. 
*/ STATIC int xchk_rtbitmap_rec( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -82,10 +87,10 @@ xchk_rtbitmap_rec( xfs_rtblock_t startblock; xfs_filblks_t blockcount; - startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); - blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); + startblock = xfs_rtx_to_rtb(rtg, rec->ar_startext); + blockcount = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); - if (!xfs_verify_rtbext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -140,18 +145,20 @@ xchk_rtbitmap( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; struct xchk_rtbitmap *rtb = sc->buf; int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rtb->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* Is sb_rextslog correct? */ if (mp->m_sb.sb_rextslog != rtb->rextslog) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -160,17 +167,17 @@ xchk_rtbitmap( * case can we exceed 4bn bitmap blocks since the super field is a u32. */ if (rtb->rbmblocks > U32_MAX) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } /* The bitmap file length must be aligned to an fsblock. */ - if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + if (rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -179,8 +186,8 @@ xchk_rtbitmap( * growfsrt expands the bitmap file before updating sb_rextents, so the * file can be larger than sb_rbmblocks. 
*/ - if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + if (rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, rbmip->i_ino); return 0; } @@ -193,7 +200,7 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; @@ -207,6 +214,8 @@ xchk_xref_is_used_rt_space( xfs_rtblock_t rtbno, xfs_extlen_t len) { + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -217,13 +226,10 @@ xchk_xref_is_used_rt_space( startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); - xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) - goto out_unlock; + return; if (is_free) - xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino); -out_unlock: - xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); } diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 7c7366c98338..49fc6250bafc 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_sb.h" #include "xfs_exchmaps.h" +#include "xfs_rtgroup.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -46,12 +47,19 @@ xchk_setup_rtsummary( struct xchk_rtsummary *rts; int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), XCHK_GFP_FLAGS); if (!rts) return -ENOMEM; sc->buf = rts; + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); + if (error) + return error; + if (xchk_could_repair(sc)) { error = xrep_setup_rtsummary(sc, rts); if (error) @@ -73,7 +81,8 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_install_live_inode(sc, mp->m_rsumip); + error = xchk_install_live_inode(sc, + sc->sr.rtg->rtg_inodes[XFS_RTGI_SUMMARY]); if (error) return error; @@ -82,29 +91,23 @@ xchk_setup_rtsummary( return error; /* - * Locking order requires us to take the rtbitmap first. We must be - * careful to unlock it ourselves when we are done with the rtbitmap - * file since the scrub infrastructure won't do that for us. Only - * then we can lock the rtsummary inode. - */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); - - /* * Now that we've locked the rtbitmap and rtsummary, we can't race with * growfsrt trying to expand the summary or change the size of the rt * volume. Hence it is safe to compute and check the geometry values. + * + * Note that there is no strict requirement for an exclusive lock on the + * summary here, but to keep the locking APIs simple we lock both inodes + * exclusively here. If we ever start caring about running concurrent + * fsmap with scrub this could be changed. 
*/ + xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); if (mp->m_sb.sb_rblocks) { - int rextslog; - - rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); - rextslog = xfs_compute_rextslog(rts->rextents); - rts->rsumlevels = rextslog + 1; - rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); - rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, - rts->rbmblocks); + rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); + rts->rbmblocks = xfs_rtbitmap_blockcount(mp); + rts->rsumblocks = + xfs_rtsummary_blockcount(mp, &rts->rsumlevels); } + return 0; } @@ -148,6 +151,11 @@ xchk_rtsum_inc( struct xfs_mount *mp, union xfs_suminfo_raw *v) { + if (xfs_has_rtgroups(mp)) { + be32_add_cpu(&v->rtg, 1); + return be32_to_cpu(v->rtg); + } + v->old += 1; return v->old; } @@ -155,11 +163,12 @@ xchk_rtsum_inc( /* Update the summary file to reflect the free extent that we've accumulated. */ STATIC int xchk_rtsum_record_free( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_scrub *sc = priv; xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; @@ -178,11 +187,12 @@ xchk_rtsum_record_free( lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + rtbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); + rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount); if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, + rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_ino); return -EFSCORRUPTED; } @@ -204,15 +214,14 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_blocks; + struct xfs_rtgroup *rtg = sc->sr.rtg; /* If the bitmap size doesn't match the computed size, bail. */ - rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); - if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) + if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) != + rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_disk_size) return -EFSCORRUPTED; - return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, - sc); + return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc); } /* Compare the rtsummary file against the one we computed. */ @@ -231,8 +240,9 @@ xchk_rtsum_compare( xfs_rtsumoff_t sumoff = 0; int error = 0; - rts->args.mp = sc->mp; + rts->args.mp = mp; rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; /* Mappings may not cross or lie beyond EOF. */ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); @@ -299,31 +309,34 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xfs_rtgroup *rtg = sc->sr.rtg; + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; struct xchk_rtsummary *rts = sc->buf; - int error = 0; + int error; /* Is sb_rextents correct? */ if (mp->m_sb.sb_rextents != rts->rextents) { - xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rbmip->i_ino); + return 0; } /* Is m_rsumlevels correct? */ if (mp->m_rsumlevels != rts->rsumlevels) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Is m_rsumsize correct? 
*/ if (mp->m_rsumblocks != rts->rsumblocks) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* The summary file length must be aligned to an fsblock. */ - if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* @@ -331,15 +344,15 @@ xchk_rtsummary( * growfsrt expands the summary file before updating sb_rextents, so * the file can be larger than rsumsize. */ - if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { - xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); - goto out_rbm; + if (rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) { + xchk_ino_set_corrupt(sc, rsumip->i_ino); + return 0; } /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out_rbm; + return error; /* Construct the new summary file from the rtbitmap. */ error = xchk_rtsum_compute(sc); @@ -348,23 +361,12 @@ xchk_rtsummary( * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref * error since we're checking the summary file. */ - xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); - error = 0; - goto out_rbm; + xchk_ino_set_corrupt(sc, rbmip->i_ino); + return 0; } if (error) - goto out_rbm; + return error; /* Does the computed summary file match the actual rtsummary file? */ - error = xchk_rtsum_compare(sc); - -out_rbm: - /* - * Unlock the rtbitmap since we're done with it. All other writers of - * the rt free space metadata grab the bitmap and summary ILOCKs in - * that order, so we're still protected against allocation activities - * even if we continue on to the repair function. - */ - xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - return error; + return xchk_rtsum_compare(sc); } diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c index 7deeb948cb70..8198ea84ad70 100644 --- a/fs/xfs/scrub/rtsummary_repair.c +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -76,18 +76,30 @@ xrep_rtsummary_prep_buf( union xfs_suminfo_raw *ondisk; int error; - rts->args.mp = sc->mp; + rts->args.mp = mp; rts->args.tp = sc->tp; + rts->args.rtg = sc->sr.rtg; rts->args.sumbp = bp; ondisk = xfs_rsumblock_infoptr(&rts->args, 0); rts->args.sumbp = NULL; - bp->b_ops = &xfs_rtbuf_ops; - error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); if (error) return error; + if (xfs_has_rtgroups(sc->mp)) { + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; + + hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC); + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); + hdr->rt_lsn = 0; + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); + bp->b_ops = &xfs_rtsummary_buf_ops; + } else { + bp->b_ops = &xfs_rtbuf_ops; + } + rts->prep_wordoff += mp->m_blockwsize; xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); return 0; @@ -162,8 +174,8 @@ xrep_rtsummary( return error; /* Reset incore state and blow out the summary cache. 
*/ - if (mp->m_rsum_cache) - memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + if (sc->sr.rtg->rtg_rsum_cache) + memset(sc->sr.rtg->rtg_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); mp->m_rsumlevels = rts->rsumlevels; mp->m_rsumblocks = rts->rsumblocks; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4cbcf7a86dbe..950f5a58dcd9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -225,6 +225,8 @@ xchk_teardown( xfs_trans_cancel(sc->tp); sc->tp = NULL; } + if (sc->sr.rtg) + xchk_rtgroup_free(sc, &sc->sr); if (sc->ip) { if (sc->ilock_flags) xchk_iunlock(sc, sc->ilock_flags); @@ -382,13 +384,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .repair = xrep_parent, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ - .type = ST_FS, + .type = ST_RTGROUP, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ - .type = ST_FS, + .type = ST_RTGROUP, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, @@ -442,6 +444,20 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_parent, .repair = xrep_dirtree, }, + [XFS_SCRUB_TYPE_METAPATH] = { /* metadata directory tree path */ + .type = ST_GENERIC, + .setup = xchk_setup_metapath, + .scrub = xchk_metapath, + .has = xfs_has_metadir, + .repair = xrep_metapath, + }, + [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */ + .type = ST_RTGROUP, + .setup = xchk_setup_rgsuperblock, + .scrub = xchk_rgsuperblock, + .has = xfs_has_rtsb, + .repair = xrep_rgsuperblock, + }, }; static int @@ -489,6 +505,35 @@ xchk_validate_inputs( if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) goto out; break; + case ST_GENERIC: + break; + case ST_RTGROUP: + if (sm->sm_ino || sm->sm_gen) + goto out; + if (xfs_has_rtgroups(mp)) { + /* + * On a rtgroups filesystem, there won't be an rtbitmap + * or rtsummary file for group 0 unless there's + * actually a realtime volume attached. However, older + * xfs_scrub always calls the rtbitmap/rtsummary + * scrubbers with sm_agno==0 so transform the error + * code to ENOENT. + */ + if (sm->sm_agno >= mp->m_sb.sb_rgcount) { + if (sm->sm_agno == 0) + error = -ENOENT; + goto out; + } + } else { + /* + * Prior to rtgroups, the rtbitmap/rtsummary scrubbers + * accepted sm_agno==0, so we still accept that for + * scrubbing pre-rtgroups filesystems. + */ + if (sm->sm_agno != 0) + goto out; + } + break; default: goto out; } @@ -605,8 +650,7 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, - "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB); sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5993fcaffb2c..5dbbe93cb49b 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -73,6 +73,8 @@ enum xchk_type { ST_PERAG, /* per-AG metadata */ ST_FS, /* per-FS metadata */ ST_INODE, /* per-inode metadata */ + ST_GENERIC, /* determined by the scrubber */ + ST_RTGROUP, /* rtgroup metadata */ }; struct xchk_meta_ops { @@ -117,6 +119,15 @@ struct xchk_ag { struct xfs_btree_cur *refc_cur; }; +/* Inode lock state for the RT volume. */ +struct xchk_rt { + /* incore rtgroup, if applicable */ + struct xfs_rtgroup *rtg; + + /* XFS_RTGLOCK_* lock state if locked */ + unsigned int rtlock_flags; +}; + struct xfs_scrub { /* General scrub state. 
*/ struct xfs_mount *mp; @@ -173,11 +184,20 @@ struct xfs_scrub { */ unsigned int sick_mask; + /* + * Clear these XFS_SICK_* flags but only if the scan is ok. Useful for + * removing ZAPPED flags after a repair. + */ + unsigned int healthy_mask; + /* next time we want to cond_resched() */ struct xchk_relax relax; /* State tracking for single-AG operations. */ struct xchk_ag sa; + + /* State tracking for realtime operations. */ + struct xchk_rt sr; }; /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ @@ -255,12 +275,15 @@ int xchk_xattr(struct xfs_scrub *sc); int xchk_symlink(struct xfs_scrub *sc); int xchk_parent(struct xfs_scrub *sc); int xchk_dirtree(struct xfs_scrub *sc); +int xchk_metapath(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xchk_rtbitmap(struct xfs_scrub *sc); int xchk_rtsummary(struct xfs_scrub *sc); +int xchk_rgsuperblock(struct xfs_scrub *sc); #else # define xchk_rtbitmap xchk_nothing # define xchk_rtsummary xchk_nothing +# define xchk_rgsuperblock xchk_nothing #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index 7996c2335476..a476c7b2ab75 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -80,6 +80,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", [XFS_SCRUB_TYPE_NLINKS] = "nlinks", [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", + [XFS_SCRUB_TYPE_METAPATH] = "metapath", + [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index d015a86ef460..953ce7be78dc 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -36,6 +36,7 @@ #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/reap.h" +#include "scrub/health.h" /* * Symbolic Link Repair @@ -233,7 +234,7 @@ xrep_symlink_salvage( * target zapped flag. */ if (buflen == 0) { - sc->sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); sprintf(target_buf, DUMMY_TARGET); } diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 177f922acfaf..2d7ca7e1bbca 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -22,6 +22,7 @@ #include "xfs_exchmaps.h" #include "xfs_defer.h" #include "xfs_symlink_remote.h" +#include "xfs_metafile.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -182,6 +183,114 @@ out_release_dquots: return error; } +/* + * Move sc->tempip from the regular directory tree to the metadata directory + * tree if sc->ip is part of the metadata directory tree and tempip has an + * eligible file mode. + * + * Temporary files have to be created before we even know which inode we're + * going to scrub, so we assume that they will be part of the regular directory + * tree. If it turns out that we're actually scrubbing a file from the + * metadata directory tree, we have to subtract the temp file from the root + * dquots and detach the dquots prior to setting the METADATA iflag. However, + * the scrub setup functions grab sc->ip and create sc->tempip before we + * actually get around to checking if the file mode is the right type for the + * scrubber. 
+ */ +int +xrep_tempfile_adjust_directory_tree( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip) + return 0; + + ASSERT(sc->tp == NULL); + ASSERT(!xfs_is_metadir_inode(sc->tempip)); + + if (!sc->ip || !xfs_is_metadir_inode(sc->ip)) + return 0; + if (!S_ISDIR(VFS_I(sc->tempip)->i_mode) && + !S_ISREG(VFS_I(sc->tempip)->i_mode)) + return 0; + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + /* Metadir files are not accounted in quota, so drop icount */ + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L); + xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN); + + error = xrep_trans_commit(sc); + if (error) + goto out_ilock; + + xfs_iflags_set(sc->tempip, XFS_IRECOVERY); + xfs_qm_dqdetach(sc->tempip); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + +/* + * Remove this temporary file from the metadata directory tree so that it can + * be inactivated the normal way. + */ +STATIC int +xrep_tempfile_remove_metadir( + struct xfs_scrub *sc) +{ + int error; + + if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip)) + return 0; + + ASSERT(sc->tp == NULL); + + xfs_iflags_clear(sc->tempip, XFS_IRECOVERY); + + xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL); + sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; + + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_iolock; + + xrep_tempfile_ilock(sc); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + + xfs_metafile_clear_iflag(sc->tp, sc->tempip); + + /* Non-metadir files are accounted in quota, so bump bcount/icount */ + error = xfs_qm_dqattach_locked(sc->tempip, false); + if (error) + goto out_cancel; + + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L); + xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT, + sc->tempip->i_nblocks); + error = xrep_trans_commit(sc); + goto out_ilock; + +out_cancel: + xchk_trans_cancel(sc); +out_ilock: + xrep_tempfile_iunlock(sc); +out_iolock: + xrep_tempfile_iounlock(sc); + return error; +} + /* Take IOLOCK_EXCL on the temporary file, maybe. */ bool xrep_tempfile_iolock_nowait( @@ -290,6 +399,7 @@ xrep_tempfile_rele( sc->temp_ilock_flags = 0; } + xrep_tempfile_remove_metadir(sc); xchk_irele(sc, sc->tempip); sc->tempip = NULL; } @@ -844,6 +954,17 @@ xrep_is_tempfile( const struct xfs_inode *ip) { const struct inode *inode = &ip->i_vnode; + struct xfs_mount *mp = ip->i_mount; + + /* + * Files in the metadata directory tree also have S_PRIVATE set and + * IOP_XATTR unset, so we must distinguish them separately. We (ab)use + * the IRECOVERY flag to mark temporary metadir inodes knowing that the + * end of log recovery clears IRECOVERY, so the only ones that can + * exist during online repair are the ones we create. 
+ */ + if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA)) + return __xfs_iflags_test(ip, XFS_IRECOVERY); if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h index e51399f595fe..71c1b54599c3 100644 --- a/fs/xfs/scrub/tempfile.h +++ b/fs/xfs/scrub/tempfile.h @@ -10,6 +10,8 @@ int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode); void xrep_tempfile_rele(struct xfs_scrub *sc); +int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc); + bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc); int xrep_tempfile_iolock_polled(struct xfs_scrub *sc); void xrep_tempfile_iounlock(struct xfs_scrub *sc); @@ -42,6 +44,7 @@ static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) xchk_ilock(sc, XFS_IOLOCK_EXCL); } # define xrep_is_tempfile(ip) (false) +# define xrep_tempfile_adjust_directory_tree(sc) (0) # define xrep_tempfile_rele(sc) #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 4470ad0533b8..98f923ae664d 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -20,6 +20,7 @@ #include "xfs_dir2.h" #include "xfs_rmap.h" #include "xfs_parent.h" +#include "xfs_metafile.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c886d5d0eb02..d2ae7e93acb0 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -70,6 +70,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -101,7 +103,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ - { XFS_SCRUB_TYPE_BARRIER, "barrier" } + { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ + { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ + { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -601,7 +605,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, TP_fast_assign( xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; - __entry->ino = sc->ip->i_ino; + __entry->ino = cur->bc_ino.ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __assign_str(name); @@ -772,12 +776,12 @@ TRACE_EVENT(xchk_xref_error, ); TRACE_EVENT(xchk_iallocbt_check_cluster, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, xfs_daddr_t map_daddr, - unsigned short map_len, unsigned int chunk_ino, - unsigned int nr_inodes, uint16_t cluster_mask, - uint16_t holemask, unsigned int cluster_ino), - TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t startino, + xfs_daddr_t map_daddr, unsigned short map_len, + unsigned int chunk_ino, unsigned int nr_inodes, + uint16_t cluster_mask, uint16_t holemask, + unsigned int cluster_ino), + TP_ARGS(pag, startino, map_daddr, map_len, chunk_ino, nr_inodes, cluster_mask, holemask, cluster_ino), TP_STRUCT__entry( __field(dev_t, dev) @@ -792,8 +796,8 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __field(uint16_t, holemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = 
agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = startino; __entry->map_daddr = map_daddr; __entry->map_len = map_len; @@ -922,7 +926,8 @@ DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); TRACE_EVENT(xchk_refcount_incorrect, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_refcount_irec *irec, xfs_nlink_t seen), TP_ARGS(pag, irec, seen), TP_STRUCT__entry( @@ -935,8 +940,8 @@ TRACE_EVENT(xchk_refcount_incorrect, __field(xfs_nlink_t, seen) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -1752,6 +1757,7 @@ DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent); DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step); +DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree); TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING); TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE); @@ -1914,11 +1920,44 @@ TRACE_EVENT(xchk_dirtree_live_update, __get_str(name)) ); +DECLARE_EVENT_CLASS(xchk_metapath_class, + TP_PROTO(struct xfs_scrub *sc, const char *path, + struct xfs_inode *dp, xfs_ino_t ino), + TP_ARGS(sc, path, dp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, scrub_ino) + __field(xfs_ino_t, parent_ino) + __field(xfs_ino_t, ino) + __string(name, path) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO; + __entry->parent_ino = dp ? 
dp->i_ino : NULLFSINO; + __entry->ino = ino; + __assign_str(name); + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->scrub_ino, + __entry->parent_ino, + __get_str(name), + __entry->ino) +); +#define DEFINE_XCHK_METAPATH_EVENT(name) \ +DEFINE_EVENT(xchk_metapath_class, name, \ + TP_PROTO(struct xfs_scrub *sc, const char *path, \ + struct xfs_inode *dp, xfs_ino_t ino), \ + TP_ARGS(sc, path, dp, ino)) +DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t len), TP_ARGS(pag, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1927,8 +1966,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; ), @@ -1940,7 +1979,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class, ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ TP_ARGS(pag, agbno, len)) DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); @@ -1949,8 +1989,8 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); DECLARE_EVENT_CLASS(xrep_reap_find_class, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, - bool crosslinked), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t len, bool crosslinked), TP_ARGS(pag, agbno, len, crosslinked), TP_STRUCT__entry( __field(dev_t, dev) @@ -1960,8 +2000,8 @@ DECLARE_EVENT_CLASS(xrep_reap_find_class, __field(bool, crosslinked) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->crosslinked = crosslinked; @@ -1975,17 +2015,15 @@ DECLARE_EVENT_CLASS(xrep_reap_find_class, ); #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ DEFINE_EVENT(xrep_reap_find_class, name, \ - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ - bool crosslinked), \ + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + xfs_extlen_t len, bool crosslinked), \ TP_ARGS(pag, agbno, len, crosslinked)) DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); -DECLARE_EVENT_CLASS(xrep_rmap_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - uint64_t owner, uint64_t offset, unsigned int flags), - TP_ARGS(mp, agno, agbno, len, owner, offset, flags), +TRACE_EVENT(xrep_ibt_walk_rmap, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1996,13 +2034,13 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - 
__entry->len = len; - __entry->owner = owner; - __entry->offset = offset; - __entry->flags = flags; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; ), TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2013,19 +2051,11 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __entry->offset, __entry->flags) ); -#define DEFINE_REPAIR_RMAP_EVENT(name) \ -DEFINE_EVENT(xrep_rmap_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - uint64_t owner, uint64_t offset, unsigned int flags), \ - TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_alloc_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2033,8 +2063,8 @@ TRACE_EVENT(xrep_abt_found, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startblock = rec->ar_startblock; __entry->blockcount = rec->ar_blockcount; ), @@ -2046,9 +2076,9 @@ TRACE_EVENT(xrep_abt_found, ) TRACE_EVENT(xrep_ibt_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, const struct xfs_inobt_rec_incore *rec), - TP_ARGS(mp, agno, rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2059,8 +2089,8 @@ TRACE_EVENT(xrep_ibt_found, __field(uint64_t, freemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = rec->ir_startino; __entry->holemask = rec->ir_holemask; __entry->count = rec->ir_count; @@ -2078,7 +2108,8 @@ TRACE_EVENT(xrep_ibt_found, ) TRACE_EVENT(xrep_refc_found, - TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_refcount_irec *rec), TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) @@ -2089,8 +2120,8 @@ TRACE_EVENT(xrep_refc_found, __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->domain = rec->rc_domain; __entry->startblock = rec->rc_startblock; __entry->blockcount = rec->rc_blockcount; @@ -2138,9 +2169,8 @@ TRACE_EVENT(xrep_bmap_found, ); TRACE_EVENT(xrep_rmap_found, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - const struct xfs_rmap_irec *rec), - TP_ARGS(mp, agno, rec), + TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2151,8 +2181,8 @@ TRACE_EVENT(xrep_rmap_found, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); 
__entry->agbno = rec->rm_startblock; __entry->len = rec->rm_blockcount; __entry->owner = rec->rm_owner; @@ -2170,9 +2200,9 @@ TRACE_EVENT(xrep_rmap_found, ); TRACE_EVENT(xrep_findroot_block, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, uint32_t magic, uint16_t level), - TP_ARGS(mp, agno, agbno, magic, level), + TP_ARGS(pag, agbno, magic, level), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2181,8 +2211,8 @@ TRACE_EVENT(xrep_findroot_block, __field(uint16_t, level) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->magic = magic; __entry->level = level; @@ -2195,10 +2225,10 @@ TRACE_EVENT(xrep_findroot_block, __entry->level) ) TRACE_EVENT(xrep_calc_ag_resblks, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + TP_PROTO(const struct xfs_perag *pag, xfs_agino_t icount, + xfs_agblock_t aglen, xfs_agblock_t freelen, xfs_agblock_t usedlen), - TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_ARGS(pag, icount, aglen, freelen, usedlen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2208,8 +2238,8 @@ TRACE_EVENT(xrep_calc_ag_resblks, __field(xfs_agblock_t, usedlen) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->icount = icount; __entry->aglen = aglen; __entry->freelen = freelen; @@ -2224,10 +2254,10 @@ TRACE_EVENT(xrep_calc_ag_resblks, __entry->usedlen) ) TRACE_EVENT(xrep_calc_ag_resblks_btsize, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, - xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), - TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t bnobt_sz, + xfs_agblock_t inobt_sz, xfs_agblock_t rmapbt_sz, + xfs_agblock_t refcbt_sz), + TP_ARGS(pag, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2237,8 +2267,8 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __field(xfs_agblock_t, refcbt_sz) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bnobt_sz = bnobt_sz; __entry->inobt_sz = inobt_sz; __entry->rmapbt_sz = rmapbt_sz; @@ -2278,10 +2308,9 @@ TRACE_EVENT(xrep_reset_counters, ) DECLARE_EVENT_CLASS(xrep_newbt_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - int64_t owner), - TP_ARGS(mp, agno, agbno, len, owner), + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t len, int64_t owner), + TP_ARGS(pag, agbno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2290,8 +2319,8 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class, __field(int64_t, owner) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->owner = owner; @@ -2305,10 +2334,9 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class, ); #define DEFINE_NEWBT_EXTENT_EVENT(name) \ 
DEFINE_EVENT(xrep_newbt_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len, \ - int64_t owner), \ - TP_ARGS(mp, agno, agbno, len, owner)) + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ + xfs_extlen_t len, int64_t owner), \ + TP_ARGS(pag, agbno, len, owner)) DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); @@ -2596,7 +2624,7 @@ TRACE_EVENT(xrep_cow_replace_mapping, ); TRACE_EVENT(xrep_cow_free_staging, - TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t blockcount), TP_ARGS(pag, agbno, blockcount), TP_STRUCT__entry( @@ -2606,8 +2634,8 @@ TRACE_EVENT(xrep_cow_free_staging, __field(xfs_extlen_t, blockcount) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->blockcount = blockcount; ), @@ -2652,9 +2680,9 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); TRACE_EVENT(xrep_rmap_live_update, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + TP_PROTO(const struct xfs_perag *pag, unsigned int op, const struct xfs_rmap_update_params *p), - TP_ARGS(mp, agno, op, p), + TP_ARGS(pag, op, p), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2666,8 +2694,8 @@ TRACE_EVENT(xrep_rmap_live_update, __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->op = op; __entry->agbno = p->startblock; __entry->len = p->blockcount; @@ -3313,7 +3341,7 @@ DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild); DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork); TRACE_EVENT(xrep_iunlink_visit, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t bucket_agino, struct xfs_inode *ip), TP_ARGS(pag, bucket, bucket_agino, ip), TP_STRUCT__entry( @@ -3326,9 +3354,9 @@ TRACE_EVENT(xrep_iunlink_visit, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino); + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = XFS_INO_TO_AGINO(pag_mount(pag), ip->i_ino); __entry->bucket = bucket; __entry->bucket_agino = bucket_agino; __entry->prev_agino = ip->i_prev_unlinked; @@ -3403,7 +3431,7 @@ TRACE_EVENT(xrep_iunlink_reload_ondisk, ); TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t prev_agino, xfs_agino_t next_agino), TP_ARGS(pag, bucket, prev_agino, next_agino), TP_STRUCT__entry( @@ -3414,8 +3442,8 @@ TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->prev_agino = prev_agino; __entry->next_agino = next_agino; @@ -3429,7 +3457,7 @@ 
TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket, ); DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t prev_agino, xfs_agino_t next_agino), TP_ARGS(pag, bucket, prev_agino, next_agino), TP_STRUCT__entry( @@ -3440,8 +3468,8 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->prev_agino = prev_agino; __entry->next_agino = next_agino; @@ -3455,7 +3483,7 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class, ); #define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \ DEFINE_EVENT(xrep_iunlink_resolve_class, name, \ - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \ + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, \ xfs_agino_t prev_agino, xfs_agino_t next_agino), \ TP_ARGS(pag, bucket, prev_agino, next_agino)) DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached); @@ -3516,7 +3544,7 @@ TRACE_EVENT(xrep_iunlink_relink_prev, ); TRACE_EVENT(xrep_iunlink_add_to_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t agino, xfs_agino_t curr_head), TP_ARGS(pag, bucket, agino, curr_head), TP_STRUCT__entry( @@ -3527,8 +3555,8 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket, __field(xfs_agino_t, next_agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->agino = agino; __entry->next_agino = curr_head; @@ -3542,7 +3570,7 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket, ); TRACE_EVENT(xrep_iunlink_commit_bucket, - TP_PROTO(struct xfs_perag *pag, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t old_agino, xfs_agino_t agino), TP_ARGS(pag, bucket, old_agino, agino), TP_STRUCT__entry( @@ -3553,8 +3581,8 @@ TRACE_EVENT(xrep_iunlink_commit_bucket, __field(xfs_agino_t, agino) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->old_agino = old_agino; __entry->agino = agino; @@ -3572,6 +3600,11 @@ DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path); DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption); DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink); +DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6dead20338e2..559a3a577097 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -116,7 +116,7 @@ xfs_end_ioend( if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); - xfs_bmap_punch_delalloc_range(ip, offset, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); } goto done; @@ -456,7 +456,7 @@ xfs_discard_folio( * byte of the next folio. 
Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ - xfs_bmap_punch_delalloc_range(ip, pos, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, folio_pos(folio) + folio_size(folio)); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 7db386304875..379b48d015d2 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -114,7 +114,8 @@ xfs_attr_shortform_list( * It didn't all fit, so we have to sort everything on hashval. */ sbsize = sf->count * sizeof(*sbuf); - sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL); + sbp = sbuf = kmalloc(sbsize, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 35a8c1b8b3cb..3d52e9d7ad57 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -318,14 +318,16 @@ xfs_bmap_update_create_done( return &budp->bud_item; } -/* Take a passive ref to the AG containing the space we're mapping. */ +/* Take a passive ref to the group containing the space we're mapping. */ static inline void xfs_bmap_update_get_group( struct xfs_mount *mp, struct xfs_bmap_intent *bi) { + enum xfs_group_type type = XG_TYPE_AG; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) - return; + type = XG_TYPE_RTG; /* * Bump the intent count on behalf of the deferred rmap and refcount @@ -334,7 +336,8 @@ xfs_bmap_update_get_group( * intent drops the intent count, ensuring that the intent count * remains nonzero across the transaction roll. */ - bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock); + bi->bi_group = xfs_group_intent_get(mp, bi->bi_bmap.br_startblock, + type); } /* Add this deferred BUI to the transaction. */ @@ -343,8 +346,6 @@ xfs_bmap_defer_add( struct xfs_trans *tp, struct xfs_bmap_intent *bi) { - trace_xfs_bmap_defer(bi); - xfs_bmap_update_get_group(tp->t_mountp, bi); /* @@ -357,18 +358,9 @@ xfs_bmap_defer_add( */ if (bi->bi_type == XFS_BMAP_MAP) bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount; - xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); -} - -/* Release a passive AG ref after finishing mapping work. */ -static inline void -xfs_bmap_update_put_group( - struct xfs_bmap_intent *bi) -{ - if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) - return; - xfs_perag_intent_put(bi->bi_pag); + trace_xfs_bmap_defer(bi); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); } /* Cancel a deferred bmap update. */ @@ -381,7 +373,7 @@ xfs_bmap_update_cancel_item( if (bi->bi_type == XFS_BMAP_MAP) bi->bi_owner->i_delayed_blks -= bi->bi_bmap.br_blockcount; - xfs_bmap_update_put_group(bi); + xfs_group_intent_put(bi->bi_group); kmem_cache_free(xfs_bmap_intent_cache, bi); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 053d567c9108..0836fea2d6d8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -29,6 +29,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* Kernel only BMAP related definitions and functions */ @@ -41,16 +42,12 @@ xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { if (XFS_IS_REALTIME_INODE(ip)) - return XFS_FSB_TO_BB(ip->i_mount, fsb); + return xfs_rtb_to_daddr(ip->i_mount, fsb); return XFS_FSB_TO_DADDR(ip->i_mount, fsb); } /* * Routine to zero an extent on disk allocated to the specific inode. 
- * - * The VFS functions take a linearised filesystem block offset, so we have to - * convert the sparse xfs fsb to the right format first. - * VFS types are real funky, too. */ int xfs_zero_extent( @@ -58,15 +55,10 @@ xfs_zero_extent( xfs_fsblock_t start_fsb, xfs_off_t count_fsb) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); - sector_t block = XFS_BB_TO_FSBT(mp, sector); - - return blkdev_issue_zeroout(target->bt_bdev, - block << (mp->m_super->s_blocksize_bits - 9), - count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_KERNEL, 0); + return blkdev_issue_zeroout(xfs_inode_buftarg(ip)->bt_bdev, + xfs_fsb_to_db(ip, start_fsb), + XFS_FSB_TO_BB(ip->i_mount, count_fsb), + GFP_KERNEL, 0); } /* @@ -111,7 +103,7 @@ xfs_bmap_count_blocks( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; - xfs_extlen_t btblocks = 0; + xfs_filblks_t btblocks = 0; int error; *nextents = 0; @@ -442,11 +434,12 @@ out_unlock_iolock: void xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, + int whichfork, xfs_off_t start_byte, xfs_off_t end_byte) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); struct xfs_bmbt_irec got, del; @@ -474,11 +467,14 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } + if (whichfork == XFS_COW_FORK && !ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -536,16 +532,20 @@ xfs_can_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (xfs_inode_has_bigrtalloc(ip)) - end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); + end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; /* - * Check if there is an post-EOF extent to free. + * Check if there is an post-EOF extent to free. If there are any + * delalloc blocks attached to the inode (data fork delalloc + * reservations or CoW extents of any kind), we need to free them so + * that inactivation doesn't fail to erase them. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap)) + if (ip->i_delayed_blks || + xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap)) found_blocks = true; xfs_iunlock(ip, XFS_ILOCK_SHARED); return found_blocks; @@ -580,7 +580,7 @@ xfs_free_eofblocks( */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_delayed_blks) { - xfs_bmap_punch_delalloc_range(ip, + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), LLONG_MAX); } @@ -854,8 +854,8 @@ xfs_free_file_space( /* We can only free complete realtime extents. 
*/ if (xfs_inode_has_bigrtalloc(ip)) { - startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); - endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); + startoffset_fsb = xfs_fileoff_roundup_rtx(mp, startoffset_fsb); + endoffset_fsb = xfs_fileoff_rounddown_rtx(mp, endoffset_fsb); } /* @@ -1523,6 +1523,18 @@ xfs_swap_extents( goto out_unlock; } + /* + * The rmapbt implementation is unable to resume a swapext operation + * after a crash if the allocation unit size is larger than a block. + * This (deprecated) interface will not be upgraded to handle this + * situation. Defragmentation must be performed with the commit range + * ioctl. + */ + if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(ip->i_mount)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_qm_dqattach(ip); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index eb0895bfb9da..b29760d36e1a 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) } #endif /* CONFIG_XFS_RT */ -void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, xfs_off_t end_byte); struct kgetbmap { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..aa63b8efd782 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1857,7 +1857,6 @@ static enum lru_status xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { @@ -1956,7 +1955,6 @@ static enum lru_status xfs_buftarg_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); @@ -2115,6 +2113,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + if (bdev_can_atomic_write(btp->bt_bdev)) { + btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( + btp->bt_bdev); + btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( + btp->bt_bdev); + } + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. 
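The xfs_alloc_buftarg() hunk above caches the block device's atomic write unit limits in the buftarg when the device advertises support. A rough user-space model of caching and later checking those limits, with stand-in helpers and types instead of the real block-layer calls:

#include <stdbool.h>
#include <stdio.h>

struct sim_buftarg {
	unsigned int	awu_min;	/* 0 when atomic writes unsupported */
	unsigned int	awu_max;
};

static void sim_buftarg_init(struct sim_buftarg *btp, bool can_atomic,
			     unsigned int dev_min, unsigned int dev_max)
{
	if (can_atomic) {
		btp->awu_min = dev_min;
		btp->awu_max = dev_max;
	} else {
		btp->awu_min = 0;
		btp->awu_max = 0;
	}
}

static bool sim_can_atomic_write(const struct sim_buftarg *btp, unsigned int len)
{
	return btp->awu_min && len >= btp->awu_min && len <= btp->awu_max;
}

int main(void)
{
	struct sim_buftarg btp;

	sim_buftarg_init(&btp, true, 512, 65536);
	printf("4096 bytes atomic? %d\n", sim_can_atomic_write(&btp, 4096));
	printf("128k bytes atomic? %d\n", sim_can_atomic_write(&btp, 131072));
	return 0;
}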
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..3d56bc7a35cc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -124,6 +124,10 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 09e893cf563c..3d0c6402cb36 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -22,6 +22,11 @@ #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_quota.h" +#include "xfs_alloc.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_rtgroup.h" +#include "xfs_rtbitmap.h" /* * This is the number of entries in the l_buf_cancel_table used during @@ -390,9 +395,18 @@ xlog_recover_validate_buf_type( break; #ifdef CONFIG_XFS_RT case XFS_BLFT_RTBITMAP_BUF: + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) { + warnmsg = "Bad rtbitmap magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP); + break; case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; + if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) { + warnmsg = "Bad rtsummary magic!"; + break; + } + bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY); break; #endif /* CONFIG_XFS_RT */ default: @@ -685,6 +699,90 @@ xlog_recover_do_inode_buffer( } /* + * Update the in-memory superblock and perag structures from the primary SB + * buffer. + * + * This is required because transactions running after growfs may require the + * updated values to be set in a previous fully commit transaction. + */ +static int +xlog_recover_do_primary_sb_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_dsb *dsb = bp->b_addr; + xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; + xfs_rgnumber_t orig_rgcount = mp->m_sb.sb_rgcount; + int error; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + + if (orig_agcount == 0) { + xfs_alert(mp, "Trying to grow file system without AGs"); + return -EFSCORRUPTED; + } + + /* + * Update the in-core super block from the freshly recovered on-disk one. + */ + xfs_sb_from_disk(&mp->m_sb, dsb); + + if (mp->m_sb.sb_agcount < orig_agcount) { + xfs_alert(mp, "Shrinking AG count in log recovery not supported"); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < orig_rgcount) { + xfs_warn(mp, + "Shrinking rtgroup count in log recovery not supported"); + return -EFSCORRUPTED; + } + + /* + * If the last AG was grown or shrunk, we also need to update the + * length in the in-core perag structure and values depending on it. + */ + error = xfs_update_last_ag_size(mp, orig_agcount); + if (error) + return error; + + /* + * If the last rtgroup was grown or shrunk, we also need to update the + * length in the in-core rtgroup structure and values depending on it. + * Ignore this on any filesystem with zero rtgroups. + */ + if (orig_rgcount > 0) { + error = xfs_update_last_rtgroup_size(mp, orig_rgcount); + if (error) + return error; + } + + /* + * Initialize the new perags, and also update various block and inode + * allocator setting based off the number of AGs or total blocks. + * Because of the latter this also needs to happen if the agcount did + * not change. 
+ */ + error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed recovery per-ag init: %d", error); + return error; + } + mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + + error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); + if (error) { + xfs_warn(mp, "Failed recovery rtgroup init: %d", error); + return error; + } + return 0; +} + +/* * V5 filesystems know the age of the buffer on disk being recovered. We can * have newer objects on disk than we are replaying, and so for these cases we * don't want to replay the current change as that will make the buffer contents @@ -727,11 +825,20 @@ xlog_recover_get_buf_lsn( * UUIDs, so we must recover them immediately. */ blft = xfs_blft_from_flags(buf_f); - if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) + if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF || + blft == XFS_BLFT_RTSUMMARY_BUF)) goto recover_immediately; magic32 = be32_to_cpu(*(__be32 *)blk); switch (magic32) { + case XFS_RTSUMMARY_MAGIC: + case XFS_RTBITMAP_MAGIC: { + struct xfs_rtbuf_blkinfo *hdr = blk; + + lsn = be64_to_cpu(hdr->rt_lsn); + uuid = &hdr->rt_uuid; + break; + } case XFS_ABTB_CRC_MAGIC: case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: @@ -967,6 +1074,24 @@ xlog_recover_buf_commit_pass2( dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); if (!dirty) goto out_release; + } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) && + xfs_buf_daddr(bp) == 0) { + error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, + current_lsn); + if (error) + goto out_release; + + /* Update the rt superblock if we have one. */ + if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) { + struct xfs_buf *rtsb_bp = mp->m_rtsb_bp; + + xfs_buf_lock(rtsb_bp); + xfs_buf_hold(rtsb_bp); + xfs_update_rtsb(rtsb_bp, bp); + rtsb_bp->b_flags |= _XBF_LOGRECOVERY; + xfs_buf_delwri_queue(rtsb_bp, buffer_list); + xfs_buf_relse(rtsb_bp); + } } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d8c4a5dcca7a..c4bd145f5ec1 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -21,6 +21,7 @@ #include "xfs_ag.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -72,6 +73,8 @@ * extent search so that it overlaps in flight discard IO. */ +#define XFS_DISCARD_MAX_EXAMINE (100) + struct workqueue_struct *xfs_discard_wq; static void @@ -81,7 +84,7 @@ xfs_discard_endio_work( struct xfs_busy_extents *extents = container_of(work, struct xfs_busy_extents, endio_work); - xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + xfs_extent_busy_clear(&extents->extent_list, false); kfree(extents->owner); } @@ -100,6 +103,24 @@ xfs_discard_endio( bio_put(bio); } +static inline struct block_device * +xfs_group_bdev( + const struct xfs_group *xg) +{ + struct xfs_mount *mp = xg->xg_mount; + + switch (xg->xg_type) { + case XG_TYPE_AG: + return mp->m_ddev_targp->bt_bdev; + case XG_TYPE_RTG: + return mp->m_rtdev_targp->bt_bdev; + default: + ASSERT(0); + break; + } + return NULL; +} + /* * Walk the discard list and issue discards on all the busy extents in the * list. 
We plug and chain the bios so that we only need a single completion @@ -117,11 +138,11 @@ xfs_discard_extents( blk_start_plug(&plug); list_for_each_entry(busyp, &extents->extent_list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); + trace_xfs_discard_extent(busyp->group, busyp->bno, + busyp->length); - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + error = __blkdev_issue_discard(xfs_group_bdev(busyp->group), + xfs_gbno_to_daddr(busyp->group, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_KERNEL, &bio); if (error && error != -EOPNOTSUPP) { @@ -160,13 +181,13 @@ xfs_trim_gather_extents( struct xfs_trim_cur *tcur, struct xfs_busy_extents *extents) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_trans *tp; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; - int batch = 100; + int batch = XFS_DISCARD_MAX_EXAMINE; /* * Force out the log. This means any transactions that might have freed @@ -239,11 +260,11 @@ xfs_trim_gather_extents( * overlapping ranges for now. */ if (fbno + flen < tcur->start) { - trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_exclude(pag_group(pag), fbno, flen); goto next_extent; } if (fbno > tcur->end) { - trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_exclude(pag_group(pag), fbno, flen); if (tcur->by_bno) { tcur->count = 0; break; @@ -261,7 +282,7 @@ xfs_trim_gather_extents( /* Too small? Give up. */ if (flen < tcur->minlen) { - trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + trace_xfs_discard_toosmall(pag_group(pag), fbno, flen); if (tcur->by_bno) goto next_extent; tcur->count = 0; @@ -272,12 +293,12 @@ xfs_trim_gather_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_extent_busy_search(mp, pag, fbno, flen)) { - trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen); + if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) { + trace_xfs_discard_busy(pag_group(pag), fbno, flen); goto next_extent; } - xfs_extent_busy_insert_discard(pag, fbno, flen, + xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen, &extents->extent_list); next_extent: if (tcur->by_bno) @@ -301,7 +322,7 @@ next_extent: * we aren't going to issue a discard on them any more. */ if (error) - xfs_extent_busy_clear(mp, &extents->extent_list, false); + xfs_extent_busy_clear(&extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); out_trans_cancel: @@ -335,7 +356,7 @@ xfs_trim_perag_extents( }; int error = 0; - if (start != 0 || end != pag->block_count) + if (start != 0 || end != pag_group(pag)->xg_block_count) tcur.by_bno = true; do { @@ -347,7 +368,6 @@ xfs_trim_perag_extents( break; } - extents->mount = pag->pag_mount; extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); @@ -367,7 +387,7 @@ xfs_trim_perag_extents( * list after this function call, as it may have been freed by * the time control returns to us. 
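The gather loops in this file examine at most XFS_DISCARD_MAX_EXAMINE extents before handing the batch off for discard and recording where to resume. A small sketch of that gather-and-restart pattern over a plain array (all names invented, a tiny batch size for demonstration):

#include <stdio.h>

#define MAX_EXAMINE	3	/* tiny batch for demonstration */

struct sim_extent {
	unsigned long long	start;
	unsigned long long	len;
};

/* Returns the number of extents queued; *cursor advances past them. */
static int gather_batch(const struct sim_extent *free_exts, int nfree,
			int *cursor, unsigned long long minlen,
			struct sim_extent *out)
{
	int queued = 0, examined = 0;

	while (*cursor < nfree && examined < MAX_EXAMINE) {
		const struct sim_extent *ext = &free_exts[(*cursor)++];

		examined++;
		if (ext->len < minlen)
			continue;	/* too small, skip */
		out[queued++] = *ext;
	}
	return queued;
}

int main(void)
{
	struct sim_extent free_exts[] = {
		{ 0, 1 }, { 10, 8 }, { 30, 2 }, { 50, 16 }, { 90, 4 },
	};
	struct sim_extent batch[MAX_EXAMINE];
	int cursor = 0, n;

	while ((n = gather_batch(free_exts, 5, &cursor, 4, batch)) || cursor < 5) {
		printf("batch of %d extents gathered, cursor now %d\n", n, cursor);
		/* a real caller would issue discards for the batch here */
	}
	return 0;
}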
*/ - error = xfs_discard_extents(pag->pag_mount, extents); + error = xfs_discard_extents(pag_mount(pag), extents); if (error) break; @@ -389,8 +409,8 @@ xfs_trim_datadev_extents( { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; + struct xfs_perag *pag = NULL; xfs_daddr_t ddev_end; - struct xfs_perag *pag; int last_error = 0, error; ddev_end = min_t(xfs_daddr_t, end, @@ -401,10 +421,10 @@ xfs_trim_datadev_extents( end_agno = xfs_daddr_to_agno(mp, ddev_end); end_agbno = xfs_daddr_to_agbno(mp, ddev_end); - for_each_perag_range(mp, start_agno, end_agno, pag) { - xfs_agblock_t agend = pag->block_count; + while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) { + xfs_agblock_t agend = pag_group(pag)->xg_block_count; - if (start_agno == end_agno) + if (pag_agno(pag) == end_agno) agend = end_agbno; error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) @@ -479,7 +499,7 @@ xfs_discard_rtdev_extents( trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); error = __blkdev_issue_discard(bdev, - XFS_FSB_TO_BB(mp, busyp->bno), + xfs_rtb_to_daddr(mp, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_NOFS, &bio); if (error) @@ -506,7 +526,7 @@ xfs_discard_rtdev_extents( static int xfs_trim_gather_rtextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -525,12 +545,12 @@ xfs_trim_gather_rtextent( return -ECANCELED; } - rbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); + rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount); /* Ignore too small. */ if (rlen < tr->minlen_fsb) { - trace_xfs_discard_rttoosmall(mp, rbno, rlen); + trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen); return 0; } @@ -547,70 +567,185 @@ xfs_trim_gather_rtextent( return 0; } +/* Trim extents on an !rtgroups realtime device */ static int -xfs_trim_rtdev_extents( - struct xfs_mount *mp, - xfs_daddr_t start, - xfs_daddr_t end, +xfs_trim_rtextents( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t low, + xfs_rtxnum_t high, xfs_daddr_t minlen) { + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_trim_rtdev tr = { .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + .extent_list = LIST_HEAD_INIT(tr.extent_list), }; - xfs_rtxnum_t low, high; struct xfs_trans *tp; - xfs_daddr_t rtdev_daddr; int error; - INIT_LIST_HEAD(&tr.extent_list); - - /* Shift the start and end downwards to match the rt device. */ - rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - if (start > rtdev_daddr) - start -= rtdev_daddr; - else - start = 0; - - if (end <= rtdev_daddr) - return 0; - end -= rtdev_daddr; - error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - end = min_t(xfs_daddr_t, end, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1); - - /* Convert the rt blocks to rt extents */ - low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start)); - high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end)); - /* * Walk the free ranges between low and high. The query_range function * trims the extents returned. 
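xfs_trim_datadev_extents() above walks the per-AG groups covered by the caller's range, starting the first group at the requested offset and clamping the last one to the requested end. A toy version of that range-splitting arithmetic with a fixed, made-up group size:

#include <stdio.h>

#define BLOCKS_PER_GROUP	100ULL

int main(void)
{
	unsigned long long start = 150, end = 430;	/* device blocks */
	unsigned long long start_gno = start / BLOCKS_PER_GROUP;
	unsigned long long end_gno = end / BLOCKS_PER_GROUP;
	unsigned long long gno;

	for (gno = start_gno; gno <= end_gno; gno++) {
		/* first group starts mid-way, later groups start at 0 */
		unsigned long long gstart =
			(gno == start_gno) ? start % BLOCKS_PER_GROUP : 0;
		/* last group is clamped to the caller's end offset */
		unsigned long long gend =
			(gno == end_gno) ? end % BLOCKS_PER_GROUP : BLOCKS_PER_GROUP;

		printf("group %llu: trim blocks [%llu, %llu)\n", gno, gstart, gend);
	}
	return 0;
}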
*/ do { - tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY); - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_range(mp, tp, low, high, + tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, low, high, xfs_trim_gather_rtextent, &tr); if (error == -ECANCELED) error = 0; if (error) { - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); xfs_discard_free_rtdev_extents(&tr); break; } if (list_empty(&tr.extent_list)) { - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); break; } error = xfs_discard_rtdev_extents(mp, &tr); - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) + break; + + low = tr.restart_rtx; + } while (!xfs_trim_should_stop() && low <= high); + + xfs_trans_cancel(tp); + return error; +} + +struct xfs_trim_rtgroup { + /* list of rtgroup extents to free */ + struct xfs_busy_extents *extents; + + /* minimum length that caller allows us to trim */ + xfs_rtblock_t minlen_fsb; + + /* restart point for the rtbitmap walk */ + xfs_rtxnum_t restart_rtx; + + /* number of extents to examine before stopping to issue discard ios */ + int batch; + + /* number of extents queued for discard */ + int queued; +}; + +static int +xfs_trim_gather_rtgroup_extent( + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_trim_rtgroup *tr = priv; + xfs_rgblock_t rgbno; + xfs_extlen_t len; + + if (--tr->batch <= 0) { + /* + * If we've checked a large number of extents, update the + * cursor to point at this extent so we restart the next batch + * from this extent. + */ + tr->restart_rtx = rec->ar_startext; + return -ECANCELED; + } + + rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext); + len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); + + /* Ignore too small. */ + if (len < tr->minlen_fsb) { + trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len); + return 0; + } + + /* + * If any blocks in the range are still busy, skip the discard and try + * again the next time. + */ + if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) { + trace_xfs_discard_busy(rtg_group(rtg), rgbno, len); + return 0; + } + + xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len, + &tr->extents->extent_list); + + tr->queued++; + tr->restart_rtx = rec->ar_startext + rec->ar_extcount; + return 0; +} + +/* Trim extents in this rtgroup using the busy extent machinery. */ +static int +xfs_trim_rtgroup_extents( + struct xfs_rtgroup *rtg, + xfs_rtxnum_t low, + xfs_rtxnum_t high, + xfs_daddr_t minlen) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_trim_rtgroup tr = { + .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), + }; + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + /* + * Walk the free ranges between low and high. The query_range function + * trims the extents returned. 
+ */ + do { + tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL); + if (!tr.extents) { + error = -ENOMEM; + break; + } + + tr.queued = 0; + tr.batch = XFS_DISCARD_MAX_EXAMINE; + tr.extents->owner = tr.extents; + INIT_LIST_HEAD(&tr.extents->extent_list); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, low, high, + xfs_trim_gather_rtgroup_extent, &tr); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error == -ECANCELED) + error = 0; + if (error) { + kfree(tr.extents); + break; + } + + if (!tr.queued) + break; + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with + * gathering the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(rtg_mount(rtg), tr.extents); if (error) break; @@ -620,6 +755,63 @@ xfs_trim_rtdev_extents( xfs_trans_cancel(tp); return error; } + +static int +xfs_trim_rtdev_extents( + struct xfs_mount *mp, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen) +{ + xfs_rtblock_t start_rtbno, end_rtbno; + xfs_rtxnum_t start_rtx, end_rtx; + xfs_rgnumber_t start_rgno, end_rgno; + xfs_daddr_t daddr_offset; + int last_error = 0, error; + struct xfs_rtgroup *rtg = NULL; + + /* Shift the start and end downwards to match the rt device. */ + daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (start > daddr_offset) + start -= daddr_offset; + else + start = 0; + start_rtbno = xfs_daddr_to_rtb(mp, start); + start_rtx = xfs_rtb_to_rtx(mp, start_rtbno); + start_rgno = xfs_rtb_to_rgno(mp, start_rtbno); + + if (end <= daddr_offset) + return 0; + else + end -= daddr_offset; + end_rtbno = xfs_daddr_to_rtb(mp, end); + end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1); + end_rgno = xfs_rtb_to_rgno(mp, end_rtbno); + + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) { + xfs_rtxnum_t rtg_end = rtg->rtg_extents; + + if (rtg_rgno(rtg) == end_rgno) + rtg_end = min(rtg_end, end_rtx); + + if (xfs_has_rtgroups(mp)) + error = xfs_trim_rtgroup_extents(rtg, start_rtx, + rtg_end, minlen); + else + error = xfs_trim_rtextents(rtg, start_rtx, rtg_end, + minlen); + if (error) + last_error = error; + + if (xfs_trim_should_stop()) { + xfs_rtgroup_rele(rtg); + break; + } + start_rtx = 0; + } + + return last_error; +} #else # define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c1b211c260a9..201c26322ede 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -69,6 +69,31 @@ xfs_dquot_mark_sick( } /* + * Detach the dquot buffer if it's still attached, because we can get called + * through dqpurge after a log shutdown. Caller must hold the dqflock or have + * otherwise isolated the dquot. 
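xfs_dquot_detach_buf(), whose body follows, steals the attached buffer pointer while holding the item spinlock and does the heavier buffer teardown only after dropping it. A pthread-based sketch of the same hand-off pattern (types simplified and all names invented; a mutex stands in for the spinlock):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct sim_buf {
	int	refcount;
};

struct sim_logitem {
	pthread_mutex_t	lock;
	struct sim_buf	*buf;		/* may be NULL */
};

static void sim_detach_buf(struct sim_logitem *li)
{
	struct sim_buf *bp = NULL;

	pthread_mutex_lock(&li->lock);
	if (li->buf) {
		bp = li->buf;
		li->buf = NULL;		/* item no longer references it */
	}
	pthread_mutex_unlock(&li->lock);

	if (bp) {
		/* expensive work happens outside the item lock */
		bp->refcount--;
		if (bp->refcount == 0)
			free(bp);
	}
}

int main(void)
{
	struct sim_logitem li = { .lock = PTHREAD_MUTEX_INITIALIZER };

	li.buf = calloc(1, sizeof(*li.buf));
	li.buf->refcount = 1;
	sim_detach_buf(&li);
	printf("buffer detached: %s\n", li.buf ? "no" : "yes");
	return 0;
}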
+ */ +void +xfs_dquot_detach_buf( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_buf *bp = NULL; + + spin_lock(&qlip->qli_lock); + if (qlip->qli_item.li_buf) { + bp = qlip->qli_item.li_buf; + qlip->qli_item.li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) { + xfs_buf_lock(bp); + list_del_init(&qlip->qli_item.li_bio_list); + xfs_buf_relse(bp); + } +} + +/* * This is called to free all the memory associated with a dquot */ void @@ -76,6 +101,7 @@ xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); + ASSERT(dqp->q_logitem.qli_item.li_buf == NULL); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); @@ -277,6 +303,25 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_set_prealloc( + struct xfs_dquot_pre *pre, + const struct xfs_dquot_res *res) +{ + xfs_qcnt_t space; + + pre->q_prealloc_hi_wmark = res->hardlimit; + pre->q_prealloc_lo_wmark = res->softlimit; + + space = div_u64(pre->q_prealloc_hi_wmark, 100); + if (!pre->q_prealloc_lo_wmark) + pre->q_prealloc_lo_wmark = space * 95; + + pre->q_low_space[XFS_QLOWSP_1_PCNT] = space; + pre->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + pre->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + /* * Initialize the dynamic speculative preallocation thresholds. The lo/hi * watermarks correspond to the soft and hard limits by default. If a soft limit @@ -285,22 +330,8 @@ xfs_qm_init_dquot_blk( void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { - uint64_t space; - - dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; - dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; - if (!dqp->q_prealloc_lo_wmark) { - dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; - do_div(dqp->q_prealloc_lo_wmark, 100); - dqp->q_prealloc_lo_wmark *= 95; - } - - space = dqp->q_prealloc_hi_wmark; - - do_div(space, 100); - dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; - dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; - dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; + xfs_dquot_set_prealloc(&dqp->q_blk_prealloc, &dqp->q_blk); + xfs_dquot_set_prealloc(&dqp->q_rtb_prealloc, &dqp->q_rtb); } /* @@ -983,6 +1014,7 @@ xfs_qm_dqget_inode( xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); + ASSERT(!xfs_is_metadir_inode(ip)); id = xfs_qm_id_for_quotatype(ip, type); @@ -1136,9 +1168,11 @@ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { - struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; - struct xfs_dquot *dqp = qip->qli_dquot; + struct xfs_dq_logitem *qlip = + container_of(lip, struct xfs_dq_logitem, qli_item); + struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + struct xfs_buf *bp = NULL; xfs_lsn_t tail_lsn; /* @@ -1150,12 +1184,12 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && - ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); xfs_clear_li_failed(lip); - if (lip->li_lsn == qip->qli_flush_lsn) { + if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); @@ -1165,6 +1199,20 @@ xfs_qm_dqflush_done( } /* + * If this dquot hasn't been dirtied since initiating the last dqflush, + * release the buffer reference. 
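The xfs_dquot_set_prealloc() helper introduced above derives the preallocation watermarks per resource: the high watermark is the hard limit, the low watermark defaults to 95% of it when no soft limit is set, and 1%/3%/5% low-space thresholds are precomputed from the hard limit. The same arithmetic as a standalone sketch with invented names:

#include <stdint.h>
#include <stdio.h>

struct sim_prealloc {
	uint64_t	lo_wmark;
	uint64_t	hi_wmark;
	uint64_t	low_space_1pct;
	uint64_t	low_space_3pct;
	uint64_t	low_space_5pct;
};

static void sim_set_prealloc(struct sim_prealloc *pre,
			     uint64_t hardlimit, uint64_t softlimit)
{
	uint64_t space = hardlimit / 100;

	pre->hi_wmark = hardlimit;
	pre->lo_wmark = softlimit ? softlimit : space * 95;
	pre->low_space_1pct = space;
	pre->low_space_3pct = space * 3;
	pre->low_space_5pct = space * 5;
}

int main(void)
{
	struct sim_prealloc pre;

	sim_set_prealloc(&pre, 1000000, 0);
	printf("hi %llu lo %llu 1%% %llu\n",
	       (unsigned long long)pre.hi_wmark,
	       (unsigned long long)pre.lo_wmark,
	       (unsigned long long)pre.low_space_1pct);
	return 0;
}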
We already unlinked this dquot item + * from the buffer. + */ + spin_lock(&qlip->qli_lock); + if (!qlip->qli_dirty) { + bp = lip->li_buf; + lip->li_buf = NULL; + } + spin_unlock(&qlip->qli_lock); + if (bp) + xfs_buf_rele(bp); + + /* * Release the dq's flush lock since we're done with it. */ xfs_dqfunlock(dqp); @@ -1190,7 +1238,7 @@ xfs_buf_dquot_io_fail( spin_lock(&bp->b_mount->m_ail->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_set_li_failed(lip, bp); + set_bit(XFS_LI_FAILED, &lip->li_flags); spin_unlock(&bp->b_mount->m_ail->ail_lock); } @@ -1233,6 +1281,111 @@ xfs_qm_dqflush_check( } /* + * Get the buffer containing the on-disk dquot. + * + * Requires dquot flush lock, will clear the dirty flag, delete the quota log + * item from the AIL, and shut down the system if something goes wrong. + */ +static int +xfs_dquot_read_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp = NULL; + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, + &bp, &xfs_dquot_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); + if (error) + goto out_abort; + + *bpp = bp; + return 0; + +out_abort: + dqp->q_flags &= ~XFS_DQFLAG_DIRTY; + xfs_trans_ail_delete(&dqp->q_logitem.qli_item, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + +/* + * Attach a dquot buffer to this dquot to avoid allocating a buffer during a + * dqflush, since dqflush can be called from reclaim context. + */ +int +xfs_dquot_attach_buf( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; + int error; + + spin_lock(&qlip->qli_lock); + if (!lip->li_buf) { + struct xfs_buf *bp = NULL; + + spin_unlock(&qlip->qli_lock); + error = xfs_dquot_read_buf(tp, dqp, &bp); + if (error) + return error; + + /* + * Attach the dquot to the buffer so that the AIL does not have + * to read the dquot buffer to push this item. + */ + xfs_buf_hold(bp); + spin_lock(&qlip->qli_lock); + lip->li_buf = bp; + xfs_trans_brelse(tp, bp); + } + qlip->qli_dirty = true; + spin_unlock(&qlip->qli_lock); + + return 0; +} + +/* + * Get a new reference the dquot buffer attached to this dquot for a dqflush + * operation. + * + * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked + * bp; or -EAGAIN if the buffer could not be locked. + */ +int +xfs_dquot_use_attached_buf( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf; + + /* + * A NULL buffer can happen if the dquot dirty flag was set but the + * filesystem shut down before transaction commit happened. In that + * case we're not going to flush anyway. + */ + if (!bp) { + ASSERT(xfs_is_shutdown(dqp->q_mount)); + + *bpp = NULL; + return 0; + } + + if (!xfs_buf_trylock(bp)) + return -EAGAIN; + + xfs_buf_hold(bp); + *bpp = bp; + return 0; +} + +/* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. 
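The qli_dirty flag threaded through these hunks records whether the dquot was re-dirtied after a flush was started; flush completion only drops the buffer reference pinned at attach time when the flag is still clear. A toy single-mutex model of that protocol (the real code uses a spinlock plus the dquot and buffer locks, so this is only the shape of the idea):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct sim_qitem {
	pthread_mutex_t	lock;
	bool		dirty;		/* dirtied since the last flush? */
	int		buf_refs;	/* reference pinned while attached */
};

static void sim_mark_dirty(struct sim_qitem *qi)	/* precommit/attach */
{
	pthread_mutex_lock(&qi->lock);
	if (qi->buf_refs == 0)
		qi->buf_refs = 1;	/* first attach pins the buffer */
	qi->dirty = true;
	pthread_mutex_unlock(&qi->lock);
}

static void sim_flush_start(struct sim_qitem *qi)	/* flush begins */
{
	pthread_mutex_lock(&qi->lock);
	qi->dirty = false;		/* writes after this re-dirty it */
	pthread_mutex_unlock(&qi->lock);
}

static void sim_flush_done(struct sim_qitem *qi)	/* I/O completion */
{
	pthread_mutex_lock(&qi->lock);
	if (!qi->dirty && qi->buf_refs > 0)
		qi->buf_refs--;		/* safe to drop the pinned reference */
	pthread_mutex_unlock(&qi->lock);
}

int main(void)
{
	struct sim_qitem qi = { .lock = PTHREAD_MUTEX_INITIALIZER };

	sim_mark_dirty(&qi);
	sim_flush_start(&qi);
	sim_mark_dirty(&qi);	/* re-dirtied before I/O completed */
	sim_flush_done(&qi);
	printf("buffer still pinned: %s\n", qi.buf_refs ? "yes" : "no");
	return 0;
}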
* The flush lock will not be unlocked until the dquot reaches the disk, @@ -1243,11 +1396,11 @@ xfs_qm_dqflush_check( int xfs_qm_dqflush( struct xfs_dquot *dqp, - struct xfs_buf **bpp) + struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - struct xfs_buf *bp; + struct xfs_dq_logitem *qlip = &dqp->q_logitem; + struct xfs_log_item *lip = &qlip->qli_item; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; @@ -1257,28 +1410,12 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - *bpp = NULL; - xfs_qm_dqunpin_wait(dqp); - /* - * Get the buffer containing the on-disk dquot - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, - &bp, &xfs_dquot_buf_ops); - if (error == -EAGAIN) - goto out_unlock; - if (xfs_metadata_is_sick(error)) - xfs_dquot_mark_sick(dqp); - if (error) - goto out_abort; - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); - xfs_buf_relse(bp); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; @@ -1293,8 +1430,15 @@ xfs_qm_dqflush( */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; - xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, - &dqp->q_logitem.qli_item.li_lsn); + /* + * We hold the dquot lock, so nobody can dirty it while we're + * scheduling the write out. Clear the dirty-since-flush flag. + */ + spin_lock(&qlip->qli_lock); + qlip->qli_dirty = false; + spin_unlock(&qlip->qli_lock); + + xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory @@ -1306,7 +1450,7 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { - dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + dqblk->dd_lsn = cpu_to_be64(lip->li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -1316,7 +1460,7 @@ xfs_qm_dqflush( * the AIL and release the flush lock once the dquot is synced to disk. 
*/ bp->b_flags |= _XBF_DQUOTS; - list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); + list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't @@ -1328,14 +1472,12 @@ xfs_qm_dqflush( } trace_xfs_dqflush_done(dqp); - *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -out_unlock: xfs_dqfunlock(dqp); return error; } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 677bb2dc9ac9..c617bac75361 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -56,6 +56,12 @@ xfs_dquot_res_over_limits( return false; } +struct xfs_dquot_pre { + xfs_qcnt_t q_prealloc_lo_wmark; + xfs_qcnt_t q_prealloc_hi_wmark; + int64_t q_low_space[XFS_QLOWSP_MAX]; +}; + /* * The incore dquot structure */ @@ -76,9 +82,9 @@ struct xfs_dquot { struct xfs_dq_logitem q_logitem; - xfs_qcnt_t q_prealloc_lo_wmark; - xfs_qcnt_t q_prealloc_hi_wmark; - int64_t q_low_space[XFS_QLOWSP_MAX]; + struct xfs_dquot_pre q_blk_prealloc; + struct xfs_dquot_pre q_rtb_prealloc; + struct mutex q_qlock; struct completion q_flush; atomic_t q_pincount; @@ -192,7 +198,11 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) int64_t freesp; freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved; - if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) + if (freesp < dqp->q_blk_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) + return true; + + freesp = dqp->q_rtb.hardlimit - dqp->q_rtb.reserved; + if (freesp < dqp->q_rtb_prealloc.q_low_space[XFS_QLOWSP_1_PCNT]) return true; return false; @@ -204,7 +214,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp); #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY) void xfs_qm_dqdestroy(struct xfs_dquot *dqp); -int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); +int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_dquot *d); @@ -227,6 +237,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q); void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); +int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp); +int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp); +void xfs_dquot_detach_buf(struct xfs_dquot *dqp); + static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { xfs_dqlock(dqp); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 7d19091215b0..271b195ebb93 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push( __releases(&lip->li_ailp->ail_lock) __acquires(&lip->li_ailp->ail_lock) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = lip->li_buf; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; uint rval = XFS_ITEM_SUCCESS; int error; @@ -155,14 +156,25 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + rval = XFS_ITEM_LOCKED; + goto out_relock_ail; + } + + /* + * dqflush completes dqflock on error, and the delwri ioend does it on + * success. 
+ */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; - xfs_buf_relse(bp); - } else if (error == -EAGAIN) - rval = XFS_ITEM_LOCKED; + } + xfs_buf_relse(bp); +out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); @@ -195,12 +207,10 @@ xfs_qm_dquot_logitem_committing( } #ifdef DEBUG_EXPENSIVE -static int -xfs_qm_dquot_logitem_precommit( - struct xfs_trans *tp, - struct xfs_log_item *lip) +static void +xfs_qm_dquot_logitem_precommit_check( + struct xfs_dquot *dqp) { - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_mount *mp = dqp->q_mount; struct xfs_disk_dquot ddq = { }; xfs_failaddr_t fa; @@ -216,13 +226,24 @@ xfs_qm_dquot_logitem_precommit( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ASSERT(fa == NULL); } - - return 0; } #else -# define xfs_qm_dquot_logitem_precommit NULL +# define xfs_qm_dquot_logitem_precommit_check(...) ((void)0) #endif +static int +xfs_qm_dquot_logitem_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + + xfs_qm_dquot_logitem_precommit_check(dqp); + + return xfs_dquot_attach_buf(tp, dqp); +} + static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_precommit = xfs_qm_dquot_logitem_precommit, @@ -247,5 +268,7 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); + spin_lock_init(&lp->qli_lock); lp->qli_dquot = dqp; + lp->qli_dirty = false; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 794710c24474..d66e52807d76 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -14,6 +14,13 @@ struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ + + /* + * We use this spinlock to coordinate access to the li_buf pointer in + * the log item and the qli_dirty flag. + */ + spinlock_t qli_lock; + bool qli_dirty; /* dirtied since last flush? */ }; void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c index 7bdb9688c0f5..5ede81fadbd8 100644 --- a/fs/xfs/xfs_drain.c +++ b/fs/xfs/xfs_drain.c @@ -94,55 +94,39 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) } /* - * Get a passive reference to the AG that contains a fsbno and declare an intent - * to update its metadata. + * Get a passive reference to the group that contains a fsbno and declare an + * intent to update its metadata. + * + * Other threads that need exclusive access can decide to back off if they see + * declared intentions. */ -struct xfs_perag * -xfs_perag_intent_get( +struct xfs_group * +xfs_group_intent_get( struct xfs_mount *mp, - xfs_fsblock_t fsbno) + xfs_fsblock_t fsbno, + enum xfs_group_type type) { - struct xfs_perag *pag; + struct xfs_group *xg; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno)); - if (!pag) + xg = xfs_group_get_by_fsb(mp, fsbno, type); + if (!xg) return NULL; - - xfs_perag_intent_hold(pag); - return pag; -} - -/* - * Release our intent to update this AG's metadata, and then release our - * passive ref to the AG. 
- */ -void -xfs_perag_intent_put( - struct xfs_perag *pag) -{ - xfs_perag_intent_rele(pag); - xfs_perag_put(pag); + trace_xfs_group_intent_hold(xg, __return_address); + xfs_defer_drain_grab(&xg->xg_intents_drain); + return xg; } /* - * Declare an intent to update AG metadata. Other threads that need exclusive - * access can decide to back off if they see declared intentions. + * Release our intent to update this groups metadata, and then release our + * passive ref to it. */ void -xfs_perag_intent_hold( - struct xfs_perag *pag) +xfs_group_intent_put( + struct xfs_group *xg) { - trace_xfs_perag_intent_hold(pag, __return_address); - xfs_defer_drain_grab(&pag->pag_intents_drain); -} - -/* Release our intent to update this AG's metadata. */ -void -xfs_perag_intent_rele( - struct xfs_perag *pag) -{ - trace_xfs_perag_intent_rele(pag, __return_address); - xfs_defer_drain_rele(&pag->pag_intents_drain); + trace_xfs_group_intent_rele(xg, __return_address); + xfs_defer_drain_rele(&xg->xg_intents_drain); + xfs_group_put(xg); } /* @@ -150,17 +134,19 @@ xfs_perag_intent_rele( * Callers must not hold any AG header buffers. */ int -xfs_perag_intent_drain( - struct xfs_perag *pag) +xfs_group_intent_drain( + struct xfs_group *xg) { - trace_xfs_perag_wait_intents(pag, __return_address); - return xfs_defer_drain_wait(&pag->pag_intents_drain); + trace_xfs_group_wait_intents(xg, __return_address); + return xfs_defer_drain_wait(&xg->xg_intents_drain); } -/* Has anyone declared an intent to update this AG? */ +/* + * Has anyone declared an intent to update this group? + */ bool -xfs_perag_intent_busy( - struct xfs_perag *pag) +xfs_group_intent_busy( + struct xfs_group *xg) { - return xfs_defer_drain_busy(&pag->pag_intents_drain); + return xfs_defer_drain_busy(&xg->xg_intents_drain); } diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h index 775164f54ea6..efcf88df9a5e 100644 --- a/fs/xfs/xfs_drain.h +++ b/fs/xfs/xfs_drain.h @@ -6,6 +6,7 @@ #ifndef XFS_DRAIN_H_ #define XFS_DRAIN_H_ +struct xfs_group; struct xfs_perag; #ifdef CONFIG_XFS_DRAIN_INTENTS @@ -61,27 +62,22 @@ void xfs_drain_wait_enable(void); * soon as the item is added to the transaction and cannot drop the counter * until the item is finished or cancelled. 
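The drain comment above describes the mechanism: each deferred intent holds a counter on the group, and exclusive users wait for it to drain to zero. A pthread analogue of the grab/release/wait trio (this is a user-space sketch, not the kernel waitqueue code):

#include <pthread.h>
#include <stdio.h>

struct sim_drain {
	pthread_mutex_t	lock;
	pthread_cond_t	cond;
	unsigned int	count;
};

static void sim_drain_grab(struct sim_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	dr->count++;
	pthread_mutex_unlock(&dr->lock);
}

static void sim_drain_rele(struct sim_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	if (--dr->count == 0)
		pthread_cond_broadcast(&dr->cond);
	pthread_mutex_unlock(&dr->lock);
}

static void sim_drain_wait(struct sim_drain *dr)
{
	pthread_mutex_lock(&dr->lock);
	while (dr->count != 0)
		pthread_cond_wait(&dr->cond, &dr->lock);
	pthread_mutex_unlock(&dr->lock);
}

int main(void)
{
	struct sim_drain dr = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};

	sim_drain_grab(&dr);	/* deferred intent added to a transaction */
	sim_drain_rele(&dr);	/* intent finished or cancelled */
	sim_drain_wait(&dr);	/* exclusive caller sees zero and proceeds */
	printf("drained\n");
	return 0;
}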
*/ -struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, - xfs_fsblock_t fsbno); -void xfs_perag_intent_put(struct xfs_perag *pag); +struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp, + xfs_fsblock_t fsbno, enum xfs_group_type type); +void xfs_group_intent_put(struct xfs_group *rtg); -void xfs_perag_intent_hold(struct xfs_perag *pag); -void xfs_perag_intent_rele(struct xfs_perag *pag); +int xfs_group_intent_drain(struct xfs_group *xg); +bool xfs_group_intent_busy(struct xfs_group *xg); -int xfs_perag_intent_drain(struct xfs_perag *pag); -bool xfs_perag_intent_busy(struct xfs_perag *pag); #else struct xfs_defer_drain { /* empty */ }; #define xfs_defer_drain_free(dr) ((void)0) #define xfs_defer_drain_init(dr) ((void)0) -#define xfs_perag_intent_get(mp, fsbno) \ - xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno)) -#define xfs_perag_intent_put(pag) xfs_perag_put(pag) - -static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } -static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } +#define xfs_group_intent_get(_mp, _fsbno, _type) \ + xfs_group_get_by_fsb((_mp), (_fsbno), (_type)) +#define xfs_group_intent_put(xg) xfs_group_put(xg) #endif /* CONFIG_XFS_DRAIN_INTENTS */ diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index 75cb53f090d1..265c42449893 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -217,7 +217,7 @@ xfs_exchrange_mappings( * length in @fxr are safe to round up. */ if (xfs_inode_has_bigrtalloc(ip2)) - req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount); + req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount); error = xfs_exchrange_estimate(&req); if (error) @@ -813,8 +813,6 @@ xfs_ioc_exchange_range( .file2 = file, }; struct xfs_exchange_range args; - struct fd file1; - int error; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; @@ -828,14 +826,12 @@ xfs_ioc_exchange_range( fxr.length = args.length; fxr.flags = args.flags; - file1 = fdget(args.file1_fd); - if (!fd_file(file1)) + CLASS(fd, file1)(args.file1_fd); + if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ @@ -858,7 +854,7 @@ xfs_ioc_start_commit( struct xfs_commit_range __user *argp) { struct xfs_commit_range args = { }; - struct timespec64 ts; + struct kstat kstat = { }; struct xfs_commit_range_fresh *kern_f; struct xfs_commit_range_fresh __user *user_f; struct inode *inode2 = file_inode(file); @@ -875,12 +871,12 @@ xfs_ioc_start_commit( memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); xfs_ilock(ip2, lockflags); - ts = inode_get_ctime(inode2); - kern_f->file2_ctime = ts.tv_sec; - kern_f->file2_ctime_nsec = ts.tv_nsec; - ts = inode_get_mtime(inode2); - kern_f->file2_mtime = ts.tv_sec; - kern_f->file2_mtime_nsec = ts.tv_nsec; + /* Force writing of a distinct ctime if any writes happen. 
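xfs_ioc_start_commit() above snapshots the file's change attributes into the freshness blob so the later commit can be refused if the file changed in between. A minimal sketch of that snapshot-and-compare idea, with invented field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sim_fresh {
	int64_t		ctime_sec;
	int64_t		ctime_nsec;
	uint64_t	ino;
	uint32_t	gen;
};

static bool sim_still_fresh(const struct sim_fresh *snap,
			    const struct sim_fresh *now)
{
	return snap->ino == now->ino && snap->gen == now->gen &&
	       snap->ctime_sec == now->ctime_sec &&
	       snap->ctime_nsec == now->ctime_nsec;
}

int main(void)
{
	struct sim_fresh snap = { 1000, 42, 128, 7 };
	struct sim_fresh later = { 1000, 43, 128, 7 };	/* ctime advanced */

	printf("commit allowed: %s\n",
	       sim_still_fresh(&snap, &later) ? "yes" : "no");
	return 0;
}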
*/ + fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); + kern_f->file2_ctime = kstat.ctime.tv_sec; + kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; + kern_f->file2_mtime = kstat.mtime.tv_sec; + kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; kern_f->file2_ino = ip2->i_ino; kern_f->file2_gen = inode2->i_generation; kern_f->magic = XCR_FRESH_MAGIC; @@ -909,8 +905,6 @@ xfs_ioc_commit_range( struct xfs_commit_range_fresh *kern_f; struct xfs_inode *ip2 = XFS_I(file_inode(file)); struct xfs_mount *mp = ip2->i_mount; - struct fd file1; - int error; kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; @@ -934,12 +928,10 @@ xfs_ioc_commit_range( fxr.file2_ctime.tv_sec = kern_f->file2_ctime; fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; - file1 = fdget(args.file1_fd); + CLASS(fd, file1)(args.file1_fd); if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); - error = xfs_exchange_range(&fxr); - fdput(file1); - return error; + return xfs_exchange_range(&fxr); } diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index a73e7c73b664..ea43c9a6e54c 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -18,15 +18,24 @@ #include "xfs_trans.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_rtgroup.h" + +struct xfs_extent_busy_tree { + spinlock_t eb_lock; + struct rb_root eb_tree; + unsigned int eb_gen; + wait_queue_head_t eb_wait; +}; static void xfs_extent_busy_insert_list( - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags, struct list_head *busy_list) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; struct rb_node **rbp; @@ -34,17 +43,17 @@ xfs_extent_busy_insert_list( new = kzalloc(sizeof(struct xfs_extent_busy), GFP_KERNEL | __GFP_NOFAIL); - new->agno = pag->pag_agno; + new->group = xfs_group_hold(xg); new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); + trace_xfs_extent_busy(xg, bno, len); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; + spin_lock(&eb->eb_lock); + rbp = &eb->eb_tree.rb_node; while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); @@ -61,32 +70,32 @@ xfs_extent_busy_insert_list( } rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); + rb_insert_color(&new->rb_node, &eb->eb_tree); /* always process discard lists in fifo order */ list_add_tail(&new->list, busy_list); - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); } void xfs_extent_busy_insert( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { - xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); + xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy); } void xfs_extent_busy_insert_discard( - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len, struct list_head *busy_list) { - xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED, busy_list); } @@ -101,18 +110,18 @@ xfs_extent_busy_insert_discard( */ int xfs_extent_busy_search( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t bno, xfs_extlen_t len) { + struct xfs_extent_busy_tree *eb = 
xg->xg_busy_extents; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; /* find closest start bno overlap */ - spin_lock(&pag->pagb_lock); - rbp = pag->pagb_tree.rb_node; + spin_lock(&eb->eb_lock); + rbp = eb->eb_tree.rb_node; while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { @@ -131,7 +140,7 @@ xfs_extent_busy_search( break; } } - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); return match; } @@ -148,14 +157,15 @@ xfs_extent_busy_search( */ STATIC bool xfs_extent_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, - bool userdata) __releases(&pag->pagb_lock) - __acquires(&pag->pagb_lock) + bool userdata) + __releases(&eb->eb_lock) + __acquires(&eb->eb_lock) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; @@ -166,9 +176,9 @@ xfs_extent_busy_update_extent( * and retry. */ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); delay(1); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); return false; } @@ -241,7 +251,7 @@ xfs_extent_busy_update_extent( * tree root, because erasing the node can rearrange the * tree topology. */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); + rb_erase(&busyp->rb_node, &eb->eb_tree); busyp->length = 0; return false; } else if (fend < bend) { @@ -280,35 +290,34 @@ xfs_extent_busy_update_extent( ASSERT(0); } - trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + trace_xfs_extent_busy_reuse(xg, fbno, flen); return true; out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); + xfs_log_force(xg->xg_mount, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(xg, fbno, flen); + spin_lock(&eb->eb_lock); return false; } - /* * For a given extent [fbno, flen], make sure we can reuse it safely. */ void xfs_extent_busy_reuse( - struct xfs_mount *mp, - struct xfs_perag *pag, + struct xfs_group *xg, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; struct rb_node *rbp; ASSERT(flen > 0); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); restart: - rbp = pag->pagb_tree.rb_node; + rbp = eb->eb_tree.rb_node; while (rbp) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); @@ -323,11 +332,11 @@ restart: continue; } - if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen, userdata)) goto restart; } - spin_unlock(&pag->pagb_lock); + spin_unlock(&eb->eb_lock); } /* @@ -336,7 +345,7 @@ restart: * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. * - * Return the current busy generation for the AG if the extent is busy. This + * Return the current busy generation for the group if the extent is busy. This * value can be used to wait for at least one of the currently busy extents * to be cleared. Note that the busy list is not guaranteed to be empty after * the gen is woken. 
The state of a specific extent must always be confirmed @@ -344,11 +353,14 @@ restart: */ bool xfs_extent_busy_trim( - struct xfs_alloc_arg *args, + struct xfs_group *xg, + xfs_extlen_t minlen, + xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; @@ -356,11 +368,11 @@ xfs_extent_busy_trim( ASSERT(*len > 0); - spin_lock(&args->pag->pagb_lock); + spin_lock(&eb->eb_lock); fbno = *bno; flen = *len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { + rbp = eb->eb_tree.rb_node; + while (rbp && flen >= minlen) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t fend = fbno + flen; @@ -481,13 +493,13 @@ xfs_extent_busy_trim( * good chance subsequent allocations will be * contiguous. */ - if (bbno - fbno >= args->maxlen) { + if (bbno - fbno >= maxlen) { /* left candidate fits perfect */ fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { + } else if (fend - bend >= maxlen * 4) { /* right candidate has enough free space */ fbno = bend; - } else if (bbno - fbno >= args->minlen) { + } else if (bbno - fbno >= minlen) { /* left candidate fits minimum requirement */ fend = bbno; } else { @@ -500,14 +512,13 @@ xfs_extent_busy_trim( out: if (fbno != *bno || flen != *len) { - trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, - fbno, flen); + trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen); *bno = fbno; *len = flen; - *busy_gen = args->pag->pagb_gen; + *busy_gen = eb->eb_gen; ret = true; } - spin_unlock(&args->pag->pagb_lock); + spin_unlock(&eb->eb_lock); return ret; fail: /* @@ -520,22 +531,24 @@ fail: static bool xfs_extent_busy_clear_one( - struct xfs_perag *pag, struct xfs_extent_busy *busyp, bool do_discard) { + struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents; + if (busyp->length) { if (do_discard && !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; return false; } - trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno, - busyp->bno, busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); + trace_xfs_extent_busy_clear(busyp->group, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &eb->eb_tree); } list_del_init(&busyp->list); + xfs_group_put(busyp->group); kfree(busyp); return true; } @@ -547,7 +560,6 @@ xfs_extent_busy_clear_one( */ void xfs_extent_busy_clear( - struct xfs_mount *mp, struct list_head *list, bool do_discard) { @@ -558,30 +570,30 @@ xfs_extent_busy_clear( return; do { + struct xfs_group *xg = xfs_group_hold(busyp->group); + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool wakeup = false; - struct xfs_perag *pag; - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); do { next = list_next_entry(busyp, list); - if (xfs_extent_busy_clear_one(pag, busyp, do_discard)) + if (xfs_extent_busy_clear_one(busyp, do_discard)) wakeup = true; busyp = next; } while (!list_entry_is_head(busyp, list, list) && - busyp->agno == pag->pag_agno); + busyp->group == xg); if (wakeup) { - pag->pagb_gen++; - wake_up_all(&pag->pagb_wait); + eb->eb_gen++; + wake_up_all(&eb->eb_wait); } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); + spin_unlock(&eb->eb_lock); + xfs_group_put(xg); } while (!list_entry_is_head(busyp, list, list)); } /* - * Flush out all busy extents for this AG. + * Flush out all busy extents for this group. 
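xfs_extent_busy_flush() below does not wait for the tree to empty; it waits until the busy generation moves past the caller's snapshot, that is, until at least one busy extent has been cleared since the snapshot was taken. A pthread sketch of that generation-based wait (names invented, a condition variable stands in for the waitqueue):

#include <pthread.h>
#include <stdio.h>

struct sim_busy_tree {
	pthread_mutex_t	lock;
	pthread_cond_t	wait;
	unsigned int	gen;
};

static void sim_busy_cleared(struct sim_busy_tree *eb)
{
	pthread_mutex_lock(&eb->lock);
	eb->gen++;			/* some busy extent went away */
	pthread_cond_broadcast(&eb->wait);
	pthread_mutex_unlock(&eb->lock);
}

static void sim_busy_flush(struct sim_busy_tree *eb, unsigned int busy_gen)
{
	pthread_mutex_lock(&eb->lock);
	while (eb->gen == busy_gen)	/* nothing cleared yet, keep waiting */
		pthread_cond_wait(&eb->wait, &eb->lock);
	pthread_mutex_unlock(&eb->lock);
}

int main(void)
{
	struct sim_busy_tree eb = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};
	unsigned int snap = eb.gen;

	sim_busy_cleared(&eb);		/* log completion clears an extent */
	sim_busy_flush(&eb, snap);	/* returns immediately: gen moved on */
	printf("flush done\n");
	return 0;
}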
* * If the current transaction is holding busy extents, the caller may not want * to wait for committed busy extents to resolve. If we are being told just to @@ -597,10 +609,11 @@ xfs_extent_busy_clear( int xfs_extent_busy_flush( struct xfs_trans *tp, - struct xfs_perag *pag, + struct xfs_group *xg, unsigned busy_gen, uint32_t alloc_flags) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; DEFINE_WAIT (wait); int error; @@ -613,7 +626,7 @@ xfs_extent_busy_flush( if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) return 0; - if (busy_gen != READ_ONCE(pag->pagb_gen)) + if (busy_gen != READ_ONCE(eb->eb_gen)) return 0; if (alloc_flags & XFS_ALLOC_FLAG_FREEING) @@ -622,37 +635,49 @@ xfs_extent_busy_flush( /* Wait for committed busy extents to resolve. */ do { - prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); - if (busy_gen != READ_ONCE(pag->pagb_gen)) + prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); + if (busy_gen != READ_ONCE(eb->eb_gen)) break; schedule(); } while (1); - finish_wait(&pag->pagb_wait, &wait); + finish_wait(&eb->eb_wait, &wait); return 0; } +static void +xfs_extent_busy_wait_group( + struct xfs_group *xg) +{ + DEFINE_WAIT (wait); + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; + + do { + prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE); + if (RB_EMPTY_ROOT(&eb->eb_tree)) + break; + schedule(); + } while (1); + finish_wait(&eb->eb_wait, &wait); +} + void xfs_extent_busy_wait_all( struct xfs_mount *mp) { - struct xfs_perag *pag; - DEFINE_WAIT (wait); - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; - for_each_perag(mp, agno, pag) { - do { - prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); - if (RB_EMPTY_ROOT(&pag->pagb_tree)) - break; - schedule(); - } while (1); - finish_wait(&pag->pagb_wait, &wait); - } + while ((pag = xfs_perag_next(mp, pag))) + xfs_extent_busy_wait_group(pag_group(pag)); + + if (xfs_has_rtgroups(mp)) + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_extent_busy_wait_group(rtg_group(rtg)); } /* - * Callback for list_sort to sort busy extents by the AG they reside in. + * Callback for list_sort to sort busy extents by the group they reside in. */ int xfs_extent_busy_ag_cmp( @@ -666,21 +691,38 @@ xfs_extent_busy_ag_cmp( container_of(l2, struct xfs_extent_busy, list); s32 diff; - diff = b1->agno - b2->agno; + diff = b1->group->xg_gno - b2->group->xg_gno; if (!diff) diff = b1->bno - b2->bno; return diff; } -/* Are there any busy extents in this AG? */ +/* Are there any busy extents in this group? 
*/ bool xfs_extent_busy_list_empty( - struct xfs_perag *pag) + struct xfs_group *xg, + unsigned *busy_gen) { + struct xfs_extent_busy_tree *eb = xg->xg_busy_extents; bool res; - spin_lock(&pag->pagb_lock); - res = RB_EMPTY_ROOT(&pag->pagb_tree); - spin_unlock(&pag->pagb_lock); + spin_lock(&eb->eb_lock); + res = RB_EMPTY_ROOT(&eb->eb_tree); + *busy_gen = READ_ONCE(eb->eb_gen); + spin_unlock(&eb->eb_lock); return res; } + +struct xfs_extent_busy_tree * +xfs_extent_busy_alloc(void) +{ + struct xfs_extent_busy_tree *eb; + + eb = kzalloc(sizeof(*eb), GFP_KERNEL); + if (!eb) + return NULL; + spin_lock_init(&eb->eb_lock); + init_waitqueue_head(&eb->eb_wait); + eb->eb_tree = RB_ROOT; + return eb; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 470032de3139..f069b04e8ea1 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -8,19 +8,18 @@ #ifndef __XFS_EXTENT_BUSY_H__ #define __XFS_EXTENT_BUSY_H__ +struct xfs_group; struct xfs_mount; -struct xfs_perag; struct xfs_trans; -struct xfs_alloc_arg; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in the group to mark blocks + * that have been freed but whose transactions aren't committed to disk yet. */ struct xfs_extent_busy { - struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct rb_node rb_node; /* group by-bno indexed search tree */ struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; + struct xfs_group *group; xfs_agblock_t bno; xfs_extlen_t length; unsigned int flags; @@ -33,7 +32,6 @@ struct xfs_extent_busy { * to discard completion. */ struct xfs_busy_extents { - struct xfs_mount *mount; struct list_head extent_list; struct work_struct endio_work; @@ -45,46 +43,29 @@ struct xfs_busy_extents { void *owner; }; -void -xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, - xfs_extlen_t len, struct list_head *busy_list); - -void -xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -bool -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, - xfs_extlen_t *len, unsigned *busy_gen); - -int -xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, +void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_group *xg, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); +void xfs_extent_busy_insert_discard(struct xfs_group *xg, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); +void xfs_extent_busy_clear(struct list_head *list, bool do_discard); +int xfs_extent_busy_search(struct xfs_group *xg, xfs_agblock_t bno, + xfs_extlen_t len); +void xfs_extent_busy_reuse(struct xfs_group *xg, xfs_agblock_t fbno, + xfs_extlen_t flen, bool userdata); +bool xfs_extent_busy_trim(struct xfs_group *xg, xfs_extlen_t minlen, + xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len, + unsigned *busy_gen); +int xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_group *xg, unsigned busy_gen, uint32_t alloc_flags); +void 
xfs_extent_busy_wait_all(struct xfs_mount *mp); +bool xfs_extent_busy_list_empty(struct xfs_group *xg, unsigned int *busy_gen); +struct xfs_extent_busy_tree *xfs_extent_busy_alloc(void); -void -xfs_extent_busy_wait_all(struct xfs_mount *mp); - -int -xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a, - const struct list_head *b); - +int xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a, + const struct list_head *b); static inline void xfs_extent_busy_sort(struct list_head *list) { list_sort(NULL, list, xfs_extent_busy_ag_cmp); } -bool xfs_extent_busy_list_empty(struct xfs_perag *pag); - #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index abffc74a924f..a25c713ff888 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -25,6 +25,10 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -95,16 +99,15 @@ xfs_efi_item_format( ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); + ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT); - efip->efi_format.efi_type = XFS_LI_EFI; + efip->efi_format.efi_type = lip->li_type; efip->efi_format.efi_size = 1; - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, - &efip->efi_format, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format, xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents)); } - /* * The unpin operation is the last place an EFI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In @@ -140,12 +143,14 @@ xfs_efi_item_release( STATIC struct xfs_efi_log_item * xfs_efi_init( struct xfs_mount *mp, + unsigned short item_type, uint nextents) - { struct xfs_efi_log_item *efip; + ASSERT(item_type == XFS_LI_EFI || item_type == XFS_LI_EFI_RT); ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { efip = kzalloc(xfs_efi_log_item_sizeof(nextents), GFP_KERNEL | __GFP_NOFAIL); @@ -154,7 +159,7 @@ xfs_efi_init( GFP_KERNEL | __GFP_NOFAIL); } - xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + xfs_log_item_init(mp, &efip->efi_item, item_type, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); @@ -264,12 +269,12 @@ xfs_efd_item_format( struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + ASSERT(lip->li_type == XFS_LI_EFD || lip->li_type == XFS_LI_EFD_RT); - efdp->efd_format.efd_type = XFS_LI_EFD; + efdp->efd_format.efd_type = lip->li_type; efdp->efd_format.efd_size = 1; - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, - &efdp->efd_format, + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format, xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents)); } @@ -308,6 +313,14 @@ static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e) return list_entry(e, struct xfs_extent_free_item, xefi_list); } +static inline bool +xfs_efi_item_isrt(const struct xfs_log_item *lip) +{ + ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT); + + return lip->li_type == XFS_LI_EFI_RT; +} + /* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. 
@@ -362,7 +375,7 @@ xfs_extent_free_diff_items( struct xfs_extent_free_item *ra = xefi_entry(a); struct xfs_extent_free_item *rb = xefi_entry(b); - return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; + return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno; } /* Log a free extent to the intent item. */ @@ -388,18 +401,20 @@ xfs_extent_free_log_item( } static struct xfs_log_item * -xfs_extent_free_create_intent( +__xfs_extent_free_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, - bool sort) + bool sort, + unsigned short item_type) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_efi_log_item *efip; struct xfs_extent_free_item *xefi; ASSERT(count > 0); + efip = xfs_efi_init(mp, item_type, count); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, items, xefi_list) @@ -407,6 +422,23 @@ xfs_extent_free_create_intent( return &efip->efi_item; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_extent_free_create_intent(tp, items, count, sort, + XFS_LI_EFI); +} + +static inline unsigned short +xfs_efd_type_from_efi(const struct xfs_efi_log_item *efip) +{ + return xfs_efi_item_isrt(&efip->efi_item) ? XFS_LI_EFD_RT : XFS_LI_EFD; +} + /* Get an EFD so we can process all the free extents. */ static struct xfs_log_item * xfs_extent_free_create_done( @@ -427,8 +459,8 @@ xfs_extent_free_create_done( GFP_KERNEL | __GFP_NOFAIL); } - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, + xfs_efd_type_from_efi(efip), &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = count; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; @@ -436,6 +468,17 @@ xfs_extent_free_create_done( return &efdp->efd_item; } +static inline const struct xfs_defer_op_type * +xefi_ops( + struct xfs_extent_free_item *xefi) +{ + if (xfs_efi_is_realtime(xefi)) + return &xfs_rtextent_free_defer_type; + if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) + return &xfs_agfl_free_defer_type; + return &xfs_extent_free_defer_type; +} + /* Add this deferred EFI to the transaction. */ void xfs_extent_free_defer_add( @@ -445,15 +488,11 @@ xfs_extent_free_defer_add( { struct xfs_mount *mp = tp->t_mountp; - trace_xfs_extent_free_defer(mp, xefi); + xefi->xefi_group = xfs_group_intent_get(mp, xefi->xefi_startblock, + xfs_efi_is_realtime(xefi) ? XG_TYPE_RTG : XG_TYPE_AG); - xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock); - if (xefi->xefi_agresv == XFS_AG_RESV_AGFL) - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_agfl_free_defer_type); - else - *dfpp = xfs_defer_add(tp, &xefi->xefi_list, - &xfs_extent_free_defer_type); + trace_xfs_extent_free_defer(mp, xefi); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, xefi_ops(xefi)); } /* Cancel a free extent. */ @@ -463,7 +502,7 @@ xfs_extent_free_cancel_item( { struct xfs_extent_free_item *xefi = xefi_entry(item); - xfs_perag_intent_put(xefi->xefi_pag); + xfs_group_intent_put(xefi->xefi_group); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -499,7 +538,7 @@ xfs_extent_free_finish_item( * in this EFI to the EFD so this works correctly. 
*/ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + error = __xfs_free_extent(tp, to_perag(xefi->xefi_group), agbno, xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); if (error == -EAGAIN) { @@ -545,10 +584,10 @@ xfs_agfl_free_finish_item( trace_xfs_agfl_free_deferred(mp, xefi); - error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); + error = xfs_alloc_read_agf(to_perag(xefi->xefi_group), tp, 0, &agbp); if (!error) - error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno, - agbno, 1, &oinfo, XFS_AG_RESV_AGFL); + error = xfs_free_ag_extent(tp, agbp, agbno, 1, &oinfo, + XFS_AG_RESV_AGFL); xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(&xefi->xefi_list); @@ -559,8 +598,12 @@ xfs_agfl_free_finish_item( static inline bool xfs_efi_validate_ext( struct xfs_mount *mp, + bool isrt, struct xfs_extent *extp) { + if (isrt) + return xfs_verify_rtbext(mp, extp->ext_start, extp->ext_len); + return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } @@ -568,6 +611,7 @@ static inline void xfs_efi_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + bool isrt, struct xfs_extent *extp) { struct xfs_extent_free_item *xefi; @@ -578,7 +622,10 @@ xfs_efi_recover_work( xefi->xefi_blockcount = extp->ext_len; xefi->xefi_agresv = XFS_AG_RESV_NONE; xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; - xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start); + xefi->xefi_group = xfs_group_intent_get(mp, extp->ext_start, + isrt ? XG_TYPE_RTG : XG_TYPE_AG); + if (isrt) + xefi->xefi_flags |= XFS_EFI_REALTIME; xfs_defer_add_item(dfp, &xefi->xefi_list); } @@ -599,14 +646,15 @@ xfs_extent_free_recover_work( struct xfs_trans *tp; int i; int error = 0; + bool isrt = xfs_efi_item_isrt(lip); /* - * First check the validity of the extents described by the - * EFI. If any are bad, then assume that all are bad and - * just toss the EFI. + * First check the validity of the extents described by the EFI. If + * any are bad, then assume that all are bad and just toss the EFI. + * Mixing RT and non-RT extents in the same EFI item is not allowed. 
*/ for (i = 0; i < efip->efi_format.efi_nextents; i++) { - if (!xfs_efi_validate_ext(mp, + if (!xfs_efi_validate_ext(mp, isrt, &efip->efi_format.efi_extents[i])) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &efip->efi_format, @@ -614,7 +662,8 @@ xfs_extent_free_recover_work( return -EFSCORRUPTED; } - xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); + xfs_efi_recover_work(mp, dfp, isrt, + &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -652,10 +701,12 @@ xfs_extent_free_relog_intent( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; + ASSERT(intent->li_type == XFS_LI_EFI || intent->li_type == XFS_LI_EFI_RT); + efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - efip = xfs_efi_init(tp->t_mountp, count); + efip = xfs_efi_init(tp->t_mountp, intent->li_type, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); @@ -687,6 +738,72 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .relog_intent = xfs_extent_free_relog_intent, }; +#ifdef CONFIG_XFS_RT +/* Create a realtime extent freeing */ +static struct xfs_log_item * +xfs_rtextent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return __xfs_extent_free_create_intent(tp, items, count, sort, + XFS_LI_EFI_RT); +} + +/* Process a free realtime extent. */ +STATIC int +xfs_rtextent_free_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent_free_item *xefi = xefi_entry(item); + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_rtgroup **rtgp = (struct xfs_rtgroup **)state; + int error = 0; + + trace_xfs_extent_free_deferred(mp, xefi); + + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { + if (*rtgp != to_rtg(xefi->xefi_group)) { + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(tp, *rtgp, + XFS_RTGLOCK_BITMAP); + } + error = xfs_rtfree_blocks(tp, *rtgp, + xefi->xefi_startblock, xefi->xefi_blockcount); + } + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); + return error; + } + + xfs_efd_add_extent(efdp, xefi); + xfs_extent_free_cancel_item(item); + return error; +} + +const struct xfs_defer_op_type xfs_rtextent_free_defer_type = { + .name = "rtextent_free", + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_rtextent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_rtextent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; +#else +const struct xfs_defer_op_type xfs_rtextent_free_defer_type = { + .name = "rtextent_free", +}; +#endif /* CONFIG_XFS_RT */ + STATIC bool xfs_efi_item_match( struct xfs_log_item *lip, @@ -731,7 +848,7 @@ xlog_recover_efi_commit_pass2( return -EFSCORRUPTED; } - efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents); error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); if (error) { xfs_efi_item_free(efip); @@ -749,6 +866,58 @@ const struct xlog_recover_item_ops xlog_efi_item_ops = { .commit_pass2 = xlog_recover_efi_commit_pass2, }; +#ifdef 
CONFIG_XFS_RT +STATIC int +xlog_recover_rtefi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; + int error; + + efi_formatp = item->ri_buf[0].i_addr; + + if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents); + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + &xfs_rtextent_free_defer_type); + return 0; +} +#else +STATIC int +xlog_recover_rtefi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; +} +#endif + +const struct xlog_recover_item_ops xlog_rtefi_item_ops = { + .item_type = XFS_LI_EFI_RT, + .commit_pass2 = xlog_recover_rtefi_commit_pass2, +}; + /* * This routine is called when an EFD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding EFI if it @@ -791,3 +960,44 @@ const struct xlog_recover_item_ops xlog_efd_item_ops = { .item_type = XFS_LI_EFD, .commit_pass2 = xlog_recover_efd_commit_pass2, }; + +#ifdef CONFIG_XFS_RT +STATIC int +xlog_recover_rtefd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_format *efd_formatp; + int buflen = item->ri_buf[0].i_len; + + efd_formatp = item->ri_buf[0].i_addr; + + if (buflen < sizeof(struct xfs_efd_log_format)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + efd_formatp->efd_nextents) && + item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + efd_formatp->efd_nextents)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, + efd_formatp, buflen); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_EFI_RT, + efd_formatp->efd_efi_id); + return 0; +} +#else +# define xlog_recover_rtefd_commit_pass2 xlog_recover_rtefi_commit_pass2 +#endif + +const struct xlog_recover_item_ops xlog_rtefd_item_ops = { + .item_type = XFS_LI_EFD_RT, + .commit_pass2 = xlog_recover_rtefd_commit_pass2, +}; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 412b1d71b52b..9a435b1ff264 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -348,9 +348,82 @@ xfs_file_splice_read( } /* + * Take care of zeroing post-EOF blocks when they might exist. + * + * Returns 0 if successfully, a negative error for a failure, or 1 if this + * function dropped the iolock and reacquired it exclusively and the caller + * needs to restart the write sanity checks. 
+ */ +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + int error; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. + */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + + return error; +} + +/* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -360,13 +433,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -389,7 +459,7 @@ restart: * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -400,64 +470,24 @@ restart: } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. + * write. * - * We need to serialise against EOF updates that occur in IO completions - * here. 
We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. - * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); } @@ -822,6 +852,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * Currently only atomic writing of a single FS block is + * supported. It would be possible to atomic write smaller than + * a FS block, but there is no requirement to support this. + * Note that iomap also does not support this yet. + */ + if (ocount != ip->i_mount->m_sb.sb_blocksize) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1198,6 +1242,14 @@ out_unlock: xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + /* + * If the caller did not set CAN_SHORTEN, then it is not prepared to + * handle partial results -- either the whole remap succeeds, or we + * must say why it did not. In this case, any error should be returned + * to the caller. + */ + if (ret && remapped < len && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return ret; return remapped > 0 ? 
remapped : ret; } @@ -1209,6 +1261,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_inode_can_atomicwrite(XFS_I(inode))) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } @@ -1395,6 +1449,8 @@ xfs_dax_read_fault( struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); vm_fault_t ret; + trace_xfs_read_fault(ip, order); + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault_locked(vmf, order, false); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); @@ -1402,6 +1458,16 @@ xfs_dax_read_fault( return ret; } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_lock (MM) + * sb_start_pagefault(vfs, freeze) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ static vm_fault_t xfs_write_fault( struct vm_fault *vmf, @@ -1412,6 +1478,8 @@ xfs_write_fault( unsigned int lock_mode = XFS_MMAPLOCK_SHARED; vm_fault_t ret; + trace_xfs_write_fault(ip, order); + sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); @@ -1430,40 +1498,13 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } -/* - * Locking for serialisation of IO during page faults. This results in a lock - * ordering of: - * - * mmap_lock (MM) - * sb_start_pagefault(vfs, freeze) - * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) - */ -static vm_fault_t -__xfs_filemap_fault( - struct vm_fault *vmf, - unsigned int order, - bool write_fault) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - - trace_xfs_filemap_fault(XFS_I(inode), order, write_fault); - - if (write_fault) - return xfs_write_fault(vmf, order); - if (IS_DAX(inode)) - return xfs_dax_read_fault(vmf, order); - return filemap_fault(vmf); -} - static inline bool xfs_is_write_fault( struct vm_fault *vmf) @@ -1476,10 +1517,17 @@ static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) { + struct inode *inode = file_inode(vmf->vma->vm_file); + /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, 0, - IS_DAX(file_inode(vmf->vma->vm_file)) && - xfs_is_write_fault(vmf)); + if (IS_DAX(inode)) { + if (xfs_is_write_fault(vmf)) + return xfs_write_fault(vmf, 0); + return xfs_dax_read_fault(vmf, 0); + } + + trace_xfs_read_fault(XFS_I(inode), 0); + return filemap_fault(vmf); } static vm_fault_t @@ -1491,15 +1539,16 @@ xfs_filemap_huge_fault( return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! 
*/ - return __xfs_filemap_fault(vmf, order, - xfs_is_write_fault(vmf)); + if (xfs_is_write_fault(vmf)) + return xfs_write_fault(vmf, order); + return xfs_dax_read_fault(vmf, order); } static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, 0, true); + return xfs_write_fault(vmf, 0); } /* @@ -1511,8 +1560,7 @@ static vm_fault_t xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - - return __xfs_filemap_fault(vmf, 0, true); + return xfs_write_fault(vmf, 0); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e3aaa0555597..a961aa420c48 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -64,25 +64,31 @@ xfs_filestream_pick_ag( struct xfs_perag *pag; struct xfs_perag *max_pag = NULL; xfs_extlen_t minlen = *longest; - xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_extlen_t minfree, maxfree = 0; xfs_agnumber_t agno; bool first_pass = true; - int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; restart: for_each_perag_wrap(mp, start_agno, agno, pag) { + int err; + trace_xfs_filestream_scan(pag, pino); + *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - if (err != -EAGAIN) - break; - /* Couldn't lock the AGF, skip this AG. */ - err = 0; - continue; + if (err == -EAGAIN) { + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; + } + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; } /* Keep track of the AG with the most free blocks. */ @@ -90,7 +96,7 @@ restart: maxfree = pag->pagf_freeblks; if (max_pag) xfs_perag_rele(max_pag); - atomic_inc(&pag->pag_active_ref); + atomic_inc(&pag_group(pag)->xg_active_ref); max_pag = pag; } @@ -107,8 +113,9 @@ restart: !(flags & XFS_PICK_USERDATA) || (flags & XFS_PICK_LOWSPACE))) { /* Break out, retaining the reference on the AG. */ - free = pag->pagf_freeblks; - break; + if (max_pag) + xfs_perag_rele(max_pag); + goto done; } } @@ -116,57 +123,47 @@ restart: atomic_dec(&pag->pagf_fstrms); } - if (err) { - xfs_perag_rele(pag); - if (max_pag) - xfs_perag_rele(max_pag); - return err; + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() another + * attempt at locking AGFs that it might have skipped over before we + * fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - if (!pag) { - /* - * Allow a second pass to give xfs_bmap_longest_free_extent() - * another attempt at locking AGFs that it might have skipped - * over before we fail. - */ - if (first_pass) { - first_pass = false; - goto restart; - } + /* + * We must be low on data space, so run a final lowspace optimised + * selection pass if we haven't already. + */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + goto restart; + } - /* - * We must be low on data space, so run a final lowspace - * optimised selection pass if we haven't already. - */ - if (!(flags & XFS_PICK_LOWSPACE)) { - flags |= XFS_PICK_LOWSPACE; - goto restart; + /* + * No unassociated AGs are available, so select the AG with the most + * free space, regardless of whether it's already in use by another + * filestream. It none suit, just use whatever AG we can grab. 
+ */ + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, pag) { + max_pag = pag; + break; } - /* - * No unassociated AGs are available, so select the AG with the - * most free space, regardless of whether it's already in use by - * another filestream. It none suit, just use whatever AG we can - * grab. - */ - if (!max_pag) { - for_each_perag_wrap(args->mp, 0, start_agno, args->pag) - break; - atomic_inc(&args->pag->pagf_fstrms); - *longest = 0; - } else { - pag = max_pag; - free = maxfree; - atomic_inc(&pag->pagf_fstrms); - } - } else if (max_pag) { - xfs_perag_rele(max_pag); + /* Bail if there are no AGs at all to select from. */ + if (!max_pag) + return -ENOSPC; } - trace_xfs_filestream_pick(pag, pino, free); + pag = max_pag; + atomic_inc(&pag->pagf_fstrms); +done: + trace_xfs_filestream_pick(pag, pino); args->pag = pag; return 0; - } static struct xfs_inode * @@ -225,12 +222,12 @@ xfs_filestream_lookup_association( * down immediately after we mark the lookup as done. */ pag = container_of(mru, struct xfs_fstrm_item, mru)->pag; - atomic_inc(&pag->pag_active_ref); + atomic_inc(&pag_group(pag)->xg_active_ref); xfs_mru_cache_done(mp->m_filestream); trace_xfs_filestream_lookup(pag, ap->ip->i_ino); - ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0); + ap->blkno = xfs_agbno_to_fsb(pag, 0); xfs_bmap_adjacent(ap); /* @@ -278,7 +275,7 @@ xfs_filestream_create_association( struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount; + agno = (pag_agno(item->pag) + 1) % mp->m_sb.sb_agcount; xfs_fstrm_free_func(mp, mru); } else if (xfs_is_inode32(mp)) { xfs_agnumber_t rotorstep = xfs_rotorstep; @@ -317,7 +314,7 @@ xfs_filestream_create_association( if (!item) goto out_put_fstrms; - atomic_inc(&args->pag->pag_active_ref); + atomic_inc(&pag_group(args->pag)->xg_active_ref); item->pag = args->pag; error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); if (error) @@ -347,7 +344,6 @@ xfs_filestream_select_ag( struct xfs_alloc_arg *args, xfs_extlen_t *longest) { - struct xfs_mount *mp = args->mp; struct xfs_inode *pip; xfs_ino_t ino = 0; int error = 0; @@ -373,7 +369,7 @@ xfs_filestream_select_ag( return error; out_select: - ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0); + ap->blkno = xfs_agbno_to_fsb(args->pag, 0); return 0; } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index ae18ab86e608..3290dd8524a6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -25,6 +25,7 @@ #include "xfs_alloc_btree.h" #include "xfs_rtbitmap.h" #include "xfs_ag.h" +#include "xfs_rtgroup.h" /* Convert an xfs_fsmap to an fsmap. */ static void @@ -110,18 +111,18 @@ xfs_fsmap_owner_to_rmap( /* Convert an rmapbt owner into an fsmap owner. 
*/ static int -xfs_fsmap_owner_from_rmap( +xfs_fsmap_owner_from_frec( struct xfs_fsmap *dest, - const struct xfs_rmap_irec *src) + const struct xfs_fsmap_irec *frec) { dest->fmr_flags = 0; - if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { - dest->fmr_owner = src->rm_owner; + if (!XFS_RMAP_NON_INODE_OWNER(frec->owner)) { + dest->fmr_owner = frec->owner; return 0; } dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; - switch (src->rm_owner) { + switch (frec->owner) { case XFS_RMAP_OWN_FS: dest->fmr_owner = XFS_FMR_OWN_FS; break; @@ -158,11 +159,12 @@ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ - struct xfs_perag *pag; /* AG info, if applicable */ + struct xfs_group *group; /* group info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; - xfs_daddr_t end_daddr; /* daddr of high fsmap key */ + /* daddr of high fsmap key, or the last daddr on the device */ + xfs_daddr_t end_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -203,7 +205,7 @@ STATIC int xfs_getfsmap_is_shared( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, + const struct xfs_fsmap_irec *frec, bool *stat) { struct xfs_mount *mp = tp->t_mountp; @@ -216,15 +218,17 @@ xfs_getfsmap_is_shared( if (!xfs_has_reflink(mp)) return 0; /* rt files will have no perag structure */ - if (!info->pag) + if (!info->group) return 0; /* Are there any shared blocks here? */ flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag); + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + to_perag(info->group)); - error = xfs_refcount_find_shared(cur, rec->rm_startblock, - rec->rm_blockcount, &fbno, &flen, false); + error = xfs_refcount_find_shared(cur, frec->rec_key, + XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, + false); xfs_btree_del_cursor(cur, error); if (error) @@ -249,15 +253,22 @@ xfs_getfsmap_format( } static inline bool -xfs_getfsmap_rec_before_start( +xfs_getfsmap_frec_before_start( struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr) + const struct xfs_fsmap_irec *frec) { if (info->low_daddr != XFS_BUF_DADDR_NULL) - return rec_daddr < info->low_daddr; - if (info->low.rm_blockcount) - return xfs_rmap_compare(rec, &info->low) < 0; + return frec->start_daddr < info->low_daddr; + if (info->low.rm_blockcount) { + struct xfs_rmap_irec rec = { + .rm_startblock = frec->rec_key, + .rm_owner = frec->owner, + .rm_flags = frec->rm_flags, + }; + + return xfs_rmap_compare(&rec, &info->low) < 0; + } + return false; } @@ -270,61 +281,36 @@ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr, - xfs_daddr_t len_daddr) + const struct xfs_fsmap_irec *frec) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; bool shared; - int error; + int error = 0; if (fatal_signal_pending(current)) return -EINTR; - if (len_daddr == 0) - len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount); - /* * Filter out records that start before our startpoint, if the * caller requested that. 
*/ - if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; - return 0; - } - - /* - * For an info->last query, we're looking for a gap between the last - * mapping emitted and the high key specified by userspace. If the - * user's query spans less than 1 fsblock, then info->high and - * info->low will have the same rm_startblock, which causes rec_daddr - * and next_daddr to be the same. Therefore, use the end_daddr that - * we calculated from userspace's high key to synthesize the record. - * Note that if the btree query found a mapping, there won't be a gap. - */ - if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) - rec_daddr = info->end_daddr; + if (xfs_getfsmap_frec_before_start(info, frec)) + goto out; /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) return -ECANCELED; - if (rec_daddr > info->next_daddr) + if (frec->start_daddr > info->next_daddr) info->head->fmh_entries++; if (info->last) return 0; info->head->fmh_entries++; - - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; - return 0; + goto out; } /* @@ -332,7 +318,7 @@ xfs_getfsmap_helper( * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. */ - if (rec_daddr > info->next_daddr) { + if (frec->start_daddr > info->next_daddr) { if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; @@ -340,7 +326,7 @@ xfs_getfsmap_helper( fmr.fmr_physical = info->next_daddr; fmr.fmr_owner = info->missing_owner; fmr.fmr_offset = 0; - fmr.fmr_length = rec_daddr - info->next_daddr; + fmr.fmr_length = frec->start_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; xfs_getfsmap_format(mp, &fmr, info); } @@ -353,23 +339,24 @@ xfs_getfsmap_helper( return -ECANCELED; trace_xfs_fsmap_mapping(mp, info->dev, - info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec); + info->group ? 
info->group->xg_gno : NULLAGNUMBER, + frec); fmr.fmr_device = info->dev; - fmr.fmr_physical = rec_daddr; - error = xfs_fsmap_owner_from_rmap(&fmr, rec); + fmr.fmr_physical = frec->start_daddr; + error = xfs_fsmap_owner_from_frec(&fmr, frec); if (error) return error; - fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); - fmr.fmr_length = len_daddr; - if (rec->rm_flags & XFS_RMAP_UNWRITTEN) + fmr.fmr_offset = XFS_FSB_TO_BB(mp, frec->offset); + fmr.fmr_length = frec->len_daddr; + if (frec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; - if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + if (frec->rm_flags & XFS_RMAP_ATTR_FORK) fmr.fmr_flags |= FMR_OF_ATTR_FORK; - if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + if (frec->rm_flags & XFS_RMAP_BMBT_BLOCK) fmr.fmr_flags |= FMR_OF_EXTENT_MAP; if (fmr.fmr_flags == 0) { - error = xfs_getfsmap_is_shared(tp, info, rec, &shared); + error = xfs_getfsmap_is_shared(tp, info, frec, &shared); if (error) return error; if (shared) @@ -378,28 +365,55 @@ xfs_getfsmap_helper( xfs_getfsmap_format(mp, &fmr, info); out: - rec_daddr += len_daddr; - if (info->next_daddr < rec_daddr) - info->next_daddr = rec_daddr; + info->next_daddr = max(info->next_daddr, + frec->start_daddr + frec->len_daddr); return 0; } +static inline int +xfs_getfsmap_group_helper( + struct xfs_getfsmap_info *info, + struct xfs_trans *tp, + struct xfs_group *xg, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + struct xfs_fsmap_irec *frec) +{ + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. 
+ */ + if (info->last) + frec->start_daddr = info->end_daddr + 1; + else + frec->start_daddr = xfs_gbno_to_daddr(xg, startblock); + + frec->len_daddr = XFS_FSB_TO_BB(xg->xg_mount, blockcount); + return xfs_getfsmap_helper(tp, info, frec); +} + /* Transform a rmapbt irec into a fsmap */ STATIC int -xfs_getfsmap_datadev_helper( +xfs_getfsmap_rmapbt_helper( struct xfs_btree_cur *cur, const struct xfs_rmap_irec *rec, void *priv) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_fsmap_irec frec = { + .owner = rec->rm_owner, + .offset = rec->rm_offset, + .rm_flags = rec->rm_flags, + .rec_key = rec->rm_startblock, + }; struct xfs_getfsmap_info *info = priv; - xfs_fsblock_t fsb; - xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); - rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); - - return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0); + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->rm_startblock, rec->rm_blockcount, &frec); } /* Transform a bnobt irec into a fsmap */ @@ -409,21 +423,14 @@ xfs_getfsmap_datadev_bnobt_helper( const struct xfs_alloc_rec_incore *rec, void *priv) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_NULL, /* "free" */ + .rec_key = rec->ar_startblock, + }; struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; - - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno, - rec->ar_startblock); - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; - - return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0); + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, + rec->ar_startblock, rec->ar_blockcount, &frec); } /* Set rmap flags based on the getfsmap flags */ @@ -467,12 +474,11 @@ __xfs_getfsmap_datadev( void *priv) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; struct xfs_btree_cur *bt_cur = NULL; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; - xfs_agnumber_t start_ag; - xfs_agnumber_t end_ag; + xfs_agnumber_t start_ag, end_ag; uint64_t eofs; int error = 0; @@ -520,13 +526,13 @@ __xfs_getfsmap_datadev( start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); - for_each_perag_range(mp, start_ag, end_ag, pag) { + while ((pag = xfs_perag_next_range(mp, pag, start_ag, end_ag))) { /* * Set the AG high key from the fsmap high key if this * is the last AG that we're querying. */ - info->pag = pag; - if (pag->pag_agno == end_ag) { + info->group = pag_group(pag); + if (pag_agno(pag) == end_ag) { info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsb); info->high.rm_offset = XFS_BB_TO_FSBT(mp, @@ -548,9 +554,9 @@ __xfs_getfsmap_datadev( if (error) break; - trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno, + trace_xfs_fsmap_low_group_key(mp, info->dev, pag_agno(pag), &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno, + trace_xfs_fsmap_high_group_key(mp, info->dev, pag_agno(pag), &info->high); error = query_fn(tp, info, &bt_cur, priv); @@ -561,7 +567,7 @@ __xfs_getfsmap_datadev( * Set the AG low key to the start of the AG prior to * moving on to the next AG. 
*/ - if (pag->pag_agno == start_ag) + if (pag_agno(pag) == start_ag) memset(&info->low, 0, sizeof(info->low)); /* @@ -569,13 +575,13 @@ __xfs_getfsmap_datadev( * before we drop the reference to the perag when the loop * terminates. */ - if (pag->pag_agno == end_ag) { + if (pag_agno(pag) == end_ag) { info->last = true; error = query_fn(tp, info, &bt_cur, priv); if (error) break; } - info->pag = NULL; + info->group = NULL; } if (bt_cur) @@ -585,9 +591,9 @@ __xfs_getfsmap_datadev( xfs_trans_brelse(tp, info->agf_bp); info->agf_bp = NULL; } - if (info->pag) { - xfs_perag_rele(info->pag); - info->pag = NULL; + if (info->group) { + xfs_perag_rele(pag); + info->group = NULL; } else if (pag) { /* loop termination case */ xfs_perag_rele(pag); @@ -606,13 +612,13 @@ xfs_getfsmap_datadev_rmapbt_query( { /* Report any gap at the end of the last AG. */ if (info->last) - return xfs_getfsmap_datadev_helper(*curpp, &info->high, info); + return xfs_getfsmap_rmapbt_helper(*curpp, &info->high, info); /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag); + to_perag(info->group)); return xfs_rmap_query_range(*curpp, &info->low, &info->high, - xfs_getfsmap_datadev_helper, info); + xfs_getfsmap_rmapbt_helper, info); } /* Execute a getfsmap query against the regular data device rmapbt. */ @@ -643,7 +649,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag); + to_perag(info->group)); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], @@ -672,9 +678,12 @@ xfs_getfsmap_logdev( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { + struct xfs_fsmap_irec frec = { + .start_daddr = 0, + .rec_key = 0, + .owner = XFS_RMAP_OWN_LOG, + }; struct xfs_mount *mp = tp->t_mountp; - struct xfs_rmap_irec rmap; - xfs_daddr_t rec_daddr, len_daddr; xfs_fsblock_t start_fsb, end_fsb; uint64_t eofs; @@ -689,51 +698,52 @@ xfs_getfsmap_logdev( if (keys[0].fmr_length > 0) info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); + trace_xfs_fsmap_low_linear_key(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_linear_key(mp, info->dev, end_fsb); if (start_fsb > 0) return 0; /* Fabricate an rmap entry for the external log device. 
*/ - rmap.rm_startblock = 0; - rmap.rm_blockcount = mp->m_sb.sb_logblocks; - rmap.rm_owner = XFS_RMAP_OWN_LOG; - rmap.rm_offset = 0; - rmap.rm_flags = 0; - - rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); - len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); - return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); + frec.len_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + return xfs_getfsmap_helper(tp, info, &frec); } #ifdef CONFIG_XFS_RT /* Transform a rtbitmap "record" into a fsmap */ STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_NULL, /* "free" */ + }; + struct xfs_mount *mp = rtg_mount(rtg); struct xfs_getfsmap_info *info = priv; - struct xfs_rmap_irec irec; - xfs_rtblock_t rtbno; - xfs_daddr_t rec_daddr, len_daddr; + xfs_rtblock_t start_rtb = + xfs_rtx_to_rtb(rtg, rec->ar_startext); + uint64_t rtbcount = + xfs_rtbxlen_to_blen(mp, rec->ar_extcount); - rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); - rec_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_startblock = rtbno; - - rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); - len_daddr = XFS_FSB_TO_BB(mp, rtbno); - irec.rm_blockcount = rtbno; - - irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ - irec.rm_offset = 0; - irec.rm_flags = 0; + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last) + frec.start_daddr = info->end_daddr + 1; + else + frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb); - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); + frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount); + return xfs_getfsmap_helper(tp, info, &frec); } /* Execute a getfsmap query against the realtime device rtbitmap. */ @@ -743,58 +753,83 @@ xfs_getfsmap_rtdev_rtbitmap( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { - - struct xfs_rtalloc_rec ahigh = { 0 }; struct xfs_mount *mp = tp->t_mountp; - xfs_rtblock_t start_rtb; - xfs_rtblock_t end_rtb; - xfs_rtxnum_t high; + xfs_rtblock_t start_rtbno, end_rtbno; + xfs_rtxnum_t start_rtx, end_rtx; + xfs_rgnumber_t start_rgno, end_rgno; + struct xfs_rtgroup *rtg = NULL; uint64_t eofs; int error; - eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = XFS_BB_TO_FSBT(mp, - keys[0].fmr_physical + keys[0].fmr_length); - end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_UNKNOWN; /* Adjust the low key if we are continuing from where we left off. 
*/ + start_rtbno = xfs_daddr_to_rtb(mp, + keys[0].fmr_physical + keys[0].fmr_length); if (keys[0].fmr_length > 0) { - info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + info->low_daddr = xfs_rtb_to_daddr(mp, start_rtbno); if (info->low_daddr >= eofs) return 0; } + start_rtx = xfs_rtb_to_rtx(mp, start_rtbno); + start_rgno = xfs_rtb_to_rgno(mp, start_rtbno); - trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); - trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); + end_rtbno = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + end_rgno = xfs_rtb_to_rgno(mp, end_rtbno); - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); + trace_xfs_fsmap_low_linear_key(mp, info->dev, start_rtbno); + trace_xfs_fsmap_high_linear_key(mp, info->dev, end_rtbno); - /* - * Set up query parameters to return free rtextents covering the range - * we want. - */ - high = xfs_rtb_to_rtxup(mp, end_rtb); - error = xfs_rtalloc_query_range(mp, tp, xfs_rtb_to_rtx(mp, start_rtb), - high, xfs_getfsmap_rtdev_rtbitmap_helper, info); - if (error) - goto err; + end_rtx = -1ULL; - /* - * Report any gaps at the end of the rtbitmap by simulating a null - * rmap starting at the block after the end of the query range. - */ - info->last = true; - ahigh.ar_startext = min(mp->m_sb.sb_rextents, high); + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) { + if (rtg_rgno(rtg) == end_rgno) + end_rtx = xfs_rtb_to_rtx(mp, + end_rtbno + mp->m_sb.sb_rextsize - 1); + + info->group = rtg_group(rtg); + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_range(rtg, tp, start_rtx, end_rtx, + xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + break; + + /* + * Report any gaps at the end of the rtbitmap by simulating a + * zero-length free extent starting at the rtx after the end + * of the query range. + */ + if (rtg_rgno(rtg) == end_rgno) { + struct xfs_rtalloc_rec ahigh = { + .ar_startext = min(end_rtx + 1, + rtg->rtg_extents), + }; + + info->last = true; + error = xfs_getfsmap_rtdev_rtbitmap_helper(rtg, tp, + &ahigh, info); + if (error) + break; + } + + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + info->group = NULL; + start_rtx = 0; + } + + /* loop termination case */ + if (rtg) { + if (info->group) { + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + info->group = NULL; + } + xfs_rtgroup_rele(rtg); + } - error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info); - if (error) - goto err; -err: - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); return error; } #endif /* CONFIG_XFS_RT */ @@ -898,7 +933,10 @@ xfs_getfsmap( struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; - struct xfs_getfsmap_info info = { NULL }; + struct xfs_getfsmap_info info = { + .fsmap_recs = fsmap_recs, + .head = head, + }; bool use_rmap; int i; int error = 0; @@ -963,9 +1001,6 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.end_daddr = XFS_BUF_DADDR_NULL; - info.fsmap_recs = fsmap_recs; - info.head = head; /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { @@ -978,17 +1013,23 @@ xfs_getfsmap( break; /* - * If this device number matches the high key, we have - * to pass the high key to the handler to limit the - * query results. If the device number exceeds the - * low key, zero out the low key so that we get - * everything from the beginning. 
+ * If this device number matches the high key, we have to pass + * the high key to the handler to limit the query results, and + * set the end_daddr so that we can synthesize records at the + * end of the query range or device. */ if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; info.end_daddr = min(handlers[i].nr_sectors - 1, dkeys[1].fmr_physical); + } else { + info.end_daddr = handlers[i].nr_sectors - 1; } + + /* + * If the device number exceeds the low key, zero out the low + * key so that we get everything from the beginning. + */ if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); @@ -1003,7 +1044,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; - info.pag = NULL; + info.group = NULL; info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index a0bcc38486a5..06e492fd479d 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -28,6 +28,21 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; +/* internal fsmap record format */ +struct xfs_fsmap_irec { + xfs_daddr_t start_daddr; + xfs_daddr_t len_daddr; + uint64_t owner; /* extent owner */ + uint64_t offset; /* offset within the owner */ + unsigned int rm_flags; /* rmap state flags */ + + /* + * rmapbt startblock corresponding to start_daddr, if the record came + * from an rmap btree. + */ + xfs_agblock_t rec_key; +}; + int xfs_ioc_getfsmap(struct xfs_inode *ip, struct fsmap_head __user *arg); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3643cc843f62..28dde215c899 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -87,6 +87,7 @@ xfs_growfs_data_private( struct xfs_mount *mp, /* mount point for filesystem */ struct xfs_growfs_data *in) /* growfs data input struct */ { + xfs_agnumber_t oagcount = mp->m_sb.sb_agcount; struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; @@ -94,7 +95,6 @@ xfs_growfs_data_private( xfs_rfsblock_t nb, nb_div, nb_mod; int64_t delta; bool lastag_extended = false; - xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; struct xfs_perag *last_pag; @@ -138,16 +138,14 @@ xfs_growfs_data_private( if (delta == 0) return 0; - oagcount = mp->m_sb.sb_agcount; - /* allocate the new per-ag structures */ - if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); - if (error) - return error; - } else if (nagcount < oagcount) { - /* TODO: shrinking the entire AGs hasn't yet completed */ + /* TODO: shrinking the entire AGs hasn't yet completed */ + if (nagcount < oagcount) return -EINVAL; - } + + /* allocate the new per-ag structures */ + error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax); + if (error) + return error; if (delta > 0) error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, @@ -164,9 +162,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, last_pag, &lastag_extended); } else { - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, - "EXPERIMENTAL online shrink feature in use. 
Use at your own risk!"); - + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SHRINK); error = xfs_ag_shrink_space(last_pag, &tp, -delta); } xfs_perag_put(last_pag); @@ -231,7 +227,7 @@ out_trans_cancel: xfs_trans_cancel(tp); out_free_unused_perag: if (nagcount > oagcount) - xfs_free_unused_perag_range(mp, oagcount, nagcount); + xfs_free_perag_range(mp, oagcount, nagcount); return error; } @@ -530,13 +526,12 @@ int xfs_fs_reserve_ag_blocks( struct xfs_mount *mp) { - xfs_agnumber_t agno; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; int error = 0; int err2; mp->m_finobt_nores = false; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { err2 = xfs_ag_resv_init(pag, NULL); if (err2 && !error) error = err2; @@ -558,9 +553,8 @@ void xfs_fs_unreserve_ag_blocks( struct xfs_mount *mp) { - xfs_agnumber_t agno; - struct xfs_perag *pag; + struct xfs_perag *pag = NULL; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index 49e5e5f04e60..f19fce557354 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -85,22 +85,23 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f = EMPTY_FD; struct path path; int error; struct xfs_inode *ip; if (cmd == XFS_IOC_FD_TO_HANDLE) { - f = fdget(hreq->fd); - if (!fd_file(f)) + CLASS(fd, f)(hreq->fd); + + if (fd_empty(f)) return -EBADF; - inode = file_inode(fd_file(f)); + path = fd_file(f)->f_path; + path_get(&path); } else { error = user_path_at(AT_FDCWD, hreq->path, 0, &path); if (error) return error; - inode = d_inode(path.dentry); } + inode = d_inode(path.dentry); ip = XFS_I(inode); /* @@ -134,10 +135,7 @@ xfs_find_handle( error = 0; out_put: - if (cmd == XFS_IOC_FD_TO_HANDLE) - fdput(f); - else - path_put(&path); + path_put(&path); return error; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 10f116d093a2..c7c2e6561998 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -18,6 +18,22 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_quota_defs.h" +#include "xfs_rtgroup.h" + +static void +xfs_health_unmount_group( + struct xfs_group *xg, + bool *warn) +{ + unsigned int sick = 0; + unsigned int checked = 0; + + xfs_group_measure_sickness(xg, &sick, &checked); + if (sick) { + trace_xfs_group_unfixed_corruption(xg, sick); + *warn = true; + } +} /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -28,8 +44,8 @@ void xfs_health_unmount( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; + struct xfs_rtgroup *rtg = NULL; unsigned int sick = 0; unsigned int checked = 0; bool warn = false; @@ -38,20 +54,12 @@ xfs_health_unmount( return; /* Measure AG corruption levels. */ - for_each_perag(mp, agno, pag) { - xfs_ag_measure_sickness(pag, &sick, &checked); - if (sick) { - trace_xfs_ag_unfixed_corruption(mp, agno, sick); - warn = true; - } - } + while ((pag = xfs_perag_next(mp, pag))) + xfs_health_unmount_group(pag_group(pag), &warn); - /* Measure realtime volume corruption levels. */ - xfs_rt_measure_sickness(mp, &sick, &checked); - if (sick) { - trace_xfs_rt_unfixed_corruption(mp, sick); - warn = true; - } + /* Measure realtime group corruption levels. */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_health_unmount_group(rtg_group(rtg), &warn); /* * Measure fs corruption and keep the sample around for the warning. 
@@ -150,65 +158,6 @@ xfs_fs_measure_sickness( spin_unlock(&mp->m_sb_lock); } -/* Mark unhealthy realtime metadata. */ -void -xfs_rt_mark_sick( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_sick(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Mark realtime metadata as having been checked and found unhealthy by fsck. */ -void -xfs_rt_mark_corrupt( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_corrupt(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick |= mask; - mp->m_rt_checked |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Mark a realtime metadata healed. */ -void -xfs_rt_mark_healthy( - struct xfs_mount *mp, - unsigned int mask) -{ - ASSERT(!(mask & ~XFS_SICK_RT_ALL)); - trace_xfs_rt_mark_healthy(mp, mask); - - spin_lock(&mp->m_sb_lock); - mp->m_rt_sick &= ~mask; - if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY)) - mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY; - mp->m_rt_checked |= mask; - spin_unlock(&mp->m_sb_lock); -} - -/* Sample which realtime metadata are unhealthy. */ -void -xfs_rt_measure_sickness( - struct xfs_mount *mp, - unsigned int *sick, - unsigned int *checked) -{ - spin_lock(&mp->m_sb_lock); - *sick = mp->m_rt_sick; - *checked = mp->m_rt_checked; - spin_unlock(&mp->m_sb_lock); -} - /* Mark unhealthy per-ag metadata given a raw AG number. */ void xfs_agno_mark_sick( @@ -226,63 +175,95 @@ xfs_agno_mark_sick( xfs_perag_put(pag); } +static inline void +xfs_group_check_mask( + struct xfs_group *xg, + unsigned int mask) +{ + if (xg->xg_type == XG_TYPE_AG) + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); + else + ASSERT(!(mask & ~XFS_SICK_RG_ALL)); +} + /* Mark unhealthy per-ag metadata. */ void -xfs_ag_mark_sick( - struct xfs_perag *pag, +xfs_group_mark_sick( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_sick(xg, mask); - spin_lock(&pag->pag_state_lock); - pag->pag_sick |= mask; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + xg->xg_sick |= mask; + spin_unlock(&xg->xg_state_lock); } -/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */ +/* + * Mark per-group metadata as having been checked and found unhealthy by fsck. + */ void -xfs_ag_mark_corrupt( - struct xfs_perag *pag, +xfs_group_mark_corrupt( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_corrupt(xg, mask); - spin_lock(&pag->pag_state_lock); - pag->pag_sick |= mask; - pag->pag_checked |= mask; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + xg->xg_sick |= mask; + xg->xg_checked |= mask; + spin_unlock(&xg->xg_state_lock); } -/* Mark per-ag metadata ok. */ +/* + * Mark per-group metadata ok. 
+ */ void -xfs_ag_mark_healthy( - struct xfs_perag *pag, +xfs_group_mark_healthy( + struct xfs_group *xg, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_ALL)); - trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); - - spin_lock(&pag->pag_state_lock); - pag->pag_sick &= ~mask; - if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY)) - pag->pag_sick &= ~XFS_SICK_AG_SECONDARY; - pag->pag_checked |= mask; - spin_unlock(&pag->pag_state_lock); + xfs_group_check_mask(xg, mask); + trace_xfs_group_mark_healthy(xg, mask); + + spin_lock(&xg->xg_state_lock); + xg->xg_sick &= ~mask; + if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY)) + xg->xg_sick &= ~XFS_SICK_AG_SECONDARY; + xg->xg_checked |= mask; + spin_unlock(&xg->xg_state_lock); } /* Sample which per-ag metadata are unhealthy. */ void -xfs_ag_measure_sickness( - struct xfs_perag *pag, +xfs_group_measure_sickness( + struct xfs_group *xg, unsigned int *sick, unsigned int *checked) { - spin_lock(&pag->pag_state_lock); - *sick = pag->pag_sick; - *checked = pag->pag_checked; - spin_unlock(&pag->pag_state_lock); + spin_lock(&xg->xg_state_lock); + *sick = xg->xg_sick; + *checked = xg->xg_checked; + spin_unlock(&xg->xg_state_lock); +} + +/* Mark unhealthy per-rtgroup metadata given a raw rt group number. */ +void +xfs_rgno_mark_sick( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + unsigned int mask) +{ + struct xfs_rtgroup *rtg = xfs_rtgroup_get(mp, rgno); + + /* per-rtgroup structure not set up yet? */ + if (!rtg) + return; + + xfs_group_mark_sick(rtg_group(rtg), mask); + xfs_rtgroup_put(rtg); } /* Mark the unhealthy parts of an inode. */ @@ -369,6 +350,9 @@ struct ioctl_sick_map { unsigned int ioctl_mask; }; +#define for_each_sick_map(map, m) \ + for ((m) = (map); (m) < (map) + ARRAY_SIZE(map); (m)++) + static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS}, { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, @@ -376,13 +360,13 @@ static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS }, - { 0, 0 }, + { XFS_SICK_FS_METADIR, XFS_FSOP_GEOM_SICK_METADIR }, + { XFS_SICK_FS_METAPATH, XFS_FSOP_GEOM_SICK_METAPATH }, }; static const struct ioctl_sick_map rt_map[] = { - { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, - { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, - { 0, 0 }, + { XFS_SICK_RG_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, + { XFS_SICK_RG_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, }; static inline void @@ -404,6 +388,7 @@ xfs_fsop_geom_health( struct xfs_mount *mp, struct xfs_fsop_geom *geo) { + struct xfs_rtgroup *rtg = NULL; const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; @@ -412,12 +397,14 @@ xfs_fsop_geom_health( geo->checked = 0; xfs_fs_measure_sickness(mp, &sick, &checked); - for (m = fs_map; m->sick_mask; m++) + for_each_sick_map(fs_map, m) xfgeo_health_tick(geo, sick, checked, m); - xfs_rt_measure_sickness(mp, &sick, &checked); - for (m = rt_map; m->sick_mask; m++) - xfgeo_health_tick(geo, sick, checked, m); + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + for_each_sick_map(rt_map, m) + xfgeo_health_tick(geo, sick, checked, m); + } } static const struct ioctl_sick_map ag_map[] = { @@ -432,7 +419,6 @@ static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, { 
XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES }, - { 0, 0 }, }; /* Fill out ag geometry health info. */ @@ -448,8 +434,8 @@ xfs_ag_geom_health( ageo->ag_sick = 0; ageo->ag_checked = 0; - xfs_ag_measure_sickness(pag, &sick, &checked); - for (m = ag_map; m->sick_mask; m++) { + xfs_group_measure_sickness(pag_group(pag), &sick, &checked); + for_each_sick_map(ag_map, m) { if (checked & m->sick_mask) ageo->ag_checked |= m->ioctl_mask; if (sick & m->sick_mask) @@ -457,6 +443,34 @@ xfs_ag_geom_health( } } +static const struct ioctl_sick_map rtgroup_map[] = { + { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, + { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, + { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, +}; + +/* Fill out rtgroup geometry health info. */ +void +xfs_rtgroup_geom_health( + struct xfs_rtgroup *rtg, + struct xfs_rtgroup_geometry *rgeo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + rgeo->rg_sick = 0; + rgeo->rg_checked = 0; + + xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); + for_each_sick_map(rtgroup_map, m) { + if (checked & m->sick_mask) + rgeo->rg_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + rgeo->rg_sick |= m->ioctl_mask; + } +} + static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, @@ -471,7 +485,6 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE }, - { 0, 0 }, }; /* Fill out bulkstat health info. */ @@ -488,7 +501,7 @@ xfs_bulkstat_health( bs->bs_checked = 0; xfs_inode_measure_sickness(ip, &sick, &checked); - for (m = ino_map; m->sick_mask; m++) { + for_each_sick_map(ino_map, m) { if (checked & m->sick_mask) bs->bs_checked |= m->ioctl_mask; if (sick & m->sick_mask) @@ -527,24 +540,13 @@ void xfs_btree_mark_sick( struct xfs_btree_cur *cur) { - switch (cur->bc_ops->type) { - case XFS_BTREE_TYPE_MEM: - /* no health state tracking for ephemeral btrees */ - return; - case XFS_BTREE_TYPE_AG: + if (xfs_btree_is_bmap(cur->bc_ops)) { + xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); + /* no health state tracking for ephemeral btrees */ + } else if (cur->bc_ops->type != XFS_BTREE_TYPE_MEM) { + ASSERT(cur->bc_group); ASSERT(cur->bc_ops->sick_mask); - xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); - return; - case XFS_BTREE_TYPE_INODE: - if (xfs_btree_is_bmap(cur->bc_ops)) { - xfs_bmap_mark_sick(cur->bc_ino.ip, - cur->bc_ino.whichfork); - return; - } - fallthrough; - default: - ASSERT(0); - return; + xfs_group_mark_sick(cur->bc_group, cur->bc_ops->sick_mask); } } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index a680e5b82672..7b6c026d01a1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -25,6 +25,9 @@ #include "xfs_ag.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_metafile.h" #include <linux/iversion.h> @@ -204,7 +207,7 @@ xfs_reclaim_work_queue( { rcu_read_lock(); - if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { + if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } @@ -219,15 +222,14 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); if 
(!xfs_is_blockgc_enabled(mp)) return; rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_blockgc_wq, - &pag->pag_blockgc_work, + queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); } @@ -239,7 +241,6 @@ xfs_perag_set_inode_tag( xfs_agino_t agino, unsigned int tag) { - struct xfs_mount *mp = pag->pag_mount; bool was_tagged; lockdep_assert_held(&pag->pag_ici_lock); @@ -253,13 +254,13 @@ xfs_perag_set_inode_tag( if (was_tagged) return; - /* propagate the tag up into the perag radix tree */ - xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); + /* propagate the tag up into the pag xarray tree */ + xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag)); /* start background work */ switch (tag) { case XFS_ICI_RECLAIM_TAG: - xfs_reclaim_work_queue(mp); + xfs_reclaim_work_queue(pag_mount(pag)); break; case XFS_ICI_BLOCKGC_TAG: xfs_blockgc_queue(pag); @@ -276,8 +277,6 @@ xfs_perag_clear_inode_tag( xfs_agino_t agino, unsigned int tag) { - struct xfs_mount *mp = pag->pag_mount; - lockdep_assert_held(&pag->pag_ici_lock); /* @@ -295,9 +294,8 @@ xfs_perag_clear_inode_tag( if (radix_tree_tagged(&pag->pag_ici_root, tag)) return; - /* clear the tag from the perag radix tree */ - xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag)); - + /* clear the tag from the pag xarray */ + xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag)); trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); } @@ -310,22 +308,9 @@ xfs_perag_grab_next_tag( struct xfs_perag *pag, int tag) { - unsigned long index = 0; - - if (pag) { - index = pag->pag_agno + 1; - xfs_perag_rele(pag); - } - - rcu_read_lock(); - pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag)); - if (pag) { - trace_xfs_perag_grab_next_tag(pag, _RET_IP_); - if (!atomic_inc_not_zero(&pag->pag_active_ref)) - pag = NULL; - } - rcu_read_unlock(); - return pag; + return to_perag(xfs_group_grab_next_mark(mp, + pag ? pag_group(pag) : NULL, + ici_tag_to_mark(tag), XG_TYPE_AG)); } /* @@ -847,6 +832,77 @@ out_error_or_again: } /* + * Get a metadata inode. + * + * The metafile type must match the file mode exactly, and for files in the + * metadata directory tree, it must match the inode's metatype exactly. + */ +int +xfs_trans_metafile_iget( + struct xfs_trans *tp, + xfs_ino_t ino, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; + umode_t mode; + int error; + + error = xfs_iget(mp, tp, ino, 0, 0, &ip); + if (error == -EFSCORRUPTED || error == -EINVAL) + goto whine; + if (error) + return error; + + if (VFS_I(ip)->i_nlink == 0) + goto bad_rele; + + if (metafile_type == XFS_METAFILE_DIR) + mode = S_IFDIR; + else + mode = S_IFREG; + if (inode_wrong_type(VFS_I(ip), mode)) + goto bad_rele; + if (xfs_has_metadir(mp)) { + if (!xfs_is_metadir_inode(ip)) + goto bad_rele; + if (metafile_type != ip->i_metatype) + goto bad_rele; + } + + *ipp = ip; + return 0; +bad_rele: + xfs_irele(ip); +whine: + xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino, + metafile_type); + xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); + return -EFSCORRUPTED; +} + +/* Grab a metadata file if the caller doesn't already have a transaction. 
*/ +int +xfs_metafile_iget( + struct xfs_mount *mp, + xfs_ino_t ino, + enum xfs_metafile_type metafile_type, + struct xfs_inode **ipp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); + xfs_trans_cancel(tp); + return error; +} + +/* * Grab the inode for reclaim exclusively. * * We have found this inode via a lookup under RCU, so the inode may have @@ -1014,7 +1070,7 @@ xfs_reclaim_inodes( if (xfs_want_reclaim_sick(mp)) icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; - while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) { + while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { xfs_ail_push_all_sync(mp->m_ail); xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } @@ -1056,7 +1112,7 @@ long xfs_reclaim_inodes_count( struct xfs_mount *mp) { - XA_STATE (xas, &mp->m_perags, 0); + XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0); long reclaimable = 0; struct xfs_perag *pag; @@ -1280,14 +1336,17 @@ xfs_inode_clear_eofblocks_tag( } /* - * Set ourselves up to free CoW blocks from this file. If it's already clean - * then we can bail out quickly, but otherwise we must back off if the file - * is undergoing some kind of write. + * Prepare to free COW fork blocks from an inode. */ static bool xfs_prep_free_cowblocks( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_icwalk *icw) { + bool sync; + + sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); + /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1299,16 +1358,22 @@ xfs_prep_free_cowblocks( } /* - * If the mapping is dirty or under writeback we cannot touch the - * CoW fork. Leave it alone if we're in the midst of a directio. + * A cowblocks trim of an inode can have a significant effect on + * fragmentation even when a reasonable COW extent size hint is set. + * Therefore, we prefer to not process cowblocks unless they are clean + * and idle. We can never process a cowblocks inode that is dirty or has + * in-flight I/O under any circumstances, because outstanding writeback + * or dio expects targeted COW fork blocks exist through write + * completion where they can be remapped into the data fork. + * + * Therefore, the heuristic used here is to never process inodes + * currently opened for write from background (i.e. non-sync) scans. For + * sync scans, use the pagecache/dio state of the inode to ensure we + * never free COW fork blocks out from under pending I/O. */ - if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || - atomic_read(&VFS_I(ip)->i_dio_count)) + if (!sync && inode_is_open_for_write(VFS_I(ip))) return false; - - return true; + return xfs_can_free_cowblocks(ip); } /* @@ -1337,7 +1402,7 @@ xfs_inode_free_cowblocks( if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; - if (!xfs_prep_free_cowblocks(ip)) + if (!xfs_prep_free_cowblocks(ip, icw)) return 0; if (!xfs_icwalk_match(ip, icw)) @@ -1366,7 +1431,7 @@ xfs_inode_free_cowblocks( * Check again, nobody else should be able to dirty blocks or change * the reflink iflag now that we have the first two locks held. 
*/ - if (xfs_prep_free_cowblocks(ip)) + if (xfs_prep_free_cowblocks(ip, icw)) ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); return ret; } @@ -1392,13 +1457,12 @@ void xfs_blockgc_stop( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; if (!xfs_clear_blockgc_enabled(mp)) return; - for_each_perag(mp, agno, pag) + while ((pag = xfs_perag_next(mp, pag))) cancel_delayed_work_sync(&pag->pag_blockgc_work); trace_xfs_blockgc_stop(mp, __return_address); } @@ -1490,7 +1554,7 @@ xfs_blockgc_worker( { struct xfs_perag *pag = container_of(to_delayed_work(work), struct xfs_perag, pag_blockgc_work); - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); int error; trace_xfs_blockgc_worker(mp, __return_address); @@ -1498,7 +1562,7 @@ xfs_blockgc_worker( error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", - pag->pag_agno, error); + pag_agno(pag), error); xfs_blockgc_queue(pag); } @@ -1539,8 +1603,7 @@ xfs_blockgc_flush_all( * queued, it will not be requeued. Then flush whatever is left. */ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) - mod_delayed_work(pag->pag_mount->m_blockgc_wq, - &pag->pag_blockgc_work, 0); + mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0); while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) flush_delayed_work(&pag->pag_blockgc_work); @@ -1679,7 +1742,7 @@ xfs_icwalk_ag( enum xfs_icwalk_goal goal, struct xfs_icwalk *icw) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); uint32_t first_index; int last_error = 0; int skipped; @@ -1732,7 +1795,7 @@ restart: * us to see this inode, so another lookup from the * same index will not find it again. */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bcc277fc0a83..c8ad2606f928 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -43,6 +43,7 @@ #include "xfs_parent.h" #include "xfs_xattr.h" #include "xfs_inode_util.h" +#include "xfs_metafile.h" struct kmem_cache *xfs_inode_cache; @@ -341,8 +342,7 @@ xfs_lock_inumorder( { uint class = 0; - ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP | - XFS_ILOCK_RTSUM))); + ASSERT(!(lock_mode & XFS_ILOCK_PARENT)); ASSERT(xfs_lockdep_subclass_ok(subclass)); if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { @@ -554,8 +554,20 @@ xfs_lookup( if (error) goto out_free_name; + /* + * Fail if a directory entry in the regular directory tree points to + * a metadata file. + */ + if (XFS_IS_CORRUPT(dp->i_mount, xfs_is_metadir_inode(*ipp))) { + xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR); + error = -EFSCORRUPTED; + goto out_irele; + } + return 0; +out_irele: + xfs_irele(*ipp); out_free_name: if (ci_name) kfree(ci_name->name); @@ -1295,7 +1307,7 @@ xfs_inode_needs_inactive( return false; /* Metadata inodes require explicit resource cleanup. */ - if (xfs_is_metadata_inode(ip)) + if (xfs_is_internal_inode(ip)) return false; /* Want to clean out the cow blocks if there are any. */ @@ -1388,7 +1400,7 @@ xfs_inactive( goto out; /* Metadata inodes require explicit resource cleanup. */ - if (xfs_is_metadata_inode(ip)) + if (xfs_is_internal_inode(ip)) goto out; /* Try to clean out the cow blocks if there are any. 
*/ @@ -1409,7 +1421,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) + xfs_inode_has_filedata(ip))) truncate = 1; if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { @@ -1514,9 +1526,8 @@ xfs_iunlink_reload_next( xfs_agino_t next_agino) { struct xfs_perag *pag = agibp->b_pag; - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode *next_ip = NULL; - xfs_ino_t ino; int error; ASSERT(next_agino != NULLAGINO); @@ -1530,7 +1541,7 @@ xfs_iunlink_reload_next( xfs_info_ratelimited(mp, "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", - next_agino, pag->pag_agno); + next_agino, pag_agno(pag)); /* * Use an untrusted lookup just to be cautious in case the AGI has been @@ -1538,8 +1549,8 @@ xfs_iunlink_reload_next( * but we'd rather shut down now since we're already running in a weird * situation. */ - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); - error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + error = xfs_iget(mp, tp, xfs_agino_to_ino(pag, next_agino), + XFS_IGET_UNTRUSTED, 0, &next_ip); if (error) { xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return error; @@ -1573,7 +1584,7 @@ xfs_ifree_mark_inode_stale( struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode_log_item *iip; struct xfs_inode *ip; @@ -3041,7 +3052,7 @@ xfs_inode_alloc_unitsize( /* Should we always be using copy on write for file writes? */ bool xfs_is_always_cow_inode( - struct xfs_inode *ip) + const struct xfs_inode *ip) { return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97ed912306fd..1648dc5a8068 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -65,6 +65,7 @@ typedef struct xfs_inode { uint16_t i_flushiter; /* incremented on flush */ }; uint8_t i_forkoff; /* attr fork offset >> 3 */ + enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */ uint16_t i_diflags; /* XFS_DIFLAG_... */ uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ @@ -100,7 +101,7 @@ static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) return ip->i_prev_unlinked != 0; } -static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) +static inline bool xfs_inode_has_attr_fork(const struct xfs_inode *ip) { return ip->i_forkoff > 0; } @@ -230,7 +231,7 @@ xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) } static inline int -__xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) +__xfs_iflags_test(const struct xfs_inode *ip, unsigned long flags) { return (ip->i_flags & flags); } @@ -271,42 +272,60 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) return ret; } -static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; } -static inline bool xfs_is_metadata_inode(const struct xfs_inode *ip) +static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip) +{ + return ip->i_diflags2 & XFS_DIFLAG2_METADATA; +} + +static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + /* Any file in the metadata directory tree is a metadata inode. 
*/ + if (xfs_has_metadir(mp)) + return xfs_is_metadir_inode(ip); + + /* + * Before metadata directories, the only metadata inodes were the + * three quota files, the realtime bitmap, and the realtime summary. + */ return ip->i_ino == mp->m_sb.sb_rbmino || ip->i_ino == mp->m_sb.sb_rsumino || xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } -bool xfs_is_always_cow_inode(struct xfs_inode *ip); +bool xfs_is_always_cow_inode(const struct xfs_inode *ip); -static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) { return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } +static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) +{ + return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. */ -static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) +static inline bool xfs_inode_has_cow_data(const struct xfs_inode *ip) { return ip->i_cowfp && ip->i_cowfp->if_bytes; } -static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) +static inline bool xfs_inode_has_bigtime(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } -static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) +static inline bool xfs_inode_has_large_extent_counts(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } @@ -315,7 +334,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) * Decide if this file is a realtime file whose data allocation unit is larger * than a single filesystem block. */ -static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) +static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) { return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } @@ -327,6 +346,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool +xfs_inode_can_atomicwrite( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + return false; + if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + return false; + + return true; +} + /* * In-core inode flags. */ @@ -429,9 +463,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly * limited to the subclasses we can represent via nesting. We need at least * 5 inodes nest depth for the ILOCK through rename, and we also have to support - * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP - * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all - * 8 subclasses supported by lockdep. + * XFS_ILOCK_PARENT, which gives 6 subclasses. That's 6 of the 8 subclasses + * supported by lockdep. 
* * This also means we have to number the sub-classes in the lowest bits of * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep @@ -457,8 +490,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) * ILOCK values * 0-4 subclass values * 5 PARENT subclass (not nestable) - * 6 RTBITMAP subclass (not nestable) - * 7 RTSUM subclass (not nestable) + * 6 unused + * 7 unused * */ #define XFS_IOLOCK_SHIFT 16 @@ -473,12 +506,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) -#define XFS_ILOCK_RTBITMAP_VAL 6u -#define XFS_ILOCK_RTSUM_VAL 7u #define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT) -#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT) #define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ @@ -620,9 +649,9 @@ void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( - struct xfs_inode *ip) + const struct xfs_inode *ip) { - return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); + return VFS_IC(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); } int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b509cbd191f4..912f0b1bc3cb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -556,7 +556,6 @@ xfs_inode_to_log_dinode( to->di_projid_lo = ip->i_projid & 0xffff; to->di_projid_hi = ip->i_projid >> 16; - memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); @@ -590,10 +589,16 @@ xfs_inode_to_log_dinode( /* dummy value for initialisation */ to->di_crc = 0; + + if (xfs_is_metadir_inode(ip)) + to->di_metatype = ip->i_metatype; + else + to->di_metatype = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); + to->di_metatype = 0; } xfs_inode_to_log_dinode_iext_counters(ip, to); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index dbdab4ce7c44..e70d2611456b 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -175,7 +175,7 @@ xfs_log_dinode_to_disk( to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from->di_version; to->di_format = from->di_format; - to->di_onlink = 0; + to->di_metatype = cpu_to_be16(from->di_metatype); to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a20d426ef021..f95103325318 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include "xfs_file.h" #include "xfs_exchrange.h" #include "xfs_handle.h" +#include "xfs_rtgroup.h" #include <linux/mount.h> #include <linux/fileattr.h> @@ -233,6 +234,10 @@ xfs_bulk_ireq_setup( if (hdr->flags & XFS_BULK_IREQ_NREXT64) breq->flags |= XFS_IBULK_NREXT64; + /* Caller wants to see metadata directories in bulkstat output. 
*/ + if (hdr->flags & XFS_BULK_IREQ_METADIR) + breq->flags |= XFS_IBULK_METADIR; + return 0; } @@ -323,6 +328,9 @@ xfs_ioc_inumbers( if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) return -EFAULT; + if (hdr.flags & XFS_BULK_IREQ_METADIR) + return -EINVAL; + error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); if (error == -ECANCELED) goto out_teardown; @@ -396,6 +404,38 @@ xfs_ioc_ag_geometry( return 0; } +STATIC int +xfs_ioc_rtgroup_geometry( + struct xfs_mount *mp, + void __user *arg) +{ + struct xfs_rtgroup *rtg; + struct xfs_rtgroup_geometry rgeo; + int error; + + if (copy_from_user(&rgeo, arg, sizeof(rgeo))) + return -EFAULT; + if (rgeo.rg_flags) + return -EINVAL; + if (memchr_inv(&rgeo.rg_reserved, 0, sizeof(rgeo.rg_reserved))) + return -EINVAL; + if (!xfs_has_rtgroups(mp)) + return -EINVAL; + + rtg = xfs_rtgroup_get(mp, rgeo.rg_number); + if (!rtg) + return -EINVAL; + + error = xfs_rtgroup_get_geometry(rtg, &rgeo); + xfs_rtgroup_put(rtg); + if (error) + return error; + + if (copy_to_user(arg, &rgeo, sizeof(rgeo))) + return -EFAULT; + return 0; +} + /* * Linux extended inode flags interface. */ @@ -481,7 +521,7 @@ xfs_ioctl_setattr_xflags( if (rtflag != XFS_IS_REALTIME_INODE(ip)) { /* Can't change realtime flag if any extents are allocated. */ - if (ip->i_df.if_nextents || ip->i_delayed_blks) + if (xfs_inode_has_filedata(ip)) return -EINVAL; /* @@ -602,7 +642,7 @@ xfs_ioctl_setattr_check_extsize( if (!fa->fsx_valid) return 0; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; @@ -881,41 +921,29 @@ xfs_ioc_swapext( xfs_swapext_t *sxp) { xfs_inode_t *ip, *tip; - struct fd f, tmp; - int error = 0; /* Pull information for the target fd */ - f = fdget((int)sxp->sx_fdtarget); - if (!fd_file(f)) { - error = -EINVAL; - goto out; - } + CLASS(fd, f)((int)sxp->sx_fdtarget); + if (fd_empty(f)) + return -EINVAL; if (!(fd_file(f)->f_mode & FMODE_WRITE) || !(fd_file(f)->f_mode & FMODE_READ) || - (fd_file(f)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_file; - } + (fd_file(f)->f_flags & O_APPEND)) + return -EBADF; - tmp = fdget((int)sxp->sx_fdtmp); - if (!fd_file(tmp)) { - error = -EINVAL; - goto out_put_file; - } + CLASS(fd, tmp)((int)sxp->sx_fdtmp); + if (fd_empty(tmp)) + return -EINVAL; if (!(fd_file(tmp)->f_mode & FMODE_WRITE) || !(fd_file(tmp)->f_mode & FMODE_READ) || - (fd_file(tmp)->f_flags & O_APPEND)) { - error = -EBADF; - goto out_put_tmp_file; - } + (fd_file(tmp)->f_flags & O_APPEND)) + return -EBADF; if (IS_SWAPFILE(file_inode(fd_file(f))) || - IS_SWAPFILE(file_inode(fd_file(tmp)))) { - error = -EINVAL; - goto out_put_tmp_file; - } + IS_SWAPFILE(file_inode(fd_file(tmp)))) + return -EINVAL; /* * We need to ensure that the fds passed in point to XFS inodes @@ -923,37 +951,22 @@ xfs_ioc_swapext( * control over what the user passes us here. 
*/ if (fd_file(f)->f_op != &xfs_file_operations || - fd_file(tmp)->f_op != &xfs_file_operations) { - error = -EINVAL; - goto out_put_tmp_file; - } + fd_file(tmp)->f_op != &xfs_file_operations) + return -EINVAL; ip = XFS_I(file_inode(fd_file(f))); tip = XFS_I(file_inode(fd_file(tmp))); - if (ip->i_mount != tip->i_mount) { - error = -EINVAL; - goto out_put_tmp_file; - } - - if (ip->i_ino == tip->i_ino) { - error = -EINVAL; - goto out_put_tmp_file; - } + if (ip->i_mount != tip->i_mount) + return -EINVAL; - if (xfs_is_shutdown(ip->i_mount)) { - error = -EIO; - goto out_put_tmp_file; - } + if (ip->i_ino == tip->i_ino) + return -EINVAL; - error = xfs_swap_extents(ip, tip, sxp); + if (xfs_is_shutdown(ip->i_mount)) + return -EIO; - out_put_tmp_file: - fdput(tmp); - out_put_file: - fdput(f); - out: - return error; + return xfs_swap_extents(ip, tip, sxp); } static int @@ -1021,7 +1034,7 @@ xfs_ioc_setlabel( * buffered reads from userspace (i.e. from blkid) are invalidated, * and userspace will see the newly-written label. */ - error = xfs_sync_sb_buf(mp); + error = xfs_sync_sb_buf(mp, true); if (error) goto out; /* @@ -1032,6 +1045,8 @@ xfs_ioc_setlabel( mutex_unlock(&mp->m_growlock); invalidate_bdev(mp->m_ddev_targp->bt_bdev); + if (xfs_has_rtsb(mp) && mp->m_rtdev_targp) + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); out: mnt_drop_write_file(filp); @@ -1189,7 +1204,16 @@ xfs_file_ioctl( struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; - da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = target->bt_logical_sectorsize; + + /* + * See xfs_report_dioalign() for an explanation about why this + * reports a value larger than the sector size for COW inodes. + */ + if (xfs_is_cow_inode(ip)) + da.d_miniosz = xfs_inode_alloc_unitsize(ip); + else + da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) @@ -1216,6 +1240,8 @@ xfs_file_ioctl( case XFS_IOC_AG_GEOMETRY: return xfs_ioc_ag_geometry(mp, arg); + case XFS_IOC_RTGROUP_GEOMETRY: + return xfs_ioc_rtgroup_geometry(mp, arg); case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1e11f48814c0..50fa3ef89f6c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -24,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_quota.h" +#include "xfs_rtgroup.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" @@ -115,7 +116,9 @@ xfs_bmbt_to_iomap( iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { - iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); + xfs_daddr_t daddr = xfs_fsb_to_db(ip, imap->br_startblock); + + iomap->addr = BBTOB(daddr); if (mapping_flags & IOMAP_DAX) iomap->addr += target->bt_dax_part_off; @@ -124,6 +127,14 @@ xfs_bmbt_to_iomap( else iomap->type = IOMAP_MAPPED; + /* + * Mark iomaps starting at the first sector of a RTG as merge + * boundary so that each I/O completions is contained to a + * single RTG. 
+ */ + if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(mp) && + xfs_rtbno_is_group_start(mp, imap->br_startblock)) + iomap->flags |= IOMAP_F_BOUNDARY; } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); @@ -342,16 +353,26 @@ xfs_quota_need_throttle( xfs_fsblock_t alloc_blocks) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; if (!dq || !xfs_this_quota_on(ip->i_mount, type)) return false; + if (XFS_IS_REALTIME_INODE(ip)) { + res = &dq->q_rtb; + pre = &dq->q_rtb_prealloc; + } else { + res = &dq->q_blk; + pre = &dq->q_blk_prealloc; + } + /* no hi watermark, no throttle */ - if (!dq->q_prealloc_hi_wmark) + if (!pre->q_prealloc_hi_wmark) return false; /* under the lo watermark, no throttle */ - if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) + if (res->reserved + alloc_blocks < pre->q_prealloc_lo_wmark) return false; return true; @@ -366,22 +387,35 @@ xfs_quota_calc_throttle( int64_t *qfreesp) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; int64_t freesp; int shift = 0; + if (!dq) { + res = NULL; + pre = NULL; + } else if (XFS_IS_REALTIME_INODE(ip)) { + res = &dq->q_rtb; + pre = &dq->q_rtb_prealloc; + } else { + res = &dq->q_blk; + pre = &dq->q_blk_prealloc; + } + /* no dq, or over hi wmark, squash the prealloc completely */ - if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { + if (!res || res->reserved >= pre->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } - freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; - if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + freesp = pre->q_prealloc_hi_wmark - res->reserved; + if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; - if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + if (freesp < pre->q_low_space[XFS_QLOWSP_3_PCNT]) shift += 2; - if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + if (freesp < pre->q_low_space[XFS_QLOWSP_1_PCNT]) shift += 2; } @@ -501,8 +535,8 @@ xfs_iomap_prealloc_size( alloc_blocks); if (unlikely(XFS_IS_REALTIME_INODE(ip))) - freesp = xfs_rtx_to_rtb(mp, - xfs_iomap_freesp(&mp->m_frextents, + freesp = xfs_rtbxlen_to_blen(mp, + xfs_iomap_freesp(&mp->m_frextents, mp->m_low_rtexts, &shift)); else freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, @@ -707,7 +741,7 @@ imap_needs_cow( return false; /* when zeroing we don't have to COW holes or unwritten extents */ - if (flags & IOMAP_ZERO) { + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_state == XFS_EXT_UNWRITTEN) @@ -975,6 +1009,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; + unsigned int iomap_flags = 0; u64 seq; if (xfs_is_shutdown(mp)) @@ -1145,6 +1180,11 @@ xfs_buffered_write_iomap_begin( } } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap_flags |= IOMAP_F_NEW; if (allocfork == XFS_COW_FORK) { error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, end_fsb - offset_fsb, prealloc_blocks, &cmap, @@ -1162,19 +1202,11 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - /* - * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch - * them out if the write happens to fail. 
- */ - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); - found_imap: - seq = xfs_iomap_inode_sequence(ip, 0); + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); convert_delay: xfs_iunlock(ip, lockmode); @@ -1188,20 +1220,20 @@ convert_delay: return 0; found_cow: - seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); if (error) goto out_unlock; - seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, - IOMAP_F_SHARED, seq); + } else { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); } - xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + iomap_flags |= IOMAP_F_SHARED; + seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); - return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq); out_unlock: xfs_iunlock(ip, lockmode); @@ -1215,7 +1247,10 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { - xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); + xfs_bmap_punch_delalloc_range(XFS_I(inode), + (iomap->flags & IOMAP_F_SHARED) ? + XFS_COW_FORK : XFS_DATA_FORK, + offset, offset + length); } static int @@ -1227,8 +1262,38 @@ xfs_buffered_write_iomap_end( unsigned flags, struct iomap *iomap) { - iomap_file_buffered_write_punch_delalloc(inode, offset, length, written, - flags, iomap, &xfs_buffered_write_delalloc_punch); + loff_t start_byte, end_byte; + + /* If we didn't reserve the blocks, we're not allowed to punch them. */ + if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW)) + return 0; + + /* + * iomap_page_mkwrite() will never fail in a way that requires delalloc + * extents that it allocated to be revoked. Hence never try to release + * them here. + */ + if (flags & IOMAP_FAULT) + return 0; + + /* Nothing to do if we've written the entire delalloc extent */ + start_byte = iomap_last_written_block(inode, offset, written); + end_byte = round_up(offset + length, i_blocksize(inode)); + if (start_byte >= end_byte) + return 0; + + /* For zeroing operations the callers already hold invalidate_lock. */ + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { + rwsem_assert_held_write(&inode->i_mapping->invalidate_lock); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + } else { + filemap_invalidate_lock(inode->i_mapping); + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, + iomap, xfs_buffered_write_delalloc_punch); + filemap_invalidate_unlock(inode->i_mapping); + } + return 0; } @@ -1237,15 +1302,6 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_end = xfs_buffered_write_iomap_end, }; -/* - * iomap_page_mkwrite() will never fail in a way that requires delalloc extents - * that it allocated to be revoked. Hence we do not need an .iomap_end method - * for this operation. 
- */ -const struct iomap_ops xfs_page_mkwrite_iomap_ops = { - .iomap_begin = xfs_buffered_write_iomap_begin, -}; - static int xfs_read_iomap_begin( struct inode *inode, @@ -1435,6 +1491,8 @@ xfs_zero_range( { struct inode *inode = VFS_I(ip); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 4da13440bae9..8347268af727 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -48,7 +48,6 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_buffered_write_iomap_ops; -extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..40289fe6f5b2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -42,7 +42,9 @@ * held. For regular files, the lock order is the other way around - the * mmap_lock is taken during the page fault, and then we lock the ilock to do * block mapping. Hence we need a different class for the directory ilock so - * that lockdep can tell them apart. + * that lockdep can tell them apart. Directories in the metadata directory + * tree get a separate class so that lockdep reports will warn us if someone + * ever tries to lock regular directories after locking metadata directories. */ static struct lock_class_key xfs_nondir_ilock_class; static struct lock_class_key xfs_dir_ilock_class; @@ -570,6 +572,46 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_report_dioalign( + struct xfs_inode *ip, + struct kstat *stat) +{ + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + + /* + * For COW inodes, we can only perform out of place writes of entire + * allocation units (blocks or RT extents). + * For writes smaller than the allocation unit, we must fall back to + * buffered I/O to perform read-modify-write cycles. At best this is + * highly inefficient; at worst it leads to page cache invalidation + * races. Tell applications to avoid this by reporting the larger write + * alignment in dio_offset_align, and the smaller read alignment in + * dio_read_offset_align. 
+ */ + stat->dio_read_offset_align = bdev_logical_block_size(bdev); + if (xfs_is_cow_inode(ip)) + stat->dio_offset_align = xfs_inode_alloc_unitsize(ip); + else + stat->dio_offset_align = stat->dio_read_offset_align; +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + unsigned int unit_min = 0, unit_max = 0; + + if (xfs_inode_can_atomicwrite(ip)) + unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; + generic_fill_statx_atomic_writes(stat, unit_min, unit_max); +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -597,8 +639,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +651,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -635,14 +673,10 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - struct block_device *bdev = target->bt_bdev; - - stat->result_mask |= STATX_DIOALIGN; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - } + if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); @@ -1289,6 +1323,7 @@ xfs_setup_inode( { struct inode *inode = &ip->i_vnode; gfp_t gfp_mask; + bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; inode->i_state |= I_NEW; @@ -1300,6 +1335,16 @@ xfs_setup_inode( i_size_write(inode, ip->i_disk_size); xfs_diflags_to_iflags(ip, true); + /* + * Mark our metadata files as private so that LSMs and the ACL code + * don't try to add their own metadata or reason about these files, + * and users cannot ever obtain file handles to them. + */ + if (is_meta) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + if (S_ISDIR(inode->i_mode)) { /* * We set the i_rwsem class here to avoid potential races with diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c0757ab99495..1fa1c0564b0c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -36,6 +36,14 @@ struct xfs_bstat_chunk { struct xfs_bulkstat *buf; }; +static inline bool +want_metadir_file( + struct xfs_inode *ip, + struct xfs_ibulk *breq) +{ + return xfs_is_metadir_inode(ip) && (breq->flags & XFS_IBULK_METADIR); +} + /* * Fill out the bulkstat info for a single inode and report it somewhere. * @@ -69,9 +77,6 @@ xfs_bulkstat_one_int( vfsuid_t vfsuid; vfsgid_t vfsgid; - if (xfs_internal_inum(mp, ino)) - goto out_advance; - error = xfs_iget(mp, tp, ino, (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), XFS_ILOCK_SHARED, &ip); @@ -97,8 +102,28 @@ xfs_bulkstat_one_int( vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + /* + * If caller wants files from the metadata directories, push out the + * bare minimum information for enabling scrub. 
+ */ + if (want_metadir_file(ip, bc->breq)) { + memset(buf, 0, sizeof(*buf)); + buf->bs_ino = ino; + buf->bs_gen = inode->i_generation; + buf->bs_mode = inode->i_mode & S_IFMT; + xfs_bulkstat_health(ip, buf); + buf->bs_version = XFS_BULKSTAT_VERSION_V5; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_irele(ip); + + error = bc->formatter(bc->breq, buf); + if (!error || error == -ECANCELED) + goto out_advance; + goto out; + } + /* If this is a private inode, don't leak its details to userspace. */ - if (IS_PRIVATE(inode)) { + if (IS_PRIVATE(inode) || xfs_is_sb_inum(mp, ino)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); error = -EINVAL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1659f13f17a8..f10e8f8f2335 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -22,6 +22,9 @@ struct xfs_ibulk { /* Fill out the bs_extents64 field if set. */ #define XFS_IBULK_NREXT64 (1U << 1) +/* Signal that we can return metadata directories. */ +#define XFS_IBULK_METADIR (1U << 2) + /* * Advance the user buffer pointer by one record of the given size. If the * buffer is now full, return the appropriate error code. diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c index 2ddccb172fa0..1fd70a7aed63 100644 --- a/fs/xfs/xfs_iunlink_item.c +++ b/fs/xfs/xfs_iunlink_item.c @@ -52,14 +52,14 @@ xfs_iunlink_log_dinode( struct xfs_trans *tp, struct xfs_iunlink_item *iup) { - struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip = iup->ip; struct xfs_dinode *dip; struct xfs_buf *ibp; + xfs_agino_t old_ptr; int offset; int error; - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); + error = xfs_imap_to_bp(tp->t_mountp, tp, &ip->i_imap, &ibp); if (error) return error; /* @@ -73,22 +73,21 @@ xfs_iunlink_log_dinode( dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); /* Make sure the old pointer isn't garbage. 
*/ - if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) { + old_ptr = be32_to_cpu(dip->di_next_unlinked); + if (old_ptr != iup->old_agino) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } - trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno, - XFS_INO_TO_AGINO(mp, ip->i_ino), - be32_to_cpu(dip->di_next_unlinked), iup->next_agino); + trace_xfs_iunlink_update_dinode(iup, old_ptr); dip->di_next_unlinked = cpu_to_be32(iup->next_agino); offset = ip->i_imap.im_boffset + offsetof(struct xfs_dinode, di_next_unlinked); - xfs_dinode_calc_crc(mp, dip); + xfs_dinode_calc_crc(tp->t_mountp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); return 0; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 86f14ec7c31f..7db3ece370b1 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -100,7 +100,6 @@ xfs_iwalk_ichunk_ra( struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_agnumber_t agno = pag->pag_agno; xfs_agblock_t agbno; struct blk_plug plug; int i; /* inode chunk index */ @@ -114,7 +113,7 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { xfs_buf_readahead(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, agbno), + xfs_agbno_to_daddr(pag, agbno), igeo->blocks_per_cluster * mp->m_bsize, &xfs_inode_buf_ops); } @@ -177,20 +176,19 @@ xfs_iwalk_ag_recs( struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; - xfs_ino_t ino; unsigned int i, j; int error; for (i = 0; i < iwag->nr_recs; i++) { struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; - trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec); + trace_xfs_iwalk_ag_rec(pag, irec); if (xfs_pwork_want_abort(&iwag->pwork)) return 0; if (iwag->inobt_walk_fn) { - error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec, + error = iwag->inobt_walk_fn(mp, tp, pag_agno(pag), irec, iwag->data); if (error) return error; @@ -208,9 +206,10 @@ xfs_iwalk_ag_recs( continue; /* Otherwise call our function. */ - ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, - irec->ir_startino + j); - error = iwag->iwalk_fn(mp, tp, ino, iwag->data); + error = iwag->iwalk_fn(mp, tp, + xfs_agino_to_ino(pag, + irec->ir_startino + j), + iwag->data); if (error) return error; } @@ -305,7 +304,7 @@ xfs_iwalk_ag_start( return -EFSCORRUPTED; } - iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + iwag->lastino = xfs_agino_to_ino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); /* @@ -406,7 +405,7 @@ xfs_iwalk_ag( int error = 0; /* Set up our cursor at the right place in the inode btree. */ - ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino)); + ASSERT(pag_agno(pag) == XFS_INO_TO_AGNO(mp, iwag->startino)); agino = XFS_INO_TO_AGINO(mp, iwag->startino); error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); @@ -425,7 +424,7 @@ xfs_iwalk_ag( break; /* Make sure that we always move forward. 
*/ - rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); + rec_fsino = xfs_agino_to_ino(pag, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { xfs_btree_mark_sick(cur); @@ -535,6 +534,37 @@ xfs_iwalk_prefetch( return max(inobt_records, 2U); } +static int +xfs_iwalk_args( + struct xfs_iwalk_ag *iwag, + unsigned int flags) +{ + struct xfs_mount *mp = iwag->mp; + xfs_agnumber_t start_agno; + int error; + + start_agno = XFS_INO_TO_AGNO(iwag->mp, iwag->startino); + ASSERT(start_agno < iwag->mp->m_sb.sb_agcount); + ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); + + error = xfs_iwalk_alloc(iwag); + if (error) + return error; + + while ((iwag->pag = xfs_perag_next_from(mp, iwag->pag, start_agno))) { + error = xfs_iwalk_ag(iwag); + if (error || (flags & XFS_IWALK_SAME_AG)) { + xfs_perag_rele(iwag->pag); + break; + } + iwag->startino = + XFS_AGINO_TO_INO(mp, pag_agno(iwag->pag) + 1, 0); + } + + xfs_iwalk_free(iwag); + return error; +} + /* * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn * will be called for each allocated inode, being passed the inode's number and @@ -563,32 +593,8 @@ xfs_iwalk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - int error; - - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); - - error = xfs_iwalk_alloc(&iwag); - if (error) - return error; - - for_each_perag_from(mp, agno, pag) { - iwag.pag = pag; - error = xfs_iwalk_ag(&iwag); - if (error) - break; - iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) - break; - iwag.pag = NULL; - } - if (iwag.pag) - xfs_perag_rele(pag); - xfs_iwalk_free(&iwag); - return error; + return xfs_iwalk_args(&iwag, flags); } /* Run per-thread iwalk work. 
*/ @@ -640,19 +646,19 @@ xfs_iwalk_threaded( bool polled, void *data) { + xfs_agnumber_t start_agno = XFS_INO_TO_AGNO(mp, startino); struct xfs_pwork_ctl pctl; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + struct xfs_perag *pag = NULL; int error; - ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(start_agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error; - for_each_perag_from(mp, agno, pag) { + while ((pag = xfs_perag_next_from(mp, pag, start_agno))) { struct xfs_iwalk_ag *iwag; if (xfs_pwork_ctl_want_abort(&pctl)) @@ -673,8 +679,8 @@ xfs_iwalk_threaded( iwag->sz_recs = xfs_iwalk_prefetch(inode_records); iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); - startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) + startino = XFS_AGINO_TO_INO(mp, pag_agno(pag) + 1, 0); + if (flags & XFS_IWALK_SAME_AG) break; } if (pag) @@ -748,30 +754,6 @@ xfs_inobt_walk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; - struct xfs_perag *pag; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - int error; - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL)); - - error = xfs_iwalk_alloc(&iwag); - if (error) - return error; - - for_each_perag_from(mp, agno, pag) { - iwag.pag = pag; - error = xfs_iwalk_ag(&iwag); - if (error) - break; - iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); - if (flags & XFS_INOBT_WALK_SAME_AG) - break; - iwag.pag = NULL; - } - - if (iwag.pag) - xfs_perag_rele(pag); - xfs_iwalk_free(&iwag); - return error; + return xfs_iwalk_args(&iwag, flags); } diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 83699089755e..17a5a2c6debb 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -25,7 +25,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, unsigned int flags, xfs_iwalk_fn iwalk_fn, unsigned int inode_records, bool poll, void *data); -/* Only iterate inodes within the same AG as @startino. */ +/* Only iterate within the same AG as @startino. */ #define XFS_IWALK_SAME_AG (1U << 0) #define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) @@ -41,9 +41,4 @@ int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records, void *data); -/* Only iterate inobt records within the same AG as @startino. */ -#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG) - -#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG) - #endif /* __XFS_IWALK_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 26b2f5887b88..05daad8a8d34 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3456,6 +3456,16 @@ xlog_force_shutdown( return false; /* + * Ensure that there is only ever one log shutdown being processed. + * If we allow the log force below on a second pass after shutting + * down the log, we risk deadlocking the CIL push as it may require + * locks on objects the current shutdown context holds (e.g. taking + * buffer locks to abort buffers on last unpin of buf log items). + */ + if (test_and_set_bit(XLOG_SHUTDOWN_STARTED, &log->l_opstate)) + return false; + + /* * Flush all the completed transactions to disk before marking the log * being shut down. 
We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs @@ -3487,6 +3497,7 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); + ASSERT(0); return false; } spin_unlock(&log->l_icloglock); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 67c539cc9305..13455854365f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -158,6 +158,4 @@ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags); -int xfs_attr_use_log_assist(struct xfs_mount *mp); - #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 391a938d690c..1ca406ec1b40 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -156,7 +156,6 @@ xlog_cil_insert_pcp_aggregate( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { - struct xlog_cil_pcp *cilpcp; int cpu; int count = 0; @@ -171,14 +170,9 @@ xlog_cil_insert_pcp_aggregate( * structures that could have a nonzero space_used. */ for_each_cpu(cpu, &ctx->cil_pcpmask) { - int old, prev; + struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - do { - old = cilpcp->space_used; - prev = cmpxchg(&cilpcp->space_used, old, 0); - } while (old != prev); - count += old; + count += xchg(&cilpcp->space_used, 0); } atomic_add(count, &ctx->space_used); } @@ -910,7 +904,7 @@ xlog_cil_committed( xlog_cil_ail_insert(ctx, abort); xfs_extent_busy_sort(&ctx->busy_extents.extent_list); - xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, + xfs_extent_busy_clear(&ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -920,7 +914,6 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents.extent_list)) { - ctx->busy_extents.mount = mp; ctx->busy_extents.owner = ctx; xfs_discard_extents(mp, &ctx->busy_extents); return; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b8778a4fd6b6..f3d78869e5e5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -458,6 +458,7 @@ struct xlog { #define XLOG_IO_ERROR 2 /* log hit an I/O error, and being shutdown */ #define XLOG_TAIL_WARN 3 /* log tail verify warning issued */ +#define XLOG_SHUTDOWN_STARTED 4 /* xlog_force_shutdown() exclusion */ static inline bool xlog_recovery_needed(struct xlog *log) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ec766b4bc853..0af3d477197b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1818,6 +1818,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { &xlog_attrd_item_ops, &xlog_xmi_item_ops, &xlog_xmd_item_ops, + &xlog_rtefi_item_ops, + &xlog_rtefd_item_ops, }; static const struct xlog_recover_item_ops * @@ -1849,7 +1851,7 @@ xlog_find_item_ops( * from the transaction. However, we can't do that until after we've * replayed all the other items because they may be dependent on the * cancelled buffer and replaying the cancelled buffer can remove it - * form the cancelled buffer table. Hence they have tobe done last. + * form the cancelled buffer table. Hence they have to be done last. * * 3. Inode allocation buffers must be replayed before inode items that * read the buffer and replay changes into it. 
For filesystems using the @@ -2677,7 +2679,7 @@ xlog_recover_clear_agi_bucket( struct xfs_perag *pag, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_trans *tp; struct xfs_agi *agi; struct xfs_buf *agibp; @@ -2708,7 +2710,7 @@ out_abort: xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, - pag->pag_agno); + pag_agno(pag)); return; } @@ -2718,7 +2720,7 @@ xlog_recover_iunlink_bucket( struct xfs_agi *agi, int bucket) { - struct xfs_mount *mp = pag->pag_mount; + struct xfs_mount *mp = pag_mount(pag); struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; xfs_agino_t prev_agino, agino; @@ -2726,9 +2728,8 @@ xlog_recover_iunlink_bucket( agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - error = xfs_iget(mp, NULL, - XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), - 0, 0, &ip); + error = xfs_iget(mp, NULL, xfs_agino_to_ino(pag, agino), 0, 0, + &ip); if (error) break; @@ -2846,10 +2847,9 @@ static void xlog_recover_process_iunlinks( struct xlog *log) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; - for_each_perag(log->l_mp, agno, pag) + while ((pag = xfs_perag_next(log->l_mp, pag))) xlog_recover_iunlink_ag(pag); } @@ -3393,13 +3393,6 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, - &mp->m_maxagi); - if (error) { - xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); - return error; - } - mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 8f495cc23903..6ed485ff2756 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -131,3 +131,54 @@ xfs_buf_alert_ratelimited( __xfs_printk(KERN_ALERT, mp, &vaf); va_end(args); } + +void +xfs_warn_experimental( + struct xfs_mount *mp, + enum xfs_experimental_feat feat) +{ + static const struct { + const char *name; + long opstate; + } features[] = { + [XFS_EXPERIMENTAL_PNFS] = { + .opstate = XFS_OPSTATE_WARNED_PNFS, + .name = "pNFS", + }, + [XFS_EXPERIMENTAL_SCRUB] = { + .opstate = XFS_OPSTATE_WARNED_SCRUB, + .name = "online scrub", + }, + [XFS_EXPERIMENTAL_SHRINK] = { + .opstate = XFS_OPSTATE_WARNED_SHRINK, + .name = "online shrink", + }, + [XFS_EXPERIMENTAL_LARP] = { + .opstate = XFS_OPSTATE_WARNED_LARP, + .name = "logged extended attributes", + }, + [XFS_EXPERIMENTAL_LBS] = { + .opstate = XFS_OPSTATE_WARNED_LBS, + .name = "large block size", + }, + [XFS_EXPERIMENTAL_EXCHRANGE] = { + .opstate = XFS_OPSTATE_WARNED_EXCHRANGE, + .name = "exchange range", + }, + [XFS_EXPERIMENTAL_PPTR] = { + .opstate = XFS_OPSTATE_WARNED_PPTR, + .name = "parent pointer", + }, + [XFS_EXPERIMENTAL_METADIR] = { + .opstate = XFS_OPSTATE_WARNED_METADIR, + .name = "metadata directory tree", + }, + }; + ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); + BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); + + if (xfs_should_warn(mp, features[feat].opstate)) + xfs_warn(mp, + "EXPERIMENTAL %s feature enabled. 
Use at your own risk!", + features[feat].name); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index cc323775a12c..7fb36ced9df7 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -75,12 +75,6 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) -#define xfs_warn_mount(mp, warntag, fmt, ...) \ -do { \ - if (xfs_should_warn((mp), (warntag))) \ - xfs_warn((mp), (fmt), ##__VA_ARGS__); \ -} while (0) - #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ @@ -96,4 +90,18 @@ extern void xfs_hex_dump(const void *p, int length); void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, const char *fmt, ...); +enum xfs_experimental_feat { + XFS_EXPERIMENTAL_PNFS, + XFS_EXPERIMENTAL_SCRUB, + XFS_EXPERIMENTAL_SHRINK, + XFS_EXPERIMENTAL_LARP, + XFS_EXPERIMENTAL_LBS, + XFS_EXPERIMENTAL_EXCHRANGE, + XFS_EXPERIMENTAL_PPTR, + XFS_EXPERIMENTAL_METADIR, + + XFS_EXPERIMENTAL_MAX, +}; +void xfs_warn_experimental(struct xfs_mount *mp, enum xfs_experimental_feat f); + #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1fdd79c5bfa0..5918f433dba7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -35,6 +35,8 @@ #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -620,6 +622,22 @@ xfs_mount_setup_inode_geom( xfs_ialloc_setup_geometry(mp); } +/* Mount the metadata directory tree root. */ +STATIC int +xfs_mount_setup_metadir( + struct xfs_mount *mp) +{ + int error; + + /* Load the metadata directory root inode into memory. */ + error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR, + &mp->m_metadirip); + if (error) + xfs_warn(mp, "Failed to load metadir root directory, error %d", + error); + return error; +} + /* Compute maximum possible height for per-AG btree types for this fs. */ static inline void xfs_agbtree_compute_maxlevels( @@ -810,17 +828,24 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, - &mp->m_maxagi); + error = xfs_initialize_perag(mp, 0, sbp->sb_agcount, + mp->m_sb.sb_dblocks, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; } + error = xfs_initialize_rtgroups(mp, 0, sbp->sb_rgcount, + mp->m_sb.sb_rextents); + if (error) { + xfs_warn(mp, "Failed rtgroup init: %d", error); + goto out_free_perag; + } + if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); error = -EFSCORRUPTED; - goto out_free_perag; + goto out_free_rtgroup; } error = xfs_inodegc_register_shrinker(mp); @@ -828,6 +853,13 @@ xfs_mountfs( goto out_fail_wait; /* + * If we're resuming quota status, pick up the preliminary qflags from + * the ondisk superblock so that we know if we should recover dquots. + */ + if (xfs_is_resuming_quotaon(mp)) + xfs_qm_resume_quotaon(mp); + + /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or * cancelled. @@ -841,6 +873,14 @@ xfs_mountfs( } /* + * If we're resuming quota status and recovered the log, re-sample the + * qflags from the ondisk superblock now that we've recovered it, just + * in case someone shut down enforcement just before a crash. 
+ */ + if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(mp->m_log)) + xfs_qm_resume_quotaon(mp); + + /* * If logged xattrs are still enabled after log recovery finishes, then * they'll be available until unmount. Otherwise, turn them off. */ @@ -866,6 +906,12 @@ xfs_mountfs( mp->m_features |= XFS_FEAT_ATTR2; } + if (xfs_has_metadir(mp)) { + error = xfs_mount_setup_metadir(mp); + if (error) + goto out_free_metadir; + } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -876,7 +922,7 @@ xfs_mountfs( xfs_warn(mp, "Failed to read root inode 0x%llx, error %d", sbp->sb_rootino, -error); - goto out_log_dealloc; + goto out_free_metadir; } ASSERT(rip != NULL); @@ -1018,6 +1064,9 @@ xfs_mountfs( xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + out_free_metadir: + if (mp->m_metadirip) + xfs_irele(mp->m_metadirip); /* * Inactivate all inodes that might still be in memory after a log @@ -1039,7 +1088,6 @@ xfs_mountfs( * quota inodes. */ xfs_unmount_flush_inodes(mp); - out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: shrinker_free(mp->m_inodegc_shrinker); @@ -1047,8 +1095,10 @@ xfs_mountfs( if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); xfs_buftarg_drain(mp->m_ddev_targp); + out_free_rtgroup: + xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); out_free_perag: - xfs_free_perag(mp); + xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); out_free_dir: xfs_da_unmount(mp); out_remove_uuid: @@ -1091,6 +1141,8 @@ xfs_unmountfs( xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); + if (mp->m_metadirip) + xfs_irele(mp->m_metadirip); xfs_unmount_flush_inodes(mp); @@ -1129,8 +1181,8 @@ xfs_unmountfs( xfs_errortag_clearall(mp); #endif shrinker_free(mp->m_inodegc_shrinker); - xfs_free_perag(mp); - + xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); + xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); @@ -1437,7 +1489,7 @@ xfs_mod_delalloc( if (XFS_IS_REALTIME_INODE(ip)) { percpu_counter_add_batch(&mp->m_delalloc_rtextents, - xfs_rtb_to_rtx(mp, data_delta), + xfs_blen_to_rtbxlen(mp, data_delta), XFS_DELALLOC_BATCH); if (!ind_delta) return; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 96496f39f551..db9dade7d22a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -72,6 +72,40 @@ struct xfs_inodegc { }; /* + * Container for each type of groups, used to look up individual groups and + * describes the geometry. + */ +struct xfs_groups { + struct xarray xa; + + /* + * Maximum capacity of the group in FSBs. + * + * Each group is laid out densely in the daddr space. For the + * degenerate case of a pre-rtgroups filesystem, the incore rtgroup + * pretends to have a zero-block and zero-blklog rtgroup. + */ + uint32_t blocks; + + /* + * Log(2) of the logical size of each group. + * + * Compared to the blocks field above this is rounded up to the next + * power of two, and thus lays out the xfs_fsblock_t/xfs_rtblock_t + * space sparsely with a hole from blocks to (1 << blklog) at the end + * of each group. + */ + uint8_t blklog; + + /* + * Mask to extract the group-relative block number from a FSB. + * For a pre-rtgroups filesystem we pretend to have one very large + * rtgroup, so this mask must be 64-bit. 
+ */ + uint64_t blkmask; +}; + +/* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. @@ -85,27 +119,20 @@ typedef struct xfs_mount { struct super_block *m_super; struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ + struct xfs_buf *m_rtsb_bp; /* realtime superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ - struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ - struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ + struct xfs_inode *m_metadirip; /* ptr to metadata directory */ + struct xfs_inode *m_rtdirip; /* ptr to realtime metadir */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ struct xfs_buftarg *m_ddev_targp; /* data device */ struct xfs_buftarg *m_logdev_targp;/* log device */ struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ - - /* - * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] > the maximum i for which - * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i. - * Reads and writes are serialized by the rsumip inode lock. - */ - uint8_t *m_rsum_cache; struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; @@ -120,9 +147,11 @@ typedef struct xfs_mount { uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ int8_t m_rtxblklog; /* log2 of rextsize, if possible */ + uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ - uint m_blockwmask; /* blockwsize-1 */ + /* number of rt extents per rt bitmap block if rtgroups enabled */ + unsigned int m_rtx_per_rbmblock; uint m_alloc_mxr[2]; /* max alloc btree records */ uint m_alloc_mnr[2]; /* min alloc btree records */ uint m_bmap_dmxr[2]; /* max bmap btree records */ @@ -146,7 +175,7 @@ typedef struct xfs_mount { uint m_allocsize_blocks; /* min write size blocks */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ - uint m_rsumlevels; /* rt summary levels */ + unsigned int m_rsumlevels; /* rt summary levels */ xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ @@ -208,7 +237,7 @@ typedef struct xfs_mount { */ atomic64_t m_allocbt_blks; - struct xarray m_perags; /* per-ag accounting info */ + struct xfs_groups m_groups[XG_TYPE_MAX]; uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ @@ -224,6 +253,7 @@ typedef struct xfs_mount { #endif xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ + atomic_t m_rtgrotor; /* last rtgroup rtpicked */ /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; @@ -298,6 +328,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ 
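/*
 * Editor's sketch, not part of the patch above: a minimal illustration of
 * how the blklog and blkmask fields of the new struct xfs_groups are meant
 * to be used, per the field comments earlier in this hunk.  Groups are laid
 * out at power-of-two boundaries, so shifting a filesystem block number
 * right by blklog yields the group index, and masking with blkmask yields
 * the group-relative block.  The helper names below are hypothetical.
 */
static inline uint32_t
example_fsb_to_gno(const struct xfs_groups *g, uint64_t fsbno)
{
	return fsbno >> g->blklog;	/* which group */
}

static inline uint64_t
example_fsb_to_gbno(const struct xfs_groups *g, uint64_t fsbno)
{
	return fsbno & g->blkmask;	/* block within the group */
}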
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ +#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ /* Mount features */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ @@ -353,6 +384,19 @@ __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) +__XFS_HAS_FEAT(metadir, METADIR) + +static inline bool xfs_has_rtgroups(struct xfs_mount *mp) +{ + /* all metadir file systems also allow rtgroups */ + return xfs_has_metadir(mp); +} + +static inline bool xfs_has_rtsb(struct xfs_mount *mp) +{ + /* all rtgroups filesystems with an rt section have an rtsb */ + return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); +} /* * Some features are always on for v5 file systems, allow the compiler to @@ -433,18 +477,30 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about pNFS being used on this fs. */ +#define XFS_OPSTATE_WARNED_PNFS 7 /* Kernel has logged a warning about online fsck being used on this fs. */ -#define XFS_OPSTATE_WARNED_SCRUB 7 +#define XFS_OPSTATE_WARNED_SCRUB 8 /* Kernel has logged a warning about shrink being used on this fs. */ -#define XFS_OPSTATE_WARNED_SHRINK 8 +#define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ -#define XFS_OPSTATE_WARNED_LARP 9 +#define XFS_OPSTATE_WARNED_LARP 10 /* Mount time quotacheck is running */ -#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 +#define XFS_OPSTATE_QUOTACHECK_RUNNING 11 /* Do we want to clear log incompat flags? */ -#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11 +#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 12 /* Filesystem can use logged extended attributes */ -#define XFS_OPSTATE_USE_LARP 12 +#define XFS_OPSTATE_USE_LARP 13 +/* Kernel has logged a warning about blocksize > pagesize on this fs. */ +#define XFS_OPSTATE_WARNED_LBS 14 +/* Kernel has logged a warning about exchange-range being used on this fs. */ +#define XFS_OPSTATE_WARNED_EXCHRANGE 15 +/* Kernel has logged a warning about parent pointers being used on this fs. */ +#define XFS_OPSTATE_WARNED_PPTR 16 +/* Kernel has logged a warning about metadata dirs being used on this fs. 
*/ +#define XFS_OPSTATE_WARNED_METADIR 17 +/* Filesystem should use qflags to determine quotaon status */ +#define XFS_OPSTATE_RESUMING_QUOTAON 18 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -469,9 +525,24 @@ __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) #ifdef CONFIG_XFS_QUOTA __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +__XFS_IS_OPSTATE(resuming_quotaon, RESUMING_QUOTAON) #else -# define xfs_is_quotacheck_running(mp) (false) -#endif +static inline bool xfs_is_quotacheck_running(struct xfs_mount *mp) +{ + return false; +} +static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp) +{ + return false; +} +static inline void xfs_set_resuming_quotaon(struct xfs_mount *m) +{ +} +static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) +{ + return false; +} +#endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 23d16186e1a3..6f4479deac6d 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,8 +58,7 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - xfs_notice_once(mp, -"Using experimental pNFS feature, use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS); if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7e2307921deb..dc8b1010d4d3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -27,6 +27,9 @@ #include "xfs_ialloc.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" /* * The global quota manager. There is only one of these for the entire @@ -37,7 +40,6 @@ STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp); STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp); -STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it @@ -146,17 +148,29 @@ xfs_qm_dqpurge( * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (error == -EAGAIN) { + xfs_dqfunlock(dqp); + dqp->q_flags &= ~XFS_DQFLAG_FREEING; + goto out_unlock; + } + if (!bp) + goto out_funlock; + + /* + * dqflush completes dqflock on error, and the bwrite ioend + * does it on success. 
+ */ + error = xfs_qm_dqflush(dqp, bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); - } else if (error == -EAGAIN) { - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; } xfs_dqflock(dqp); } + xfs_dquot_detach_buf(dqp); +out_funlock: ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); @@ -208,6 +222,43 @@ xfs_qm_unmount( } } +static void +xfs_qm_unmount_rt( + struct xfs_mount *mp) +{ + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, 0); + + if (!rtg) + return; + if (rtg->rtg_inodes[XFS_RTGI_BITMAP]) + xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_BITMAP]); + if (rtg->rtg_inodes[XFS_RTGI_SUMMARY]) + xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_SUMMARY]); + xfs_rtgroup_rele(rtg); +} + +STATIC void +xfs_qm_destroy_quotainos( + struct xfs_quotainfo *qi) +{ + if (qi->qi_uquotaip) { + xfs_irele(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + xfs_irele(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + xfs_irele(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } + if (qi->qi_dirip) { + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; + } +} + /* * Called from the vfsops layer. */ @@ -221,28 +272,19 @@ xfs_qm_unmount_quotas( */ ASSERT(mp->m_rootip); xfs_qm_dqdetach(mp->m_rootip); - if (mp->m_rbmip) - xfs_qm_dqdetach(mp->m_rbmip); - if (mp->m_rsumip) - xfs_qm_dqdetach(mp->m_rsumip); + + /* + * For pre-RTG file systems, the RT inodes have quotas attached, + * detach them now. + */ + if (!xfs_has_rtgroups(mp)) + xfs_qm_unmount_rt(mp); /* * Release the quota inodes. */ - if (mp->m_quotainfo) { - if (mp->m_quotainfo->qi_uquotaip) { - xfs_irele(mp->m_quotainfo->qi_uquotaip); - mp->m_quotainfo->qi_uquotaip = NULL; - } - if (mp->m_quotainfo->qi_gquotaip) { - xfs_irele(mp->m_quotainfo->qi_gquotaip); - mp->m_quotainfo->qi_gquotaip = NULL; - } - if (mp->m_quotainfo->qi_pquotaip) { - xfs_irele(mp->m_quotainfo->qi_pquotaip); - mp->m_quotainfo->qi_pquotaip = NULL; - } - } + if (mp->m_quotainfo) + xfs_qm_destroy_quotainos(mp->m_quotainfo); } STATIC int @@ -302,6 +344,8 @@ xfs_qm_need_dqattach( return false; if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; + if (xfs_is_metadir_inode(ip)) + return false; return true; } @@ -324,6 +368,7 @@ xfs_qm_dqattach_locked( return 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + ASSERT(!xfs_is_metadir_inode(ip)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, @@ -412,9 +457,8 @@ static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) - __releases(lru_lock) __acquires(lru_lock) + __releases(&lru->lock) __acquires(&lru->lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); @@ -460,9 +504,19 @@ xfs_qm_dquot_isolate( trace_xfs_dqreclaim_dirty(dqp); /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(lru_lock); + spin_unlock(&lru->lock); + + error = xfs_dquot_use_attached_buf(dqp, &bp); + if (!bp || error == -EAGAIN) { + xfs_dqfunlock(dqp); + goto out_unlock_dirty; + } - error = xfs_qm_dqflush(dqp, &bp); + /* + * dqflush completes dqflock on error, and the delwri ioend + * does it on success. 
+ */ + error = xfs_qm_dqflush(dqp, bp); if (error) goto out_unlock_dirty; @@ -470,6 +524,8 @@ xfs_qm_dquot_isolate( xfs_buf_relse(bp); goto out_unlock_dirty; } + + xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); /* @@ -496,7 +552,6 @@ out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); - spin_lock(lru_lock); return LRU_RETRY; } @@ -616,6 +671,163 @@ xfs_qm_init_timelimits( xfs_qm_dqdestroy(dqp); } +static int +xfs_qm_load_metadir_qinos( + struct xfs_mount *mp, + struct xfs_quotainfo *qi) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); + if (error == -ENOENT) { + /* no quota dir directory, but we'll create one later */ + error = 0; + goto out_trans; + } + if (error) + goto out_trans; + + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_USER, + &qi->qi_uquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_GROUP, + &qi->qi_gquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_dqinode_load(tp, qi->qi_dirip, XFS_DQTYPE_PROJ, + &qi->qi_pquotaip); + if (error && error != -ENOENT) + goto out_trans; + } + + error = 0; +out_trans: + xfs_trans_cancel(tp); + return error; +} + +/* Create quota inodes in the metadata directory tree. */ +STATIC int +xfs_qm_create_metadir_qinos( + struct xfs_mount *mp, + struct xfs_quotainfo *qi) +{ + int error; + + if (!qi->qi_dirip) { + error = xfs_dqinode_mkdir_parent(mp, &qi->qi_dirip); + if (error && error != -EEXIST) + return error; + /* + * If the /quotas dirent points to an inode that isn't + * loadable, qi_dirip will be NULL but mkdir_parent will return + * -EEXIST. In this case the metadir is corrupt, so bail out. + */ + if (XFS_IS_CORRUPT(mp, qi->qi_dirip == NULL)) + return -EFSCORRUPTED; + } + + if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_USER, &qi->qi_uquotaip); + if (error) + return error; + } + + if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_GROUP, &qi->qi_gquotaip); + if (error) + return error; + } + + if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) { + error = xfs_dqinode_metadir_create(qi->qi_dirip, + XFS_DQTYPE_PROJ, &qi->qi_pquotaip); + if (error) + return error; + } + + return 0; +} + +/* + * Add QUOTABIT to sb_versionnum and initialize qflags in preparation for + * creating quota files on a metadir filesystem. + */ +STATIC int +xfs_qm_prep_metadir_sb( + struct xfs_mount *mp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + + xfs_add_quota(mp); + + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; + + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + + return xfs_trans_commit(tp); +} + +/* + * Load existing quota inodes or create them. Since this is a V5 filesystem, + * we don't have to deal with the grp/prjquota switcheroo thing from V4. 
+ */ +STATIC int +xfs_qm_init_metadir_qinos( + struct xfs_mount *mp) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + int error; + + if (!xfs_has_quota(mp)) { + error = xfs_qm_prep_metadir_sb(mp); + if (error) + return error; + } + + error = xfs_qm_load_metadir_qinos(mp, qi); + if (error) + goto out_err; + + error = xfs_qm_create_metadir_qinos(mp, qi); + if (error) + goto out_err; + + /* The only user of the quota dir inode is online fsck */ +#if !IS_ENABLED(CONFIG_XFS_ONLINE_SCRUB) + xfs_irele(qi->qi_dirip); + qi->qi_dirip = NULL; +#endif + return 0; +out_err: + xfs_qm_destroy_quotainos(mp->m_quotainfo); + return error; +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -640,7 +852,10 @@ xfs_qm_init_quotainfo( * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - error = xfs_qm_init_quotainos(mp); + if (xfs_has_metadir(mp)) + error = xfs_qm_init_metadir_qinos(mp); + else + error = xfs_qm_init_quotainos(mp); if (error) goto out_free_lru; @@ -733,6 +948,17 @@ xfs_qm_destroy_quotainfo( mp->m_quotainfo = NULL; } +static inline enum xfs_metafile_type +xfs_qm_metafile_type( + unsigned int flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + return XFS_METAFILE_USRQUOTA; + else if (flags & XFS_QMOPT_GQUOTA) + return XFS_METAFILE_GRPQUOTA; + return XFS_METAFILE_PRJQUOTA; +} + /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes @@ -744,6 +970,7 @@ xfs_qm_qino_alloc( unsigned int flags) { struct xfs_trans *tp; + enum xfs_metafile_type metafile_type = xfs_qm_metafile_type(flags); int error; bool need_alloc = true; @@ -777,9 +1004,10 @@ xfs_qm_qino_alloc( } } if (ino != NULLFSINO) { - error = xfs_iget(mp, NULL, ino, 0, 0, ipp); + error = xfs_metafile_iget(mp, ino, metafile_type, ipp); if (error) return error; + mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; need_alloc = false; @@ -806,6 +1034,8 @@ xfs_qm_qino_alloc( xfs_trans_cancel(tp); return error; } + if (xfs_has_metadir(mp)) + xfs_metafile_set_iflag(tp, *ipp, metafile_type); } /* @@ -1108,6 +1338,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + error = xfs_dquot_attach_buf(NULL, dqp); + if (error) + return error; + trace_xfs_dqadjust(dqp); /* @@ -1153,8 +1387,8 @@ xfs_qm_dqusage_adjust( void *data) { struct xfs_inode *ip; - xfs_qcnt_t nblks; - xfs_filblks_t rtblks = 0; /* total rt blks */ + xfs_filblks_t nblks, rtblks; + unsigned int lock_mode; int error; ASSERT(XFS_IS_QUOTA_ON(mp)); @@ -1189,20 +1423,23 @@ xfs_qm_dqusage_adjust( } } + /* Metadata directory files are not accounted to user-visible quotas. 
*/ + if (xfs_is_metadir_inode(ip)) + goto error0; + ASSERT(ip->i_delayed_blks == 0); + lock_mode = xfs_ilock_data_map_shared(ip); if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); - if (error) + if (error) { + xfs_iunlock(ip, lock_mode); goto error0; - - xfs_bmap_count_leaves(ifp, &rtblks); + } } - - nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); + xfs_iunlock(ip, lock_mode); /* * Add the (disk blocks and inode) resources occupied by this @@ -1287,11 +1524,17 @@ xfs_qm_flush_one( goto out_unlock; } - error = xfs_qm_dqflush(dqp, &bp); + error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) goto out_unlock; + if (!bp) { + error = -EFSCORRUPTED; + goto out_unlock; + } - xfs_buf_delwri_queue(bp, buffer_list); + error = xfs_qm_dqflush(dqp, bp); + if (!error) + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); @@ -1462,10 +1705,11 @@ xfs_qm_mount_quotas( uint sbf; /* - * If quotas on realtime volumes is not supported, we disable - * quotas immediately. + * If quotas on realtime volumes is not supported, disable quotas + * immediately. We only support rtquota if rtgroups are enabled to + * avoid problems with older kernels. */ - if (mp->m_sb.sb_rextents) { + if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; @@ -1533,7 +1777,7 @@ xfs_qm_mount_quotas( } if (error) { - xfs_warn(mp, "Failed to initialize disk quotas."); + xfs_warn(mp, "Failed to initialize disk quotas, err %d.", error); return; } } @@ -1552,27 +1796,26 @@ xfs_qm_qino_load( xfs_dqtype_t type, struct xfs_inode **ipp) { - xfs_ino_t ino = NULLFSINO; - - switch (type) { - case XFS_DQTYPE_USER: - ino = mp->m_sb.sb_uquotino; - break; - case XFS_DQTYPE_GROUP: - ino = mp->m_sb.sb_gquotino; - break; - case XFS_DQTYPE_PROJ: - ino = mp->m_sb.sb_pquotino; - break; - default: - ASSERT(0); - return -EFSCORRUPTED; - } - - if (ino == NULLFSINO) - return -ENOENT; - - return xfs_iget(mp, NULL, ino, 0, 0, ipp); + struct xfs_trans *tp; + struct xfs_inode *dp = NULL; + int error; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + if (xfs_has_metadir(mp)) { + error = xfs_dqinode_load_parent(tp, &dp); + if (error) + goto out_cancel; + } + + error = xfs_dqinode_load(tp, dp, type, ipp); + if (dp) + xfs_irele(dp); +out_cancel: + xfs_trans_cancel(tp); + return error; } /* @@ -1666,24 +1909,6 @@ error_rele: } STATIC void -xfs_qm_destroy_quotainos( - struct xfs_quotainfo *qi) -{ - if (qi->qi_uquotaip) { - xfs_irele(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - xfs_irele(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - if (qi->qi_pquotaip) { - xfs_irele(qi->qi_pquotaip); - qi->qi_pquotaip = NULL; - } -} - -STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { @@ -1735,6 +1960,8 @@ xfs_qm_vop_dqalloc( if (!XFS_IS_QUOTA_ON(mp)) return 0; + ASSERT(!xfs_is_metadir_inode(ip)); + lockflags = XFS_ILOCK_EXCL; xfs_ilock(ip, lockflags); @@ -1858,23 +2085,29 @@ xfs_qm_vop_chown( struct xfs_dquot *newdq) { struct xfs_dquot *prevdq; - uint bfield = XFS_IS_REALTIME_INODE(ip) ? 
- XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - + xfs_filblks_t dblocks, rblocks; + bool isrt = XFS_IS_REALTIME_INODE(ip); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); + ASSERT(!xfs_is_metadir_inode(ip)); /* old dquot */ prevdq = *IO_olddq; ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks)); + xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); + + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_BCOUNT, + -(xfs_qcnt_t)dblocks); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_RTBCOUNT, + -(xfs_qcnt_t)rblocks); xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_BCOUNT, dblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_RTBCOUNT, rblocks); xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* @@ -1884,7 +2117,8 @@ xfs_qm_vop_chown( * (having already bumped up the real counter) so that we don't have * any reservation to give back when we commit. */ - xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, + xfs_trans_mod_dquot(tp, newdq, + isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, -ip->i_delayed_blks); /* @@ -1896,8 +2130,13 @@ xfs_qm_vop_chown( */ tp->t_flags |= XFS_TRANS_DIRTY; xfs_dqlock(prevdq); - ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); - prevdq->q_blk.reserved -= ip->i_delayed_blks; + if (isrt) { + ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); + prevdq->q_rtb.reserved -= ip->i_delayed_blks; + } else { + ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); + prevdq->q_blk.reserved -= ip->i_delayed_blks; + } xfs_dqunlock(prevdq); /* @@ -1951,6 +2190,7 @@ xfs_qm_vop_create_dqattach( return; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + ASSERT(!xfs_is_metadir_inode(ip)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); @@ -1981,6 +2221,8 @@ xfs_inode_near_dquot_enforcement( xfs_dqtype_t type) { struct xfs_dquot *dqp; + struct xfs_dquot_res *res; + struct xfs_dquot_pre *pre; int64_t freesp; /* We only care for quotas that are enabled and enforced. */ @@ -1989,21 +2231,30 @@ xfs_inode_near_dquot_enforcement( return false; if (xfs_dquot_res_over_limits(&dqp->q_ino) || + xfs_dquot_res_over_limits(&dqp->q_blk) || xfs_dquot_res_over_limits(&dqp->q_rtb)) return true; + if (XFS_IS_REALTIME_INODE(ip)) { + res = &dqp->q_rtb; + pre = &dqp->q_rtb_prealloc; + } else { + res = &dqp->q_blk; + pre = &dqp->q_blk_prealloc; + } + /* For space on the data device, check the various thresholds. 
*/ - if (!dqp->q_prealloc_hi_wmark) + if (!pre->q_prealloc_hi_wmark) return false; - if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + if (res->reserved < pre->q_prealloc_lo_wmark) return false; - if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + if (res->reserved >= pre->q_prealloc_hi_wmark) return true; - freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; - if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) + freesp = pre->q_prealloc_hi_wmark - res->reserved; + if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) return true; return false; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e919c7f62f57..35b64bc3a7a8 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -55,6 +55,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; int qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a11436579877..847ba29630e9 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -19,18 +19,24 @@ STATIC void xfs_fill_statvfs_from_dquot( struct kstatfs *statp, + struct xfs_inode *ip, struct xfs_dquot *dqp) { + struct xfs_dquot_res *blkres = &dqp->q_blk; uint64_t limit; - limit = dqp->q_blk.softlimit ? - dqp->q_blk.softlimit : - dqp->q_blk.hardlimit; + if (XFS_IS_REALTIME_MOUNT(ip->i_mount) && + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) + blkres = &dqp->q_rtb; + + limit = blkres->softlimit ? + blkres->softlimit : + blkres->hardlimit; if (limit && statp->f_blocks > limit) { statp->f_blocks = limit; statp->f_bfree = statp->f_bavail = - (statp->f_blocks > dqp->q_blk.reserved) ? - (statp->f_blocks - dqp->q_blk.reserved) : 0; + (statp->f_blocks > blkres->reserved) ? + (statp->f_blocks - blkres->reserved) : 0; } limit = dqp->q_ino.softlimit ? @@ -61,7 +67,7 @@ xfs_qm_statvfs( struct xfs_dquot *dqp; if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { - xfs_fill_statvfs_from_dquot(statp, dqp); + xfs_fill_statvfs_from_dquot(statp, ip, dqp); xfs_qm_dqput(dqp); } } @@ -135,3 +141,21 @@ xfs_qm_newmount( return 0; } + +/* + * If the sysadmin didn't provide any quota mount options, restore the quota + * accounting and enforcement state from the ondisk superblock. Only do this + * for metadir filesystems because this is a behavior change. + */ +void +xfs_qm_resume_quotaon( + struct xfs_mount *mp) +{ + if (!xfs_has_metadir(mp)) + return; + if (xfs_has_norecovery(mp)) + return; + + mp->m_qflags = mp->m_sb.sb_qflags & (XFS_ALL_QUOTA_ACCT | + XFS_ALL_QUOTA_ENFD); +} diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 4eda50ae2d1c..0c78f30fa4a3 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -427,19 +427,6 @@ xfs_qm_scall_getquota_fill_qc( dst->d_ino_timer = 0; dst->d_rt_spc_timer = 0; } - -#ifdef DEBUG - if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) { - if ((dst->d_space > dst->d_spc_softlimit) && - (dst->d_spc_softlimit > 0)) { - ASSERT(dst->d_spc_timer != 0); - } - if ((dst->d_ino_count > dqp->q_ino.softlimit) && - (dqp->q_ino.softlimit > 0)) { - ASSERT(dst->d_ino_timer != 0); - } - } -#endif } /* Return the quota information for the dquot matching id. 
*/ diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 23d71a55bbc0..d7565462af3d 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -29,6 +29,11 @@ struct xfs_buf; (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) +#define XFS_IS_DQDETACHED(ip) \ + ((ip)->i_udquot == NULL && \ + (ip)->i_gdquot == NULL && \ + (ip)->i_pdquot == NULL) + #define XFS_QM_NEED_QUOTACHECK(mp) \ ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ @@ -96,7 +101,8 @@ extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); -extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *tp, + bool already_locked); int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, @@ -120,10 +126,12 @@ extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +void xfs_qm_resume_quotaon(struct xfs_mount *mp); extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); +int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks); # ifdef CONFIG_XFS_LIVE_HOOKS void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip, @@ -166,7 +174,7 @@ static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, { } #define xfs_trans_apply_dquot_deltas(tp) -#define xfs_trans_unreserve_and_mod_dquots(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp, a) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, bool force) @@ -197,11 +205,17 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_dqrele(d) do { (d) = (d); } while(0) #define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_resume_quotaon(mp) ((void)0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) #define xfs_inode_near_dquot_enforcement(ip, type) (false) +static inline int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return 0; +} + # ifdef CONFIG_XFS_LIVE_HOOKS # define xfs_dqtrx_hook_enable() ((void)0) # define xfs_dqtrx_hook_disable() ((void)0) @@ -209,12 +223,6 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #endif /* CONFIG_XFS_QUOTA */ -static inline int -xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) -{ - return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); -} - static inline void xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks) { diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 27398512b179..bede1c96c330 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -244,7 +244,7 @@ xfs_refcount_update_diff_items( struct xfs_refcount_intent *ra = ci_entry(a); struct xfs_refcount_intent *rb = ci_entry(b); - return ra->ri_pag->pag_agno - 
rb->ri_pag->pag_agno; + return ra->ri_group->xg_gno - rb->ri_group->xg_gno; } /* Log refcount updates in the intent item. */ @@ -330,7 +330,7 @@ xfs_refcount_defer_add( trace_xfs_refcount_defer(mp, ri); - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock); + ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG); xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } @@ -341,7 +341,7 @@ xfs_refcount_update_cancel_item( { struct xfs_refcount_intent *ri = ci_entry(item); - xfs_perag_intent_put(ri->ri_pag); + xfs_group_intent_put(ri->ri_group); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -431,7 +431,8 @@ xfs_cui_recover_work( ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; - ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock); + ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock, + XG_TYPE_AG); xfs_defer_add_item(dfp, &ri->ri_list); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6fde6ec8092f..b11769c009ef 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -144,7 +144,7 @@ xfs_reflink_find_shared( if (error) return error; - cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); + cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -894,14 +894,13 @@ int xfs_reflink_recover_cow( struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t agno; + struct xfs_perag *pag = NULL; int error = 0; if (!xfs_has_reflink(mp)) return 0; - for_each_perag(mp, agno, pag) { + while ((pag = xfs_perag_next(mp, pag))) { error = xfs_refcount_recover_cow_leftovers(mp, pag); if (error) { xfs_perag_rele(pag); @@ -1595,6 +1594,9 @@ xfs_reflink_clear_inode_flag( ASSERT(xfs_is_reflink_inode(ip)); + if (!xfs_can_free_cowblocks(ip)) + return 0; + error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); if (error || needs_flag) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fb55e4ce49fa..4a58e4533671 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,6 +6,25 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +/* + * Check whether it is safe to free COW fork blocks from an inode. It is unsafe + * to do so when an inode has dirty cache or I/O in-flight, even if no shared + * extents exist in the data fork, because outstanding I/O may target blocks + * that were speculatively allocated to the COW fork. + */ +static inline bool +xfs_can_free_cowblocks(struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + if ((inode->i_state & I_DIRTY_PAGES) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || + atomic_read(&inode->i_dio_count)) + return false; + return true; +} + extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 88b5580e1e19..76b3c0ed3b4f 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -243,7 +243,7 @@ xfs_rmap_update_diff_items( struct xfs_rmap_intent *ra = ri_entry(a); struct xfs_rmap_intent *rb = ri_entry(b); - return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; + return ra->ri_group->xg_gno - rb->ri_group->xg_gno; } /* Log rmap updates in the intent item. 
*/ @@ -353,7 +353,8 @@ xfs_rmap_defer_add( trace_xfs_rmap_defer(mp, ri); - ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock); + ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, + XG_TYPE_AG); xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } @@ -364,7 +365,7 @@ xfs_rmap_update_cancel_item( { struct xfs_rmap_intent *ri = ri_entry(item); - xfs_perag_intent_put(ri->ri_pag); + xfs_group_intent_put(ri->ri_group); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -494,7 +495,7 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock); + ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, XG_TYPE_AG); xfs_defer_add_item(dfp, &ri->ri_list); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 3a2005a1e673..fcfa6e0eb3ad 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -25,6 +25,11 @@ #include "xfs_quota.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_da_format.h" +#include "xfs_metafile.h" +#include "xfs_rtgroup.h" +#include "xfs_error.h" +#include "xfs_trace.h" /* * Return whether there are any free extents in the size range given @@ -38,14 +43,14 @@ xfs_rtany_summary( xfs_fileoff_t bbno, /* bitmap block number */ int *maxlog) /* out: max log2 extent size free */ { - struct xfs_mount *mp = args->mp; + uint8_t *rsum_cache = args->rtg->rtg_rsum_cache; int error; int log; /* loop counter, log2 of ext. size */ xfs_suminfo_t sum; /* summary data */ - /* There are no extents at levels >= m_rsum_cache[bbno]. */ - if (mp->m_rsum_cache) { - high = min(high, mp->m_rsum_cache[bbno] - 1); + /* There are no extents at levels >= rsum_cache[bbno]. */ + if (rsum_cache) { + high = min(high, rsum_cache[bbno] - 1); if (low > high) { *maxlog = -1; return 0; @@ -77,12 +82,11 @@ xfs_rtany_summary( *maxlog = -1; out: /* There were no extents at levels > log. */ - if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; + if (rsum_cache && log + 1 < rsum_cache[bbno]) + rsum_cache[bbno] = log + 1; return 0; } - /* * Copy and transform the summary file, given the old and new * parameters in the mount structures. @@ -149,7 +153,7 @@ xfs_rtallocate_range( /* * Find the next allocated block (end of free extent). */ - error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1, &postblock); if (error) return error; @@ -211,14 +215,14 @@ xfs_rtalloc_align_len( */ static inline xfs_rtxlen_t xfs_rtallocate_clamp_len( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rtxnum_t startrtx, xfs_rtxlen_t rtxlen, xfs_rtxlen_t prod) { xfs_rtxlen_t ret; - ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + ret = min(rtg->rtg_extents, startrtx + rtxlen) - startrtx; return xfs_rtalloc_align_len(ret, prod); } @@ -253,10 +257,11 @@ xfs_rtallocate_extent_block( * Loop over all the extents starting in this bitmap block up to the * end of the rt volume, looking for one that's long enough. */ - end = min(mp->m_sb.sb_rextents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - 1; + end = min(args->rtg->rtg_extents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - + 1; for (i = xfs_rbmblock_to_rtx(mp, bbno); i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. 
*/ - scanlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); + scanlen = xfs_rtallocate_clamp_len(args->rtg, i, maxlen, prod); if (scanlen < minlen) break; @@ -341,7 +346,6 @@ xfs_rtallocate_extent_exact( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; xfs_rtxnum_t next; /* next rtext to try (dummy) */ xfs_rtxlen_t alloclen; /* candidate length */ xfs_rtxlen_t scanlen; /* number of free rtx to look for */ @@ -352,7 +356,7 @@ xfs_rtallocate_extent_exact( ASSERT(maxlen % prod == 0); /* Make sure we don't run off the end of the rt volume. */ - scanlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); + scanlen = xfs_rtallocate_clamp_len(args->rtg, start, maxlen, prod); if (scanlen < minlen) return -ENOSPC; @@ -413,11 +417,10 @@ xfs_rtallocate_extent_near( ASSERT(maxlen % prod == 0); /* - * If the block number given is off the end, silently set it to - * the last block. + * If the block number given is off the end, silently set it to the last + * block. */ - if (start >= mp->m_sb.sb_rextents) - start = mp->m_sb.sb_rextents - 1; + start = min(start, args->rtg->rtg_extents - 1); /* * Try the exact allocation first. @@ -649,19 +652,30 @@ xfs_rtallocate_extent_size( return -ENOSPC; } +static void +xfs_rtunmount_rtg( + struct xfs_rtgroup *rtg) +{ + int i; + + for (i = 0; i < XFS_RTGI_MAX; i++) + xfs_rtginode_irele(&rtg->rtg_inodes[i]); + kvfree(rtg->rtg_rsum_cache); +} + static int xfs_alloc_rsum_cache( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_extlen_t rbmblocks) { /* * The rsum cache is initialized to the maximum value, which is * trivially an upper bound on the maximum level with any free extents. */ - mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); - if (!mp->m_rsum_cache) + rtg->rtg_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (!rtg->rtg_rsum_cache) return -ENOMEM; - memset(mp->m_rsum_cache, -1, rbmblocks); + memset(rtg->rtg_rsum_cache, -1, rbmblocks); return 0; } @@ -698,44 +712,175 @@ out_iolock: return error; } +/* Ensure that the rtgroup metadata inode is loaded, creating it if neeeded. 
*/ +static int +xfs_rtginode_ensure( + struct xfs_rtgroup *rtg, + enum xfs_rtg_inodes type) +{ + struct xfs_trans *tp; + int error; + + if (rtg->rtg_inodes[type]) + return 0; + + error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp); + if (error) + return error; + error = xfs_rtginode_load(rtg, type, tp); + xfs_trans_cancel(tp); + + if (error != -ENOENT) + return 0; + return xfs_rtginode_create(rtg, type, true); +} + +static struct xfs_mount * +xfs_growfs_rt_alloc_fake_mount( + const struct xfs_mount *mp, + xfs_rfsblock_t rblocks, + xfs_agblock_t rextsize) +{ + struct xfs_mount *nmp; + + nmp = kmemdup(mp, sizeof(*mp), GFP_KERNEL); + if (!nmp) + return NULL; + xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb, rextsize); + nmp->m_sb.sb_rblocks = rblocks; + nmp->m_sb.sb_rextents = xfs_blen_to_rtbxlen(nmp, nmp->m_sb.sb_rblocks); + nmp->m_sb.sb_rbmblocks = xfs_rtbitmap_blockcount(nmp); + nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents); + if (xfs_has_rtgroups(nmp)) + nmp->m_sb.sb_rgcount = howmany_64(nmp->m_sb.sb_rextents, + nmp->m_sb.sb_rgextents); + else + nmp->m_sb.sb_rgcount = 1; + nmp->m_rsumblocks = xfs_rtsummary_blockcount(nmp, &nmp->m_rsumlevels); + + if (rblocks > 0) + nmp->m_features |= XFS_FEAT_REALTIME; + + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + return nmp; +} + +/* Free all the new space and return the number of extents actually freed. */ +static int +xfs_growfs_rt_free_new( + struct xfs_rtgroup *rtg, + struct xfs_rtalloc_args *nargs, + xfs_rtbxlen_t *freed_rtx) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgnumber_t rgno = rtg_rgno(rtg); + xfs_rtxnum_t start_rtx = 0, end_rtx; + + if (rgno < mp->m_sb.sb_rgcount) + start_rtx = xfs_rtgroup_extents(mp, rgno); + end_rtx = xfs_rtgroup_extents(nargs->mp, rgno); + + /* + * Compute the first new extent that we want to free, being careful to + * skip past a realtime superblock at the start of the realtime volume. + */ + if (xfs_has_rtsb(nargs->mp) && rgno == 0 && start_rtx == 0) + start_rtx++; + *freed_rtx = end_rtx - start_rtx; + return xfs_rtfree_range(nargs, start_rtx, *freed_rtx); +} + +static xfs_rfsblock_t +xfs_growfs_rt_nrblocks( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize, + xfs_fileoff_t bmbno) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rfsblock_t step; + + step = (bmbno + 1) * mp->m_rtx_per_rbmblock * rextsize; + if (xfs_has_rtgroups(mp)) { + xfs_rfsblock_t rgblocks = mp->m_sb.sb_rgextents * rextsize; + + step = min(rgblocks, step) + rgblocks * rtg_rgno(rtg); + } + + return min(nrblocks, step); +} + +/* + * If the post-grow filesystem will have an rtsb; we're initializing the first + * rtgroup; and the filesystem didn't have a realtime section, write the rtsb + * now, and attach the rtsb buffer to the real mount. 
+ */ +static int +xfs_growfs_rt_init_rtsb( + const struct xfs_rtalloc_args *nargs, + const struct xfs_rtgroup *rtg, + const struct xfs_rtalloc_args *args) +{ + struct xfs_mount *mp = args->mp; + struct xfs_buf *rtsb_bp; + int error; + + if (!xfs_has_rtsb(nargs->mp)) + return 0; + if (rtg_rgno(rtg) > 0) + return 0; + if (mp->m_sb.sb_rblocks) + return 0; + + error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), + 0, &rtsb_bp); + if (error) + return error; + + rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR; + rtsb_bp->b_ops = &xfs_rtsb_buf_ops; + + xfs_update_rtsb(rtsb_bp, mp->m_sb_bp); + mp->m_rtsb_bp = rtsb_bp; + error = xfs_bwrite(rtsb_bp); + xfs_buf_unlock(rtsb_bp); + return error; +} + static int xfs_growfs_rt_bmblock( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, xfs_rfsblock_t nrblocks, xfs_agblock_t rextsize, xfs_fileoff_t bmbno) { - struct xfs_inode *rbmip = mp->m_rbmip; - struct xfs_inode *rsumip = mp->m_rsumip; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; struct xfs_rtalloc_args args = { .mp = mp, + .rtg = rtg, }; struct xfs_rtalloc_args nargs = { + .rtg = rtg, }; struct xfs_mount *nmp; - xfs_rfsblock_t nrblocks_step; xfs_rtbxlen_t freed_rtx; int error; - - nrblocks_step = (bmbno + 1) * NBBY * mp->m_sb.sb_blocksize * rextsize; - - nmp = nargs.mp = kmemdup(mp, sizeof(*mp), GFP_KERNEL); + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = nargs.mp = xfs_growfs_rt_alloc_fake_mount(mp, + xfs_growfs_rt_nrblocks(rtg, nrblocks, rextsize, bmbno), + rextsize); if (!nmp) return -ENOMEM; - /* - * Calculate new sb and mount fields for this round. - */ - nmp->m_sb.sb_rextsize = rextsize; - xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb); - nmp->m_sb.sb_rbmblocks = bmbno + 1; - nmp->m_sb.sb_rblocks = min(nrblocks, nrblocks_step); - nmp->m_sb.sb_rextents = xfs_rtb_to_rtx(nmp, nmp->m_sb.sb_rblocks); - nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents); - nmp->m_rsumlevels = nmp->m_sb.sb_rextslog + 1; - nmp->m_rsumblocks = xfs_rtsummary_blockcount(mp, nmp->m_rsumlevels, - nmp->m_sb.sb_rbmblocks); + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); /* * Recompute the growfsrt reservation from the new rsumsize, so that the @@ -748,8 +893,8 @@ xfs_growfs_rt_bmblock( goto out_free; nargs.tp = args.tp; - xfs_rtbitmap_lock(mp); - xfs_rtbitmap_trans_join(args.tp); + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP); + xfs_rtgroup_trans_join(args.tp, args.rtg, XFS_RTGLOCK_BITMAP); /* * Update the bitmap inode's size ondisk and incore. We need to update @@ -780,6 +925,10 @@ xfs_growfs_rt_bmblock( goto out_cancel; } + error = xfs_growfs_rt_init_rtsb(&nargs, rtg, &args); + if (error) + goto out_cancel; + /* * Update superblock fields. */ @@ -798,12 +947,14 @@ xfs_growfs_rt_bmblock( if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); /* * Free the new extent. 
*/ - freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; - error = xfs_rtfree_range(&nargs, mp->m_sb.sb_rextents, freed_rtx); + error = xfs_growfs_rt_free_new(rtg, &nargs, &freed_rtx); xfs_rtbuf_cache_relse(&nargs); if (error) goto out_cancel; @@ -818,7 +969,6 @@ xfs_growfs_rt_bmblock( */ mp->m_rsumlevels = nmp->m_rsumlevels; mp->m_rsumblocks = nmp->m_rsumblocks; - xfs_mount_sb_set_rextsize(mp, &mp->m_sb); /* * Recompute the growfsrt reservation from the new rsumsize. @@ -844,6 +994,15 @@ out_free: return error; } +static xfs_rtxnum_t +xfs_last_rtgroup_extents( + struct xfs_mount *mp) +{ + return mp->m_sb.sb_rextents - + ((xfs_rtxnum_t)(mp->m_sb.sb_rgcount - 1) * + mp->m_sb.sb_rgextents); +} + /* * Calculate the last rbmblock currently used. * @@ -851,34 +1010,235 @@ out_free: */ static xfs_fileoff_t xfs_last_rt_bmblock( - struct xfs_mount *mp) + struct xfs_rtgroup *rtg) { - xfs_fileoff_t bmbno = mp->m_sb.sb_rbmblocks; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgnumber_t rgno = rtg_rgno(rtg); + xfs_fileoff_t bmbno = 0; + + ASSERT(!mp->m_sb.sb_rgcount || rgno >= mp->m_sb.sb_rgcount - 1); + + if (mp->m_sb.sb_rgcount && rgno == mp->m_sb.sb_rgcount - 1) { + xfs_rtxnum_t nrext = xfs_last_rtgroup_extents(mp); + + /* Also fill up the previous block if not entirely full. */ + bmbno = xfs_rtbitmap_blockcount_len(mp, nrext); + if (xfs_rtx_to_rbmword(mp, nrext) != 0) + bmbno--; + } - /* Skip the current block if it is exactly full. */ - if (xfs_rtx_to_rbmword(mp, mp->m_sb.sb_rextents) != 0) - bmbno--; return bmbno; } /* + * Allocate space to the bitmap and summary files, as necessary. + */ +static int +xfs_growfs_rt_alloc_blocks( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize, + xfs_extlen_t *nrbmblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; + xfs_extlen_t orbmblocks = 0; + xfs_extlen_t orsumblocks = 0; + struct xfs_mount *nmp; + int error = 0; + + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, rextsize); + if (!nmp) + return -ENOMEM; + *nrbmblocks = nmp->m_sb.sb_rbmblocks; + + if (xfs_has_rtgroups(mp)) { + /* + * For file systems with the rtgroups feature, the RT bitmap and + * summary are always fully allocated, which means that we never + * need to grow the existing files. + * + * But we have to be careful to only fill the bitmap until the + * end of the actually used range. + */ + if (rtg_rgno(rtg) == nmp->m_sb.sb_rgcount - 1) + *nrbmblocks = xfs_rtbitmap_blockcount_len(nmp, + xfs_last_rtgroup_extents(nmp)); + + if (mp->m_sb.sb_rgcount && + rtg_rgno(rtg) == mp->m_sb.sb_rgcount - 1) + goto out_free; + } else { + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. 
+ */ + orbmblocks = XFS_B_TO_FSB(mp, rbmip->i_disk_size); + orsumblocks = XFS_B_TO_FSB(mp, rsumip->i_disk_size); + } + + error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_BITMAP, orbmblocks, + nmp->m_sb.sb_rbmblocks, NULL); + if (error) + goto out_free; + error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_SUMMARY, orsumblocks, + nmp->m_rsumblocks, NULL); +out_free: + kfree(nmp); + return error; +} + +static int +xfs_growfs_rtg( + struct xfs_mount *mp, + xfs_rgnumber_t rgno, + xfs_rfsblock_t nrblocks, + xfs_agblock_t rextsize) +{ + uint8_t *old_rsum_cache = NULL; + xfs_extlen_t bmblocks; + xfs_fileoff_t bmbno; + struct xfs_rtgroup *rtg; + unsigned int i; + int error; + + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) + return -EINVAL; + + for (i = 0; i < XFS_RTGI_MAX; i++) { + error = xfs_rtginode_ensure(rtg, i); + if (error) + goto out_rele; + } + + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); + if (error) + goto out_rele; + + if (bmblocks != rtg_mount(rtg)->m_sb.sb_rbmblocks) { + old_rsum_cache = rtg->rtg_rsum_cache; + error = xfs_alloc_rsum_cache(rtg, bmblocks); + if (error) + goto out_rele; + } + + for (bmbno = xfs_last_rt_bmblock(rtg); bmbno < bmblocks; bmbno++) { + error = xfs_growfs_rt_bmblock(rtg, nrblocks, rextsize, bmbno); + if (error) + goto out_error; + } + + if (old_rsum_cache) + kvfree(old_rsum_cache); + xfs_rtgroup_rele(rtg); + return 0; + +out_error: + /* + * Reset rtg_extents to the old value if adding more blocks failed. + */ + xfs_rtgroup_calc_geometry(mp, rtg, rtg_rgno(rtg), mp->m_sb.sb_rgcount, + mp->m_sb.sb_rextents); + if (old_rsum_cache) { + kvfree(rtg->rtg_rsum_cache); + rtg->rtg_rsum_cache = old_rsum_cache; + } +out_rele: + xfs_rtgroup_rele(rtg); + return error; +} + +static int +xfs_growfs_check_rtgeom( + const struct xfs_mount *mp, + xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize) +{ + struct xfs_mount *nmp; + int error = 0; + + nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize); + if (!nmp) + return -ENOMEM; + + /* + * New summary size can't be more than half the size of the log. This + * prevents us from getting a log overflow, since we'll log basically + * the whole summary file at once. + */ + if (nmp->m_rsumblocks > (mp->m_sb.sb_logblocks >> 1)) + error = -EINVAL; + + kfree(nmp); + return error; +} + +/* + * Compute the new number of rt groups and ensure that /rtgroups exists. + * + * Changing the rtgroup size is not allowed (even if the rt volume hasn't yet + * been initialized) because the userspace ABI doesn't support it. + */ +static int +xfs_growfs_rt_prep_groups( + struct xfs_mount *mp, + xfs_rfsblock_t rblocks, + xfs_extlen_t rextsize, + xfs_rgnumber_t *new_rgcount) +{ + int error; + + *new_rgcount = howmany_64(rblocks, mp->m_sb.sb_rgextents * rextsize); + if (*new_rgcount > XFS_MAX_RGNUMBER) + return -EINVAL; + + /* Make sure the /rtgroups dir has been created */ + if (!mp->m_rtdirip) { + struct xfs_trans *tp; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + error = xfs_rtginode_load_parent(tp); + xfs_trans_cancel(tp); + + if (error == -ENOENT) + error = xfs_rtginode_mkdir_parent(mp); + if (error) + return error; + } + + return 0; +} + +static bool +xfs_grow_last_rtg( + struct xfs_mount *mp) +{ + if (!xfs_has_rtgroups(mp)) + return true; + if (mp->m_sb.sb_rgcount == 0) + return false; + return xfs_rtgroup_extents(mp, mp->m_sb.sb_rgcount - 1) <= + mp->m_sb.sb_rgextents; +} + +/* * Grow the realtime area of the filesystem. 
*/ int xfs_growfs_rt( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_rt_t *in) /* growfs rt input struct */ + struct xfs_mount *mp, + struct xfs_growfs_rt *in) { - xfs_fileoff_t bmbno; /* bitmap block number */ - struct xfs_buf *bp; /* temporary buffer */ - int error; /* error return value */ - xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_rtxnum_t nrextents; /* new number of realtime extents */ - xfs_extlen_t nrsumblocks; /* new number of summary blocks */ - xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ - xfs_extlen_t rsumblocks; /* current number of rt summary blks */ - uint8_t *rsum_cache; /* old summary cache */ - xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; + xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; + xfs_rgnumber_t new_rgcount = 1; + xfs_rgnumber_t rgno; + struct xfs_buf *bp; + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -889,15 +1249,9 @@ xfs_growfs_rt( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; - /* - * Mount should fail if the rt bitmap/summary files don't load, but - * we'll check anyway. - */ - error = -EINVAL; - if (!mp->m_rbmip || !mp->m_rsumip) - goto out_unlock; /* Shrink not supported. */ + error = -EINVAL; if (in->newblocks <= mp->m_sb.sb_rblocks) goto out_unlock; /* Can only change rt extent size when adding rt volume. */ @@ -911,7 +1265,9 @@ xfs_growfs_rt( /* Unsupported realtime features. */ error = -EOPNOTSUPP; - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) + if (xfs_has_quota(mp) && !xfs_has_rtgroups(mp)) + goto out_unlock; + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) goto out_unlock; error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); @@ -930,80 +1286,64 @@ xfs_growfs_rt( /* * Calculate new parameters. These are the final values to be reached. */ - nrextents = div_u64(in->newblocks, in->extsize); - if (nrextents == 0) { - error = -EINVAL; - goto out_unlock; - } - nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrsumblocks = xfs_rtsummary_blockcount(mp, - xfs_compute_rextslog(nrextents) + 1, nrbmblocks); - - /* - * New summary size can't be more than half the size of - * the log. This prevents us from getting a log overflow, - * since we'll log basically the whole summary file at once. - */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { - error = -EINVAL; + error = -EINVAL; + if (in->newblocks < in->extsize) goto out_unlock; - } - /* - * Get the old block counts for bitmap and summary inodes. - * These can't change since other growfs callers are locked out. - */ - rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size); - rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size); - /* - * Allocate space to the bitmap and summary files, as necessary. - */ - error = xfs_rtfile_initialize_blocks(mp->m_rbmip, rbmblocks, - nrbmblocks, NULL); - if (error) - goto out_unlock; - error = xfs_rtfile_initialize_blocks(mp->m_rsumip, rsumblocks, - nrsumblocks, NULL); + /* Make sure the new fs size won't cause problems with the log. */ + error = xfs_growfs_check_rtgeom(mp, in->newblocks, in->extsize); if (error) goto out_unlock; - rsum_cache = mp->m_rsum_cache; - if (nrbmblocks != mp->m_sb.sb_rbmblocks) { - error = xfs_alloc_rsum_cache(mp, nrbmblocks); + if (xfs_has_rtgroups(mp)) { + error = xfs_growfs_rt_prep_groups(mp, in->newblocks, + in->extsize, &new_rgcount); if (error) goto out_unlock; } - /* Initialize the free space bitmap one bitmap block at a time. 
*/ - for (bmbno = xfs_last_rt_bmblock(mp); bmbno < nrbmblocks; bmbno++) { - error = xfs_growfs_rt_bmblock(mp, in->newblocks, in->extsize, - bmbno); + if (xfs_grow_last_rtg(mp)) { + error = xfs_growfs_rtg(mp, old_rgcount - 1, in->newblocks, + in->extsize); if (error) - goto out_free; + goto out_unlock; } - if (old_rextsize != in->extsize) { - error = xfs_growfs_rt_fixup_extsize(mp); + for (rgno = old_rgcount; rgno < new_rgcount; rgno++) { + xfs_rtbxlen_t rextents = div_u64(in->newblocks, in->extsize); + + error = xfs_rtgroup_alloc(mp, rgno, new_rgcount, rextents); if (error) - goto out_free; + goto out_unlock; + + error = xfs_growfs_rtg(mp, rgno, in->newblocks, in->extsize); + if (error) { + struct xfs_rtgroup *rtg; + + rtg = xfs_rtgroup_grab(mp, rgno); + if (!WARN_ON_ONCE(!rtg)) { + xfs_rtunmount_rtg(rtg); + xfs_rtgroup_rele(rtg); + xfs_rtgroup_free(mp, rgno); + } + break; + } } - /* Update secondary superblocks now the physical grow has completed */ - error = xfs_update_secondary_sbs(mp); + if (!error && old_rextsize != in->extsize) + error = xfs_growfs_rt_fixup_extsize(mp); -out_free: /* - * If we had to allocate a new rsum_cache, we either need to free the - * old one (if we succeeded) or free the new one and restore the old one - * (if there was an error). + * Update secondary superblocks now the physical grow has completed. + * + * Also do this in case of an error as we might have already + * successfully updated one or more RTGs and incremented sb_rgcount. */ - if (rsum_cache != mp->m_rsum_cache) { - if (error) { - kvfree(mp->m_rsum_cache); - mp->m_rsum_cache = rsum_cache; - } else { - kvfree(rsum_cache); - } + if (!xfs_is_shutdown(mp)) { + int error2 = xfs_update_secondary_sbs(mp); + + if (!error) + error = error2; } out_unlock: @@ -1011,6 +1351,56 @@ out_unlock: return error; } +/* Read the realtime superblock and attach it to the mount. */ +int +xfs_rtmount_readsb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp; + int error; + + if (!xfs_has_rtsb(mp)) + return 0; + if (mp->m_sb.sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return -ENODEV; + } + + /* m_blkbb_log is not set up yet */ + error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, + mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp, + &xfs_rtsb_buf_ops); + if (error) { + xfs_warn(mp, "rt sb validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + + mp->m_rtsb_bp = bp; + xfs_buf_unlock(bp); + return 0; +} + +/* Detach the realtime superblock from the mount and free it. */ +void +xfs_rtmount_freesb( + struct xfs_mount *mp) +{ + struct xfs_buf *bp = mp->m_rtsb_bp; + + if (!bp) + return; + + xfs_buf_lock(bp); + mp->m_rtsb_bp = NULL; + xfs_buf_relse(bp); +} + /* * Initialize realtime fields in the mount structure. 
*/ @@ -1019,22 +1409,19 @@ xfs_rtmount_init( struct xfs_mount *mp) /* file system mount structure */ { struct xfs_buf *bp; /* buffer for last block of subvolume */ - struct xfs_sb *sbp; /* filesystem superblock copy in mount */ xfs_daddr_t d; /* address of last block of subvolume */ int error; - sbp = &mp->m_sb; - if (sbp->sb_rblocks == 0) + if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { xfs_warn(mp, "Filesystem has a realtime volume, use rtdev=device option"); return -ENODEV; } - mp->m_rsumlevels = sbp->sb_rextslog + 1; - mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, - mp->m_sb.sb_rbmblocks); - mp->m_rbmip = mp->m_rsumip = NULL; + + mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); + /* * Check that the realtime section is an ok size. */ @@ -1058,7 +1445,7 @@ xfs_rtmount_init( static int xfs_rtalloc_count_frextent( - struct xfs_mount *mp, + struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv) @@ -1080,12 +1467,18 @@ xfs_rtalloc_reinit_frextents( uint64_t val = 0; int error; - xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP); - error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent, - &val); - xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP); - if (error) - return error; + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + error = xfs_rtalloc_query_all(rtg, NULL, + xfs_rtalloc_count_frextent, &val); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); + if (error) { + xfs_rtgroup_rele(rtg); + return error; + } + } spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; @@ -1101,17 +1494,12 @@ xfs_rtalloc_reinit_frextents( */ static inline int xfs_rtmount_iread_extents( - struct xfs_inode *ip, - unsigned int lock_class) + struct xfs_trans *tp, + struct xfs_inode *ip) { - struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class); + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) @@ -1124,54 +1512,67 @@ xfs_rtmount_iread_extents( } out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class); - xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } +static int +xfs_rtmount_rtg( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_rtgroup *rtg) +{ + int error, i; + + for (i = 0; i < XFS_RTGI_MAX; i++) { + error = xfs_rtginode_load(rtg, i, tp); + if (error) + return error; + + if (rtg->rtg_inodes[i]) { + error = xfs_rtmount_iread_extents(tp, + rtg->rtg_inodes[i]); + if (error) + return error; + } + } + + return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); +} + /* * Get the bitmap and summary inodes and the summary cache into the mount * structure at mount time. 
*/ -int /* error */ +int xfs_rtmount_inodes( - xfs_mount_t *mp) /* file system mount structure */ + struct xfs_mount *mp) { - int error; /* error return value */ - xfs_sb_t *sbp; + struct xfs_trans *tp; + struct xfs_rtgroup *rtg = NULL; + int error; - sbp = &mp->m_sb; - error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); - if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - ASSERT(mp->m_rbmip != NULL); - error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP); - if (error) - goto out_rele_bitmap; - - error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); - if (xfs_metadata_is_sick(error)) - xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY); - if (error) - goto out_rele_bitmap; - ASSERT(mp->m_rsumip != NULL); - - error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM); - if (error) - goto out_rele_summary; + if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) { + error = xfs_rtginode_load_parent(tp); + if (error) + goto out_cancel; + } - error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); - if (error) - goto out_rele_summary; - return 0; + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_rtmount_rtg(mp, tp, rtg); + if (error) { + xfs_rtgroup_rele(rtg); + xfs_rtunmount_inodes(mp); + break; + } + } -out_rele_summary: - xfs_irele(mp->m_rsumip); -out_rele_bitmap: - xfs_irele(mp->m_rbmip); +out_cancel: + xfs_trans_cancel(tp); return error; } @@ -1179,11 +1580,11 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { - kvfree(mp->m_rsum_cache); - if (mp->m_rbmip) - xfs_irele(mp->m_rbmip); - if (mp->m_rsumip) - xfs_irele(mp->m_rsumip); + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) + xfs_rtunmount_rtg(rtg); + xfs_rtginode_irele(&mp->m_rtdirip); } /* @@ -1195,28 +1596,29 @@ xfs_rtunmount_inodes( */ static xfs_rtxnum_t xfs_rtpick_extent( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ + struct xfs_rtgroup *rtg, + struct xfs_trans *tp, xfs_rtxlen_t len) /* allocation length (rtextents) */ { - xfs_rtxnum_t b; /* result rtext */ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; + xfs_rtxnum_t b = 0; /* result rtext */ int log2; /* log of sequence number */ uint64_t resid; /* residual after log removed */ uint64_t seq; /* sequence number of file creation */ struct timespec64 ts; /* timespec in inode */ - xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - ts = inode_get_atime(VFS_I(mp->m_rbmip)); - if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { - mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; + ts = inode_get_atime(VFS_I(rbmip)); + if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { + rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; seq = 0; } else { seq = ts.tv_sec; } - if ((log2 = xfs_highbit64(seq)) == -1) - b = 0; - else { + log2 = xfs_highbit64(seq); + if (log2 != -1) { resid = seq - (1ULL << log2); b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> (log2 + 1); @@ -1226,8 +1628,8 @@ xfs_rtpick_extent( b = mp->m_sb.sb_rextents - len; } ts.tv_sec = seq + 1; - inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); - xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + inode_set_atime_to_ts(VFS_I(rbmip), ts); + xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE); return b; } @@ -1260,9 +1662,118 @@ xfs_rtalloc_align_minmax( *raminlen = newminlen; } +/* Given a free extent, find any part of it that isn't busy, 
if possible. */ +STATIC bool +xfs_rtalloc_check_busy( + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, + xfs_rtxlen_t minlen_rtx, + xfs_rtxlen_t maxlen_rtx, + xfs_rtxlen_t len_rtx, + xfs_rtxlen_t prod, + xfs_rtxnum_t rtx, + xfs_rtxlen_t *reslen, + xfs_rtxnum_t *resrtx, + unsigned *busy_gen) +{ + struct xfs_rtgroup *rtg = args->rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_agblock_t rgbno = xfs_rtx_to_rgbno(rtg, rtx); + xfs_rgblock_t min_rgbno = xfs_rtx_to_rgbno(rtg, start); + xfs_extlen_t minlen = xfs_rtxlen_to_extlen(mp, minlen_rtx); + xfs_extlen_t len = xfs_rtxlen_to_extlen(mp, len_rtx); + xfs_extlen_t diff; + bool busy; + + busy = xfs_extent_busy_trim(rtg_group(rtg), minlen, + xfs_rtxlen_to_extlen(mp, maxlen_rtx), &rgbno, &len, + busy_gen); + + /* + * If we have a largish extent that happens to start before min_rgbno, + * see if we can shift it into range... + */ + if (rgbno < min_rgbno && rgbno + len > min_rgbno) { + diff = min_rgbno - rgbno; + if (len > diff) { + rgbno += diff; + len -= diff; + } + } + + if (prod > 1 && len >= minlen) { + xfs_rgblock_t aligned_rgbno = roundup(rgbno, prod); + + diff = aligned_rgbno - rgbno; + + *resrtx = xfs_rgbno_to_rtx(mp, aligned_rgbno); + *reslen = xfs_extlen_to_rtxlen(mp, + diff >= len ? 0 : len - diff); + } else { + *resrtx = xfs_rgbno_to_rtx(mp, rgbno); + *reslen = xfs_extlen_to_rtxlen(mp, len); + } + + return busy; +} + +/* + * Adjust the given free extent so that it isn't busy, or flush the log and + * wait for the space to become unbusy. Only needed for rtgroups. + */ +STATIC int +xfs_rtallocate_adjust_for_busy( + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, + xfs_rtxlen_t minlen, + xfs_rtxlen_t maxlen, + xfs_rtxlen_t *len, + xfs_rtxlen_t prod, + xfs_rtxnum_t *rtx) +{ + xfs_rtxnum_t resrtx; + xfs_rtxlen_t reslen; + unsigned busy_gen; + bool busy; + int error; + +again: + busy = xfs_rtalloc_check_busy(args, start, minlen, maxlen, *len, prod, + *rtx, &reslen, &resrtx, &busy_gen); + if (!busy) + return 0; + + if (reslen < minlen || (start != 0 && resrtx != *rtx)) { + /* + * Enough of the extent was busy that we cannot satisfy the + * allocation, or this is a near allocation and the start of + * the extent is busy. Flush the log and wait for the busy + * situation to resolve. + */ + trace_xfs_rtalloc_extent_busy(args->rtg, start, minlen, maxlen, + *len, prod, *rtx, busy_gen); + + error = xfs_extent_busy_flush(args->tp, rtg_group(args->rtg), + busy_gen, 0); + if (error) + return error; + + goto again; + } + + /* Some of the free space wasn't busy, hand that back to the caller. */ + trace_xfs_rtalloc_extent_busy_trim(args->rtg, *rtx, *len, resrtx, + reslen); + *len = reslen; + *rtx = resrtx; + + return 0; +} + static int -xfs_rtallocate( +xfs_rtallocate_rtg( struct xfs_trans *tp, + xfs_rgnumber_t rgno, xfs_rtblock_t bno_hint, xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, @@ -1282,12 +1793,33 @@ xfs_rtallocate( xfs_rtxlen_t len = 0; int error = 0; + args.rtg = xfs_rtgroup_grab(args.mp, rgno); + if (!args.rtg) + return -ENOSPC; + /* - * Lock out modifications to both the RT bitmap and summary inodes. + * We need to lock out modifications to both the RT bitmap and summary + * inodes for finding free space in xfs_rtallocate_extent_{near,size} + * and join the bitmap and summary inodes for the actual allocation + * down in xfs_rtallocate_range. 
+ * + * For RTG-enabled file systems we don't want to join the inodes to the + * transaction until we are committed to allocating from this + * RTG so that only one inode of each type is locked at a time. + * + * But for pre-RTG file systems we already need to join the bitmap + * inode to the transaction for xfs_rtpick_extent, which bumps the + * sequence number in it, so we'll have to join the inode to the + * transaction early here. + * + * This is all a bit messy, but at least the mess is contained in + * this function. */ if (!*rtlocked) { - xfs_rtbitmap_lock(args.mp); - xfs_rtbitmap_trans_join(tp); + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP); + if (!xfs_has_rtgroups(args.mp)) + xfs_rtgroup_trans_join(tp, args.rtg, + XFS_RTGLOCK_BITMAP); *rtlocked = true; } @@ -1295,10 +1827,10 @@ * For an allocation to an empty file at offset 0, pick an extent that * will space things out in the rt area. */ - if (bno_hint) + if (bno_hint != NULLFSBLOCK) start = xfs_rtb_to_rtx(args.mp, bno_hint); - else if (initial_user_data) - start = xfs_rtpick_extent(args.mp, tp, maxlen); + else if (!xfs_has_rtgroups(args.mp) && initial_user_data) + start = xfs_rtpick_extent(args.rtg, tp, maxlen); if (start) { error = xfs_rtallocate_extent_near(&args, start, minlen, maxlen, @@ -1318,8 +1850,20 @@ prod, &rtx); } - if (error) + if (error) { + if (xfs_has_rtgroups(args.mp)) + goto out_unlock; goto out_release; + } + + if (xfs_has_rtgroups(args.mp)) { + error = xfs_rtallocate_adjust_for_busy(&args, start, minlen, + maxlen, &len, prod, &rtx); + if (error) + goto out_unlock; + + xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP); + } error = xfs_rtallocate_range(&args, rtx, len); if (error) @@ -1328,12 +1872,64 @@ xfs_trans_mod_sb(tp, wasdel ? XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, -(long)len); - *bno = xfs_rtx_to_rtb(args.mp, rtx); + *bno = xfs_rtx_to_rtb(args.rtg, rtx); *blen = xfs_rtxlen_to_extlen(args.mp, len); out_release: + xfs_rtgroup_rele(args.rtg); xfs_rtbuf_cache_relse(&args); return error; +out_unlock: + xfs_rtgroup_unlock(args.rtg, XFS_RTGLOCK_BITMAP); + *rtlocked = false; + goto out_release; +} + +static int +xfs_rtallocate_rtgs( + struct xfs_trans *tp, + xfs_fsblock_t bno_hint, + xfs_rtxlen_t minlen, + xfs_rtxlen_t maxlen, + xfs_rtxlen_t prod, + bool wasdel, + bool initial_user_data, + xfs_rtblock_t *bno, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rgnumber_t start_rgno, rgno; + int error; + + /* + * For now this just blindly iterates over the RTGs for an initial + * allocation. We could try to keep an in-memory rtg_longest member + * to avoid the locking when just looking for big enough free space, + * but for now this keeps things simple.
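+ * + * Without a locality hint the search starts at the group picked by the + * m_rtgrotor counter below and wraps around, moving on at -ENOSPC until + * every group has been tried; with a hint it starts at the hinted group.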
+ */ + if (bno_hint != NULLFSBLOCK) + start_rgno = xfs_rtb_to_rgno(mp, bno_hint); + else + start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) % + mp->m_sb.sb_rgcount; + + rgno = start_rgno; + do { + bool rtlocked = false; + + error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen, + prod, wasdel, initial_user_data, &rtlocked, + bno, blen); + if (error != -ENOSPC) + return error; + ASSERT(!rtlocked); + + if (++rgno == mp->m_sb.sb_rgcount) + rgno = 0; + bno_hint = NULLFSBLOCK; + } while (rgno != start_rgno); + + return -ENOSPC; } static int @@ -1430,9 +2026,16 @@ retry: if (xfs_bmap_adjacent(ap)) bno_hint = ap->blkno; - error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod, - ap->wasdel, initial_user_data, &rtlocked, - &ap->blkno, &ap->length); + if (xfs_has_rtgroups(ap->ip->i_mount)) { + error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen, + prod, ap->wasdel, initial_user_data, + &ap->blkno, &ap->length); + } else { + error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen, + prod, ap->wasdel, initial_user_data, + &rtlocked, &ap->blkno, &ap->length); + } + if (error == -ENOSPC) { if (!noalign) { /* diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index a6836da9bebe..8e2a07b8174b 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -12,6 +12,10 @@ struct xfs_mount; struct xfs_trans; #ifdef CONFIG_XFS_RT +/* rtgroup superblock initialization */ +int xfs_rtmount_readsb(struct xfs_mount *mp); +void xfs_rtmount_freesb(struct xfs_mount *mp); + /* * Initialize realtime fields in the mount structure. */ @@ -42,6 +46,8 @@ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_rtmount_readsb(mp) (0) +# define xfs_rtmount_freesb(mp) ((void)0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ed97d72caa66..ffb52725c2a8 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -115,10 +115,11 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) static int xqm_proc_show(struct seq_file *m, void *v) { - /* maximum; incore; ratio free to inuse; freelist */ - seq_printf(m, "%d\t%d\t%d\t%u\n", + /* maximum; incore; ratio free to inuse; freelist; rtquota */ + seq_printf(m, "%d\t%d\t%d\t%u\t%s\n", 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT), - 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1)); + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1), + IS_ENABLED(CONFIG_XFS_RT) ? "rtquota" : "quota"); return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fbb3a1594c0d..394fdf3bb535 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -45,6 +45,7 @@ #include "xfs_rtbitmap.h" #include "xfs_exchmaps_item.h" #include "xfs_parent.h" +#include "xfs_rtalloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -66,6 +67,9 @@ enum xfs_dax_mode { XFS_DAX_NEVER = 2, }; +/* Were quota mount options provided? Must use the upper 16 bits of qflags. 
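+ * The flag only records that quota mount options were parsed; it is masked + * back out of m_qflags once parsing is done, before the flags are used as + * real quota state.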
*/ +#define XFS_QFLAGS_MNTOPTS (1U << 31) + static void xfs_mount_set_dax_mode( struct xfs_mount *mp, @@ -238,7 +242,7 @@ xfs_set_inode_alloc_perag( xfs_ino_t ino, xfs_agnumber_t max_metadata) { - if (!xfs_is_inode32(pag->pag_mount)) { + if (!xfs_is_inode32(pag_mount(pag))) { set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); return false; @@ -251,7 +255,7 @@ xfs_set_inode_alloc_perag( } set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate); - if (pag->pag_agno < max_metadata) + if (pag_agno(pag) < max_metadata) set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); else clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate); @@ -873,21 +877,21 @@ xfs_fs_statfs( ffree = statp->f_files - (icount - ifree); statp->f_ffree = max_t(int64_t, ffree, 0); - - if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && - ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == - (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) - xfs_qm_statvfs(ip, statp); - if (XFS_IS_REALTIME_MOUNT(mp) && (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { s64 freertx; statp->f_blocks = sbp->sb_rblocks; freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx); + statp->f_bavail = statp->f_bfree = + xfs_rtbxlen_to_blen(mp, freertx); } + if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; } @@ -1144,6 +1148,7 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); xfs_unmountfs(mp); + xfs_rtmount_freesb(mp); xfs_freesb(mp); xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); @@ -1261,6 +1266,8 @@ xfs_fs_parse_param( int size = 0; int opt; + BUILD_BUG_ON(XFS_QFLAGS_MNTOPTS & XFS_MOUNT_QUOTA_ALL); + opt = fs_parse(fc, xfs_fs_parameters, param, &result); if (opt < 0) return opt; @@ -1338,32 +1345,39 @@ xfs_fs_parse_param( case Opt_noquota: parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_qnoenforce: case Opt_uqnoenforce: parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pquota: case Opt_prjquota: parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_pqnoenforce: parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gquota: case Opt_grpquota: parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_gqnoenforce: parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; + parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS; return 0; case Opt_discard: parsing_mp->m_features |= XFS_FEAT_DISCARD; @@ -1430,7 +1444,8 @@ xfs_fs_validate_params( return -EINVAL; } - if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) { + if (!IS_ENABLED(CONFIG_XFS_QUOTA) && + (mp->m_qflags & ~XFS_QFLAGS_MNTOPTS)) { xfs_warn(mp, "quota support not available in this kernel."); return -EINVAL; } @@ -1657,9 +1672,7 @@ xfs_fs_fill_super( goto out_free_sb; } - xfs_warn(mp, 
-"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.", - mp->m_sb.sb_blocksize); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LBS); } /* Ensure this filesystem fits in the page cache limits */ @@ -1691,10 +1704,14 @@ xfs_fs_fill_super( goto out_free_sb; } - error = xfs_filestream_mount(mp); + error = xfs_rtmount_readsb(mp); if (error) goto out_free_sb; + error = xfs_filestream_mount(mp); + if (error) + goto out_free_rtsb; + /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. @@ -1733,6 +1750,9 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } + if (xfs_has_metadir(mp)) + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, @@ -1755,12 +1775,18 @@ xfs_fs_fill_super( } if (xfs_has_exchange_range(mp)) - xfs_warn(mp, - "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE); if (xfs_has_parent(mp)) - xfs_warn(mp, - "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR); + + /* + * If no quota mount options were provided, maybe we'll try to pick + * up the quota accounting and enforcement flags from the ondisk sb. + */ + if (!(mp->m_qflags & XFS_QFLAGS_MNTOPTS)) + xfs_set_resuming_quotaon(mp); + mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; error = xfs_mountfs(mp); if (error) @@ -1781,6 +1807,8 @@ xfs_fs_fill_super( out_filestream_unmount: xfs_filestream_unmount(mp); + out_free_rtsb: + xfs_rtmount_freesb(mp); out_free_sb: xfs_freesb(mp); out_free_scrub_stats: @@ -1800,7 +1828,7 @@ xfs_fs_fill_super( out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - goto out_free_sb; + goto out_free_rtsb; } static int @@ -1946,6 +1974,8 @@ xfs_fs_reconfigure( int flags = fc->sb_flags; int error; + new_mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS; + /* version 5 superblocks always support version counters. */ if (xfs_has_crc(mp)) fc->sb_flags |= SB_I_VERSION; @@ -2011,17 +2041,20 @@ static const struct fs_context_operations xfs_context_ops = { * mount option parsing having already been performed as this can be called from * fsopen() before any parameters have been set. 
*/ -static int xfs_init_fs_context( +static int +xfs_init_fs_context( struct fs_context *fc) { struct xfs_mount *mp; + int i; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); if (!mp) return -ENOMEM; spin_lock_init(&mp->m_sb_lock); - xa_init(&mp->m_perags); + for (i = 0; i < XG_TYPE_MAX; i++) + xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); @@ -2063,7 +2096,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 2af9f274e872..8f530e69c18a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -11,6 +11,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_group.h" #include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -32,6 +33,7 @@ #include "xfs_fsmap.h" #include "xfs_btree_staging.h" #include "xfs_icache.h" +#include "xfs_iunlink_item.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_error.h" @@ -44,6 +46,9 @@ #include "xfs_parent.h" #include "xfs_rmap.h" #include "xfs_refcount.h" +#include "xfs_metafile.h" +#include "xfs_metadir.h" +#include "xfs_rtgroup.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ee9f0b1f548d..7b16cdd72e9d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -72,8 +72,11 @@ struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_refcount_irec; struct xfs_fsmap; +struct xfs_fsmap_irec; +struct xfs_group; struct xfs_rmap_irec; struct xfs_icreate_log; +struct xfs_iunlink_item; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; @@ -93,6 +96,8 @@ struct xfs_attrlist_cursor_kern; struct xfs_extent_free_item; struct xfs_rmap_intent; struct xfs_refcount_intent; +struct xfs_metadir_update; +struct xfs_rtgroup; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -181,7 +186,7 @@ TRACE_EVENT(xlog_intent_recovery_failed, ); DECLARE_EVENT_CLASS(xfs_perag_class, - TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), + TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), TP_ARGS(pag, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) @@ -191,10 +196,11 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->refcount = atomic_read(&pag->pag_ref); - __entry->active_refcount = atomic_read(&pag->pag_active_ref); + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->refcount = atomic_read(&pag->pag_group.xg_ref); + __entry->active_refcount = + atomic_read(&pag->pag_group.xg_active_ref); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS", @@ -207,18 +213,54 @@ DECLARE_EVENT_CLASS(xfs_perag_class, #define DEFINE_PERAG_REF_EVENT(name) \ DEFINE_EVENT(xfs_perag_class, name, \ - TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \ + TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), \ TP_ARGS(pag, caller_ip)) -DEFINE_PERAG_REF_EVENT(xfs_perag_get); -DEFINE_PERAG_REF_EVENT(xfs_perag_hold); 
-DEFINE_PERAG_REF_EVENT(xfs_perag_put); -DEFINE_PERAG_REF_EVENT(xfs_perag_grab); -DEFINE_PERAG_REF_EVENT(xfs_perag_grab_next_tag); -DEFINE_PERAG_REF_EVENT(xfs_perag_rele); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count); +TRACE_DEFINE_ENUM(XG_TYPE_AG); +TRACE_DEFINE_ENUM(XG_TYPE_RTG); + +DECLARE_EVENT_CLASS(xfs_group_class, + TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), + TP_ARGS(xg, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_group_type, type) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(int, active_refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; + __entry->refcount = atomic_read(&xg->xg_ref); + __entry->active_refcount = atomic_read(&xg->xg_active_ref); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d %sno 0x%x passive refs %d active refs %d caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->agno, + __entry->refcount, + __entry->active_refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_GROUP_REF_EVENT(name) \ +DEFINE_EVENT(xfs_group_class, name, \ + TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \ + TP_ARGS(xg, caller_ip)) +DEFINE_GROUP_REF_EVENT(xfs_group_get); +DEFINE_GROUP_REF_EVENT(xfs_group_hold); +DEFINE_GROUP_REF_EVENT(xfs_group_put); +DEFINE_GROUP_REF_EVENT(xfs_group_grab); +DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); +DEFINE_GROUP_REF_EVENT(xfs_group_rele); + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -299,15 +341,15 @@ TRACE_EVENT(xfs_inodegc_shrinker_scan, ); DECLARE_EVENT_CLASS(xfs_ag_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), - TP_ARGS(mp, agno), + TP_PROTO(const struct xfs_perag *pag), + TP_ARGS(pag), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); ), TP_printk("dev %d:%d agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -315,8 +357,8 @@ DECLARE_EVENT_CLASS(xfs_ag_class, ); #define DEFINE_AG_EVENT(name) \ DEFINE_EVENT(xfs_ag_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ - TP_ARGS(mp, agno)) + TP_PROTO(const struct xfs_perag *pag), \ + TP_ARGS(pag)) DEFINE_AG_EVENT(xfs_read_agf); DEFINE_AG_EVENT(xfs_alloc_read_agf); @@ -662,7 +704,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) @@ -671,9 +713,9 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; - __entry->agno = pag->pag_agno; + __entry->agno = pag_agno(pag); __entry->streams = atomic_read(&pag->pagf_fstrms); ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", @@ -684,15 +726,15 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - 
TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \ + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), \ TP_ARGS(pag, ino)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), - TP_ARGS(pag, ino, free), + TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -701,16 +743,11 @@ TRACE_EVENT(xfs_filestream_pick, __field(xfs_extlen_t, free) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->dev = pag_mount(pag)->m_super->s_dev; __entry->ino = ino; - if (pag) { - __entry->agno = pag->pag_agno; - __entry->streams = atomic_read(&pag->pagf_fstrms); - } else { - __entry->agno = NULLAGNUMBER; - __entry->streams = 0; - } - __entry->free = free; + __entry->agno = pag_agno(pag); + __entry->streams = atomic_read(&pag->pagf_fstrms); + __entry->free = pag->pagf_freeblks; ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -827,28 +864,32 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); -TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), - TP_ARGS(ip, order, write_fault), +DECLARE_EVENT_CLASS(xfs_fault_class, + TP_PROTO(struct xfs_inode *ip, unsigned int order), + TP_ARGS(ip, order), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, order) - __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->order = order; - __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->order, - __entry->write_fault) + __entry->order) ) +#define DEFINE_FAULT_EVENT(name) \ +DEFINE_EVENT(xfs_fault_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int order), \ + TP_ARGS(ip, order)) +DEFINE_FAULT_EVENT(xfs_read_fault); +DEFINE_FAULT_EVENT(xfs_write_fault); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -899,9 +940,10 @@ TRACE_EVENT(xfs_iomap_prealloc_size, ) TRACE_EVENT(xfs_irec_merge_pre, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), - TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_inobt_rec_incore *rec, + const struct xfs_inobt_rec_incore *nrec), + TP_ARGS(pag, rec, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -911,12 +953,12 @@ TRACE_EVENT(xfs_irec_merge_pre, __field(uint16_t, nholemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; - __entry->holemask = holemask; - __entry->nagino = nagino; - __entry->nholemask = holemask; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->nagino = nrec->ir_startino; + __entry->nholemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x", 
MAJOR(__entry->dev), MINOR(__entry->dev), @@ -928,9 +970,9 @@ TRACE_EVENT(xfs_irec_merge_pre, ) TRACE_EVENT(xfs_irec_merge_post, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - uint16_t holemask), - TP_ARGS(mp, agno, agino, holemask), + TP_PROTO(const struct xfs_perag *pag, + const struct xfs_inobt_rec_incore *nrec), + TP_ARGS(pag, nrec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -938,10 +980,10 @@ TRACE_EVENT(xfs_irec_merge_post, __field(uint16_t, holemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; - __entry->holemask = holemask; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); + __entry->agino = nrec->ir_startino; + __entry->holemask = nrec->ir_holemask; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x", MAJOR(__entry->dev), @@ -1639,44 +1681,48 @@ TRACE_EVENT(xfs_bunmap, ); DECLARE_EVENT_CLASS(xfs_extent_busy_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ DEFINE_EVENT(xfs_extent_busy_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_BUSY_EVENT(xfs_extent_busy); DEFINE_BUSY_EVENT(xfs_extent_busy_force); DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); DEFINE_BUSY_EVENT(xfs_extent_busy_clear); TRACE_EVENT(xfs_extent_busy_trim, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, - xfs_agblock_t tbno, xfs_extlen_t tlen), - TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(xg, agbno, len, tbno, tlen), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) @@ -1684,22 +1730,99 @@ TRACE_EVENT(xfs_extent_busy_trim, __field(xfs_extlen_t, tlen) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->tbno = tbno; __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", MAJOR(__entry->dev), 
MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agbno, __entry->len, __entry->tbno, __entry->tlen) ); +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xfs_rtalloc_extent_busy, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t start, + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, + xfs_rtxlen_t len, xfs_rtxlen_t prod, xfs_rtxnum_t rtx, + unsigned busy_gen), + TP_ARGS(rtg, start, minlen, maxlen, len, prod, rtx, busy_gen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rtxnum_t, start) + __field(xfs_rtxlen_t, minlen) + __field(xfs_rtxlen_t, maxlen) + __field(xfs_rtxlen_t, mod) + __field(xfs_rtxlen_t, prod) + __field(xfs_rtxlen_t, len) + __field(xfs_rtxnum_t, rtx) + __field(unsigned, busy_gen) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->start = start; + __entry->minlen = minlen; + __entry->maxlen = maxlen; + __entry->prod = prod; + __entry->len = len; + __entry->rtx = rtx; + __entry->busy_gen = busy_gen; + ), + TP_printk("dev %d:%d rgno 0x%x startrtx 0x%llx minlen %u maxlen %u " + "prod %u len %u rtx 0%llx busy_gen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->start, + __entry->minlen, + __entry->maxlen, + __entry->prod, + __entry->len, + __entry->rtx, + __entry->busy_gen) +) + +TRACE_EVENT(xfs_rtalloc_extent_busy_trim, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t old_rtx, + xfs_rtxlen_t old_len, xfs_rtxnum_t new_rtx, + xfs_rtxlen_t new_len), + TP_ARGS(rtg, old_rtx, old_len, new_rtx, new_len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rtxnum_t, old_rtx) + __field(xfs_rtxnum_t, new_rtx) + __field(xfs_rtxlen_t, old_len) + __field(xfs_rtxlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->old_rtx = old_rtx; + __entry->old_len = old_len; + __entry->new_rtx = new_rtx; + __entry->new_len = new_len; + ), + TP_printk("dev %d:%d rgno 0x%x rtx 0x%llx rtxcount 0x%x -> rtx 0x%llx rtxcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->old_rtx, + __entry->old_len, + __entry->new_rtx, + __entry->new_len) +); +#endif /* CONFIG_XFS_RT */ + DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), @@ -1763,10 +1886,10 @@ DEFINE_AGF_EVENT(xfs_agf); DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), + TP_ARGS(pag, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -1777,8 +1900,8 @@ TRACE_EVENT(xfs_free_extent, __field(int, haveright) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->agbno = agbno; __entry->len = len; __entry->resv = resv; @@ -2431,23 +2554,26 @@ DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel); DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover); DECLARE_EVENT_CLASS(xfs_discard_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, 
xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, + xfs_extlen_t len), + TP_ARGS(xg, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->agno = xg->xg_gno; __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len) @@ -2455,9 +2581,9 @@ DECLARE_EVENT_CLASS(xfs_discard_class, #define DEFINE_DISCARD_EVENT(name) \ DEFINE_EVENT(xfs_discard_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ + xfs_extlen_t len), \ + TP_ARGS(xg, agbno, len)) DEFINE_DISCARD_EVENT(xfs_discard_extent); DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); @@ -2547,7 +2673,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: @@ -2717,6 +2843,7 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, TP_ARGS(mp, free), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) @@ -2724,13 +2851,16 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + __entry->type = free->xefi_group->xg_type; + __entry->agno = free->xefi_group->xg_gno; + __entry->agbno = xfs_fsb_to_gbno(mp, free->xefi_startblock, + free->xefi_group->xg_type); __entry->len = free->xefi_blockcount; __entry->flags = free->xefi_flags; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, __entry->agbno, __entry->len, @@ -2740,7 +2870,6 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \ TP_ARGS(mp, free)) -DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer); DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred); @@ -2803,7 +2932,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->owner = oinfo->oi_owner; @@ -2848,7 +2977,7 @@ DECLARE_EVENT_CLASS(xfs_btree_error_class, __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; + 
__entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: @@ -2902,7 +3031,7 @@ TRACE_EVENT(xfs_rmap_convert_state, __entry->ino = cur->bc_ino.ip->i_ino; break; case XFS_BTREE_TYPE_AG: - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->ino = 0; break; case XFS_BTREE_TYPE_MEM: @@ -2937,7 +3066,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = agbno; __entry->len = len; __entry->owner = owner; @@ -3037,11 +3166,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, TP_ARGS(bi), TP_STRUCT__entry( __field(dev_t, dev) - __field(dev_t, opdev) + __field(enum xfs_group_type, type) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) - __field(xfs_agblock_t, agbno) - __field(xfs_fsblock_t, rtbno) + __field(unsigned long long, gbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) @@ -3050,20 +3178,25 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, ), TP_fast_assign( struct xfs_inode *ip = bi->bi_owner; + struct xfs_mount *mp = ip->i_mount; - __entry->dev = ip->i_mount->m_super->s_dev; - if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) { - __entry->agno = 0; - __entry->agbno = 0; - __entry->rtbno = bi->bi_bmap.br_startblock; - __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev; + __entry->dev = mp->m_super->s_dev; + __entry->type = bi->bi_group->xg_type; + __entry->agno = bi->bi_group->xg_gno; + if (bi->bi_group->xg_type == XG_TYPE_RTG && + !xfs_has_rtgroups(mp)) { + /* + * Legacy rt filesystems do not have allocation groups + * ondisk. We emulate this incore with one gigantic + * rtgroup whose size can exceed a 32-bit block number. + * For this tracepoint, we report group 0 and a 64-bit + * group block number. 
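+ * Filesystems with the rtgroups feature (and all data device mappings) + * take the else branch below and report the group-relative block number + * from xfs_fsb_to_gbno.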
+ */ + __entry->gbno = bi->bi_bmap.br_startblock; } else { - __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, - bi->bi_bmap.br_startblock); - __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, - bi->bi_bmap.br_startblock); - __entry->rtbno = 0; - __entry->opdev = __entry->dev; + __entry->gbno = xfs_fsb_to_gbno(mp, + bi->bi_bmap.br_startblock, + bi->bi_group->xg_type); } __entry->ino = ip->i_ino; __entry->whichfork = bi->bi_whichfork; @@ -3072,14 +3205,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, __entry->l_state = bi->bi_bmap.br_state; __entry->op = bi->bi_type; ), - TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + TP_printk("dev %d:%d op %s ino 0x%llx %sno 0x%x gbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), - MAJOR(__entry->opdev), MINOR(__entry->opdev), __entry->ino, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->agno, - __entry->agbno, - __entry->rtbno, + __entry->gbno, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, @@ -3110,8 +3242,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_fast_assign( struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->resv = resv; __entry->freeblks = pag->pagf_freeblks; __entry->flcount = pag->pagf_flcount; @@ -3144,11 +3276,10 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); -/* simple AG-based error/%ip tracepoint class */ -DECLARE_EVENT_CLASS(xfs_ag_error_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, +TRACE_EVENT(xfs_ag_resv_init_error, + TP_PROTO(const struct xfs_perag *pag, int error, unsigned long caller_ip), - TP_ARGS(mp, agno, error, caller_ip), + TP_ARGS(pag, error, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -3156,8 +3287,8 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->error = error; __entry->caller_ip = caller_ip; ), @@ -3168,13 +3299,6 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, (char *)__entry->caller_ip) ); -#define DEFINE_AG_ERROR_EVENT(name) \ -DEFINE_EVENT(xfs_ag_error_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \ - unsigned long caller_ip), \ - TP_ARGS(mp, agno, error, caller_ip)) -DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); - /* refcount tracepoint classes */ DECLARE_EVENT_CLASS(xfs_refcount_class, @@ -3189,7 +3313,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = agbno; __entry->len = len; ), @@ -3220,7 +3344,7 @@ TRACE_EVENT(xfs_refcount_lookup, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = agbno; __entry->dir = dir; ), @@ -3246,7 +3370,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = 
cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -3282,7 +3406,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->domain = irec->rc_domain; __entry->startblock = irec->rc_startblock; __entry->blockcount = irec->rc_blockcount; @@ -3324,7 +3448,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3374,7 +3498,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3429,7 +3553,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->i1_domain = i1->rc_domain; __entry->i1_startblock = i1->rc_startblock; __entry->i1_blockcount = i1->rc_blockcount; @@ -3843,7 +3967,45 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); /* fsmap traces */ -DECLARE_EVENT_CLASS(xfs_fsmap_class, +TRACE_EVENT(xfs_fsmap_mapping, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, + const struct xfs_fsmap_irec *frec), + TP_ARGS(mp, keydev, agno, frec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_daddr_t, start_daddr) + __field(xfs_daddr_t, len_daddr) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->agbno = frec->rec_key; + __entry->start_daddr = frec->start_daddr; + __entry->len_daddr = frec->len_daddr; + __entry->owner = frec->owner; + __entry->offset = frec->offset; + __entry->flags = frec->rm_flags; + ), + TP_printk("dev %d:%d keydev %d:%d agno 0x%x rmapbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->agbno, + __entry->start_daddr, + __entry->len_daddr, + __entry->owner, + __entry->offset, + __entry->flags) +); + +DECLARE_EVENT_CLASS(xfs_fsmap_group_key_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, const struct xfs_rmap_irec *rmap), TP_ARGS(mp, keydev, agno, rmap), @@ -3851,8 +4013,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __field(dev_t, dev) __field(dev_t, keydev) __field(xfs_agnumber_t, agno) - __field(xfs_fsblock_t, bno) - __field(xfs_filblks_t, len) + __field(xfs_agblock_t, agbno) __field(uint64_t, owner) __field(uint64_t, offset) __field(unsigned int, flags) @@ -3861,33 +4022,30 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __entry->dev = mp->m_super->s_dev; __entry->keydev = 
new_decode_dev(keydev); __entry->agno = agno; - __entry->bno = rmap->rm_startblock; - __entry->len = rmap->rm_blockcount; + __entry->agbno = rmap->rm_startblock; __entry->owner = rmap->rm_owner; __entry->offset = rmap->rm_offset; __entry->flags = rmap->rm_flags; ), - TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", + TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, - __entry->bno, - __entry->len, + __entry->agbno, __entry->owner, __entry->offset, __entry->flags) ) -#define DEFINE_FSMAP_EVENT(name) \ -DEFINE_EVENT(xfs_fsmap_class, name, \ +#define DEFINE_FSMAP_GROUP_KEY_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_group_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ const struct xfs_rmap_irec *rmap), \ TP_ARGS(mp, keydev, agno, rmap)) -DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); -DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); -DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); +DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_low_group_key); +DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_high_group_key); -DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, - TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), +DECLARE_EVENT_CLASS(xfs_fsmap_linear_key_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_fsblock_t bno), TP_ARGS(mp, keydev, bno), TP_STRUCT__entry( __field(dev_t, dev) @@ -3904,12 +4062,12 @@ DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->bno) ) -#define DEFINE_FSMAP_LINEAR_EVENT(name) \ -DEFINE_EVENT(xfs_fsmap_linear_class, name, \ +#define DEFINE_FSMAP_LINEAR_KEY_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_linear_key_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ TP_ARGS(mp, keydev, bno)) -DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear); -DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear); +DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_low_linear_key); +DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_high_linear_key); DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), @@ -4041,9 +4199,9 @@ DEFINE_TRANS_EVENT(xfs_trans_commit_items); DEFINE_TRANS_EVENT(xfs_trans_free_items); TRACE_EVENT(xfs_iunlink_update_bucket, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, xfs_agino_t old_ptr, xfs_agino_t new_ptr), - TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_ARGS(pag, bucket, old_ptr, new_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4052,8 +4210,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __field(xfs_agino_t, new_ptr) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->bucket = bucket; __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; @@ -4067,9 +4225,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket, ); TRACE_EVENT(xfs_iunlink_update_dinode, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, - xfs_agino_t old_ptr, xfs_agino_t new_ptr), - TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_PROTO(const struct xfs_iunlink_item *iup, xfs_agino_t old_ptr), + TP_ARGS(iup, old_ptr), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4078,11 +4235,12 @@ 
TRACE_EVENT(xfs_iunlink_update_dinode, __field(xfs_agino_t, new_ptr) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agino = agino; + __entry->dev = pag_mount(iup->pag)->m_super->s_dev; + __entry->agno = pag_agno(iup->pag); + __entry->agino = + XFS_INO_TO_AGINO(iup->ip->i_mount, iup->ip->i_ino); __entry->old_ptr = old_ptr; - __entry->new_ptr = new_ptr; + __entry->new_ptr = iup->next_agino; ), TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -4185,37 +4343,35 @@ DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); -DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); -DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags), - TP_ARGS(mp, agno, flags), +DECLARE_EVENT_CLASS(xfs_group_corrupt_class, + TP_PROTO(const struct xfs_group *xg, unsigned int flags), + TP_ARGS(xg, flags), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_agnumber_t, agno) + __field(enum xfs_group_type, type) + __field(uint32_t, index) __field(unsigned int, flags) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->index = xg->xg_gno; __entry->flags = flags; ), - TP_printk("dev %d:%d agno 0x%x flags 0x%x", + TP_printk("dev %d:%d %sno 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, __entry->flags) + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->index, __entry->flags) ); -#define DEFINE_AG_CORRUPT_EVENT(name) \ -DEFINE_EVENT(xfs_ag_corrupt_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - unsigned int flags), \ - TP_ARGS(mp, agno, flags)) -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); -DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); +#define DEFINE_GROUP_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_group_corrupt_class, name, \ + TP_PROTO(const struct xfs_group *xg, unsigned int flags), \ + TP_ARGS(xg, flags)) +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_sick); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_corrupt); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_healthy); +DEFINE_GROUP_CORRUPT_EVENT(xfs_group_unfixed_corruption); DECLARE_EVENT_CLASS(xfs_inode_corrupt_class, TP_PROTO(struct xfs_inode *ip, unsigned int flags), @@ -4243,29 +4399,10 @@ DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption); -TRACE_EVENT(xfs_iwalk_ag, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino), - TP_ARGS(mp, agno, startino), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startino = startino; - ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->startino) -) - TRACE_EVENT(xfs_iwalk_ag_rec, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + TP_PROTO(const struct xfs_perag *pag, \ struct 
xfs_inobt_rec_incore *irec), - TP_ARGS(mp, agno, irec), + TP_ARGS(pag, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -4273,8 +4410,8 @@ TRACE_EVENT(xfs_iwalk_ag_rec, __field(uint64_t, freemask) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag_mount(pag)->m_super->s_dev; + __entry->agno = pag_agno(pag); __entry->startino = irec->ir_startino; __entry->freemask = irec->ir_free; ), @@ -4336,7 +4473,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __assign_str(name); - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; @@ -4451,7 +4588,7 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { - __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->agno = cur->bc_group->xg_gno; __entry->agbno = be32_to_cpu(ptr->s); } __entry->nr_records = nr_records; @@ -4676,35 +4813,39 @@ TRACE_EVENT(xfs_force_shutdown, ); #ifdef CONFIG_XFS_DRAIN_INTENTS -DECLARE_EVENT_CLASS(xfs_perag_intents_class, - TP_PROTO(struct xfs_perag *pag, void *caller_ip), - TP_ARGS(pag, caller_ip), +DECLARE_EVENT_CLASS(xfs_group_intents_class, + TP_PROTO(const struct xfs_group *xg, void *caller_ip), + TP_ARGS(xg, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_agnumber_t, agno) + __field(enum xfs_group_type, type) + __field(uint32_t, index) __field(long, nr_intents) __field(void *, caller_ip) ), TP_fast_assign( - __entry->dev = pag->pag_mount->m_super->s_dev; - __entry->agno = pag->pag_agno; - __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->dev = xg->xg_mount->m_super->s_dev; + __entry->type = xg->xg_type; + __entry->index = xg->xg_gno; + __entry->nr_intents = + atomic_read(&xg->xg_intents_drain.dr_count); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + TP_printk("dev %d:%d %sno 0x%x intents %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->index, __entry->nr_intents, __entry->caller_ip) ); -#define DEFINE_PERAG_INTENTS_EVENT(name) \ -DEFINE_EVENT(xfs_perag_intents_class, name, \ - TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ - TP_ARGS(pag, caller_ip)) -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); -DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); +#define DEFINE_GROUP_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_group_intents_class, name, \ + TP_PROTO(const struct xfs_group *xg, void *caller_ip), \ + TP_ARGS(xg, caller_ip)) +DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_hold); +DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_rele); +DEFINE_GROUP_INTENTS_EVENT(xfs_group_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ @@ -5332,6 +5473,107 @@ DEFINE_EVENT(xfs_getparents_class, name, \ DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin); DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end); +DECLARE_EVENT_CLASS(xfs_metadir_update_class, + TP_PROTO(const struct xfs_metadir_update *upd), + TP_ARGS(upd), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __string(fname, upd->path) + ), + TP_fast_assign( + __entry->dev = upd->dp->i_mount->m_super->s_dev; + 
__entry->dp_ino = upd->dp->i_ino; + __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO; + __assign_str(fname); + ), + TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(fname), + __entry->ino) +) + +#define DEFINE_METADIR_UPDATE_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_update_class, name, \ + TP_PROTO(const struct xfs_metadir_update *upd), \ + TP_ARGS(upd)) +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_link); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_commit); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_cancel); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_try_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_create); +DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_link); + +DECLARE_EVENT_CLASS(xfs_metadir_update_error_class, + TP_PROTO(const struct xfs_metadir_update *upd, int error), + TP_ARGS(upd, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __field(int, error) + __string(fname, upd->path) + ), + TP_fast_assign( + __entry->dev = upd->dp->i_mount->m_super->s_dev; + __entry->dp_ino = upd->dp->i_ino; + __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO; + __entry->error = error; + __assign_str(fname); + ), + TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(fname), + __entry->ino, + __entry->error) +) + +#define DEFINE_METADIR_UPDATE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_update_error_class, name, \ + TP_PROTO(const struct xfs_metadir_update *upd, int error), \ + TP_ARGS(upd, error)) +DEFINE_METADIR_UPDATE_ERROR_EVENT(xfs_metadir_teardown); + +DECLARE_EVENT_CLASS(xfs_metadir_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, + xfs_ino_t ino), + TP_ARGS(dp, name, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(xfs_ino_t, ino) + __field(int, ftype) + __field(int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + __entry->ino = ino, + __entry->ftype = name->type; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx type %s name '%.*s' ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->namelen, + __get_str(name), + __entry->ino) +) + +#define DEFINE_METADIR_EVENT(name) \ +DEFINE_EVENT(xfs_metadir_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, \ + xfs_ino_t ino), \ + TP_ARGS(dp, name, ino)) +DEFINE_METADIR_EVENT(xfs_metadir_lookup); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bdf3704dc301..4cd25717c9d1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -25,6 +25,8 @@ #include "xfs_dquot.h" #include "xfs_icache.h" #include "xfs_rtbitmap.h" +#include "xfs_rtgroup.h" +#include "xfs_sb.h" struct kmem_cache *xfs_trans_cache; @@ -67,7 +69,7 @@ xfs_trans_free( struct xfs_trans *tp) { xfs_extent_busy_sort(&tp->t_busy); - xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_clear(&tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); xfs_trans_clear_context(tp); @@ -420,6 +422,8 @@ xfs_trans_mod_sb( ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res); } tp->t_frextents_delta += delta; + if (xfs_has_rtgroups(mp)) 
+ flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FREXTENTS: /* @@ -429,6 +433,8 @@ xfs_trans_mod_sb( */ ASSERT(delta < 0); tp->t_res_frextents_delta += delta; + if (xfs_has_rtgroups(mp)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_DBLOCKS: tp->t_dblocks_delta += delta; @@ -455,6 +461,10 @@ xfs_trans_mod_sb( case XFS_TRANS_SB_REXTSLOG: tp->t_rextslog_delta += delta; break; + case XFS_TRANS_SB_RGCOUNT: + ASSERT(delta > 0); + tp->t_rgcount_delta += delta; + break; default: ASSERT(0); return; @@ -497,20 +507,22 @@ xfs_trans_apply_sb_deltas( } /* - * Updating frextents requires careful handling because it does not - * behave like the lazysb counters because we cannot rely on log - * recovery in older kenels to recompute the value from the rtbitmap. - * This means that the ondisk frextents must be consistent with the - * rtbitmap. + * sb_frextents was added to the lazy sb counters when the rt groups + * feature was introduced. This is possible because we know that all + * kernels supporting rtgroups will also recompute frextents from the + * realtime bitmap. + * + * For older file systems, updating frextents requires careful handling + * because we cannot rely on log recovery in older kernels to recompute + * the value from the rtbitmap. This means that the ondisk frextents + * must be consistent with the rtbitmap. * * Therefore, log the frextents change to the ondisk superblock and * update the incore superblock so that future calls to xfs_log_sb * write the correct value ondisk. - * - * Don't touch m_frextents because it includes incore reservations, - * and those are handled by the unreserve function. */ - if (tp->t_frextents_delta || tp->t_res_frextents_delta) { + if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && + !xfs_has_rtgroups(tp->t_mountp)) { struct xfs_mount *mp = tp->t_mountp; int64_t rtxdelta; @@ -536,6 +548,18 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rextsize_delta) { be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); + + /* + * Because the ondisk sb records rtgroup size in units of rt + * extents, any time we update the rt extent size we have to + * recompute the ondisk rtgroup block log. The incore values + * will be recomputed in xfs_trans_unreserve_and_mod_sb. + */ + if (xfs_has_rtgroups(tp->t_mountp)) { + sbp->sb_rgblklog = xfs_compute_rgblklog( + be32_to_cpu(sbp->sb_rgextents), + be32_to_cpu(sbp->sb_rextsize)); + } whole = 1; } if (tp->t_rbmblocks_delta) { @@ -554,6 +578,10 @@ xfs_trans_apply_sb_deltas( sbp->sb_rextslog += tp->t_rextslog_delta; whole = 1; } + if (tp->t_rgcount_delta) { + be32_add_cpu(&sbp->sb_rgcount, tp->t_rgcount_delta); + whole = 1; + } xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); if (whole) @@ -618,7 +646,7 @@ xfs_trans_unreserve_and_mod_sb( } ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0); - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (xfs_has_rtgroups(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { rtxdelta += tp->t_frextents_delta; ASSERT(rtxdelta >= 0); } @@ -651,23 +679,21 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_icount += idelta; mp->m_sb.sb_ifree += ifreedelta; /* - * Do not touch sb_frextents here because we are dealing with incore - * reservation. sb_frextents is not part of the lazy sb counters so it - * must be consistent with the ondisk rtbitmap and must never include - * incore reservations. + * Do not touch sb_frextents here because it is handled in + * xfs_trans_apply_sb_deltas for file systems where it isn't a lazy + * counter anyway. 
*/ mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; - mp->m_sb.sb_rextsize += tp->t_rextsize_delta; - if (tp->t_rextsize_delta) { - mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize); - mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize); - } + if (tp->t_rextsize_delta) + xfs_mount_sb_set_rextsize(mp, &mp->m_sb, + mp->m_sb.sb_rextsize + tp->t_rextsize_delta); mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; mp->m_sb.sb_rblocks += tp->t_rblocks_delta; mp->m_sb.sb_rextents += tp->t_rextents_delta; mp->m_sb.sb_rextslog += tp->t_rextslog_delta; + mp->m_sb.sb_rgcount += tp->t_rgcount_delta; spin_unlock(&mp->m_sb_lock); /* @@ -834,29 +860,17 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); - error = xfs_trans_run_precommits(tp); - if (error) { - if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) - xfs_defer_cancel(tp); - goto out_unreserve; - } - /* - * Finish deferred items on final commit. Only permanent transactions - * should ever have deferred ops. + * Commit per-transaction changes that are not already tracked through + * log items. This can add dirty log items to the transaction. */ - WARN_ON_ONCE(!list_empty(&tp->t_dfops) && - !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - if (!regrant && (tp->t_flags & XFS_TRANS_PERM_LOG_RES)) { - error = xfs_defer_finish_noroll(&tp); - if (error) - goto out_unreserve; + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); - /* Run precommits from final tx in defer chain. */ - error = xfs_trans_run_precommits(tp); - if (error) - goto out_unreserve; - } + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; /* * If there is nothing to be logged by the transaction, @@ -881,13 +895,6 @@ __xfs_trans_commit( ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -913,7 +920,7 @@ out_unreserve: * the dqinfo portion to be. All that means is that we have some * (non-persistent) quota reservations that need to be unreserved. */ - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, true); if (tp->t_ticket) { if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); @@ -932,6 +939,20 @@ int xfs_trans_commit( struct xfs_trans *tp) { + /* + * Finish deferred items on final commit. Only permanent transactions + * should ever have deferred ops. + */ + WARN_ON_ONCE(!list_empty(&tp->t_dfops) && + !(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) { + int error = xfs_defer_finish_noroll(&tp); + if (error) { + xfs_trans_cancel(tp); + return error; + } + } + return __xfs_trans_commit(tp, false); } @@ -993,7 +1014,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + xfs_trans_unreserve_and_mod_dquots(tp, false); if (tp->t_ticket) { xfs_log_ticket_ungrant(log, tp->t_ticket); @@ -1262,11 +1283,26 @@ retry: gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; pdqp = (new_pdqp != ip->i_pdquot) ? 
new_pdqp : NULL; if (udqp || gdqp || pdqp) { + xfs_filblks_t dblocks, rblocks; unsigned int qflags = XFS_QMOPT_RES_REGBLKS; + bool isrt = XFS_IS_REALTIME_INODE(ip); if (force) qflags |= XFS_QMOPT_FORCE_RES; + if (isrt) { + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_cancel; + } + + xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks); + + if (isrt) + rblocks += ip->i_delayed_blks; + else + dblocks += ip->i_delayed_blks; + /* * Reserve enough quota to handle blocks on disk and reserved * for a delayed allocation. We'll actually transfer the @@ -1274,8 +1310,20 @@ retry: * though that part is only semi-transactional. */ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, - pdqp, ip->i_nblocks + ip->i_delayed_blks, - 1, qflags); + pdqp, dblocks, 1, qflags); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } + if (error) + goto out_cancel; + + /* Do the same for realtime. */ + qflags = XFS_QMOPT_RES_RTBLKS | (qflags & XFS_QMOPT_FORCE_RES); + error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, + pdqp, rblocks, 0, qflags); if ((error == -EDQUOT || error == -ENOSPC) && !retried) { xfs_trans_cancel(tp); xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); @@ -1382,5 +1430,8 @@ done: out_cancel: xfs_trans_cancel(tp); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + if (dp != ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f06cc0f41665..71c2e82e4dad 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -148,6 +148,7 @@ typedef struct xfs_trans { int64_t t_rblocks_delta;/* superblock rblocks change */ int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ + int64_t t_rgcount_delta; /* realtime group count */ struct list_head t_items; /* log item descriptors */ struct list_head t_busy; /* list of busy extents */ struct list_head t_dfops; /* deferred operations */ @@ -214,6 +215,7 @@ xfs_trans_read_buf( } struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); +struct xfs_buf *xfs_trans_getrtsb(struct xfs_trans *tp); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 8ede9d099d1f..f56d62dced97 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -360,7 +360,7 @@ xfsaild_resubmit_item( /* protected by ail_lock */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (bp->b_flags & _XBF_INODES) + if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS)) clear_bit(XFS_LI_FAILED, &lip->li_flags); else xfs_clear_li_failed(lip); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index e28ab74af4f0..8e886ecfd69a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -168,12 +168,11 @@ xfs_trans_get_buf_map( /* * Get and lock the superblock buffer for the given transaction. */ -struct xfs_buf * -xfs_trans_getsb( - struct xfs_trans *tp) +static struct xfs_buf * +__xfs_trans_getsb( + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf *bp = tp->t_mountp->m_sb_bp; - /* * Just increment the lock recursion count if the buffer is already * attached to this transaction. 
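
The "retry:" hunk in fs/xfs/xfs_trans.c above now reserves data-device and realtime quota blocks as two separate calls, and on -EDQUOT/-ENOSPC it cancels the transaction, flushes speculative preallocations via xfs_blockgc_free_dquots(), and retries exactly once. The following is a minimal stand-alone model of that reserve/flush/retry flow, not the kernel implementation: quota_reserve(), flush_preallocations(), reserve_quota_blocks() and the fake quota pool are hypothetical names, and the error handling is simplified to -EDQUOT only.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	static long quota_remaining = 100;	/* pretend quota pool, in blocks */

	/* Pretend reservation: fails with -EDQUOT when the pool is exhausted. */
	static int quota_reserve(long blocks)
	{
		if (blocks > quota_remaining)
			return -EDQUOT;
		quota_remaining -= blocks;
		return 0;
	}

	/* Stand-in for xfs_blockgc_free_dquots(): reclaiming preallocations frees quota. */
	static void flush_preallocations(void)
	{
		quota_remaining += 50;
	}

	static int reserve_quota_blocks(long dblocks, long rblocks)
	{
		bool retried = false;
		long reserved = 0;
		int error;

	retry:
		/* Data-device blocks first, then realtime blocks. */
		error = quota_reserve(dblocks);
		if (!error) {
			reserved += dblocks;
			error = quota_reserve(rblocks);
			if (!error)
				return 0;
		}

		/* Undo the partial reservation, mirroring xfs_trans_cancel(). */
		quota_remaining += reserved;
		reserved = 0;

		if (error == -EDQUOT && !retried) {
			flush_preallocations();
			retried = true;
			goto retry;
		}
		return error;
	}

	int main(void)
	{
		/* First pass fails on the rt reservation, succeeds after the flush. */
		printf("reserve: %d\n", reserve_quota_blocks(80, 60));
		return 0;
	}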
@@ -197,6 +196,22 @@ xfs_trans_getsb( return bp; } +struct xfs_buf * +xfs_trans_getsb( + struct xfs_trans *tp) +{ + return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp); +} + +struct xfs_buf * +xfs_trans_getrtsb( + struct xfs_trans *tp) +{ + if (!tp->t_mountp->m_rtsb_bp) + return NULL; + return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp); +} + /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index b368e13424c4..713b6d243e56 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -156,6 +156,8 @@ xfs_trans_mod_ino_dquot( unsigned int field, int64_t delta) { + ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); + xfs_trans_mod_dquot(tp, dqp, field, delta); if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) { @@ -247,6 +249,8 @@ xfs_trans_mod_dquot_byino( xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; + ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta); if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) @@ -602,6 +606,24 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); + + /* + * We've applied the count changes and given back + * whatever reservation we didn't use. Zero out the + * dqtrx fields. + */ + qtrx->qt_blk_res = 0; + qtrx->qt_bcount_delta = 0; + qtrx->qt_delbcnt_delta = 0; + + qtrx->qt_rtblk_res = 0; + qtrx->qt_rtblk_res_used = 0; + qtrx->qt_rtbcount_delta = 0; + qtrx->qt_delrtb_delta = 0; + + qtrx->qt_ino_res = 0; + qtrx->qt_ino_res_used = 0; + qtrx->qt_icount_delta = 0; } } } @@ -638,7 +660,8 @@ xfs_trans_unreserve_and_mod_dquots_hook( */ void xfs_trans_unreserve_and_mod_dquots( - struct xfs_trans *tp) + struct xfs_trans *tp, + bool already_locked) { int i, j; struct xfs_dquot *dqp; @@ -667,10 +690,12 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = false; + locked = already_locked; if (qtrx->qt_blk_res) { - xfs_dqlock(dqp); - locked = true; + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } dqp->q_blk.reserved -= (xfs_qcnt_t)qtrx->qt_blk_res; } @@ -691,7 +716,7 @@ xfs_trans_unreserve_and_mod_dquots( dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } - if (locked) + if (locked && !already_locked) xfs_dqunlock(dqp); } @@ -962,6 +987,8 @@ xfs_trans_reserve_quota_nblks( if (!XFS_IS_QUOTA_ON(mp)) return 0; + if (xfs_is_metadir_inode(ip)) + return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); @@ -1025,3 +1052,14 @@ xfs_trans_free_dqinfo( kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo); tp->t_dqinfo = NULL; } + +int +xfs_quota_reserve_blkres( + struct xfs_inode *ip, + int64_t blocks) +{ + if (XFS_IS_REALTIME_INODE(ip)) + return xfs_trans_reserve_quota_nblks(NULL, ip, 0, blocks, + false); + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); +} diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index eaf849260bd6..0f641a9091ec 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -51,8 +51,7 @@ xfs_attr_grab_log_assist( return error; xfs_set_using_logged_xattrs(mp); - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, - "EXPERIMENTAL logged extended attributes feature in use. 
Use at your own risk!"); + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LARP); return 0; }
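
Many of the tracepoint conversions above stop printing a raw AG number and instead print a group type plus a group-relative block number ("%sno 0x%x gbno ..."), relying on xfs_fsb_to_gbno() and XG_TYPE_STRINGS, neither of which is defined in the hunks shown here. Below is a minimal stand-alone model of that decomposition for the allocation-group case, where the number of blocks per AG is a power of two so a shift and a mask suffice; the helper names, string mapping, and sample geometry are illustrative assumptions, not the kernel code, and realtime groups would need different math.

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Group types mirrored from the diff; the name mapping is an assumption. */
	enum xg_type { XG_TYPE_AG, XG_TYPE_RTG };

	static const char *xg_type_name(enum xg_type type)
	{
		return type == XG_TYPE_AG ? "ag" : "rtg";
	}

	/*
	 * For allocation groups the filesystem block number carries the AG index
	 * in the high bits and the AG-relative block in the low bits, so the
	 * split is a shift and a mask.
	 */
	static uint32_t fsb_to_gno(uint64_t fsbno, unsigned int agblklog)
	{
		return (uint32_t)(fsbno >> agblklog);
	}

	static uint64_t fsb_to_gbno(uint64_t fsbno, unsigned int agblklog)
	{
		return fsbno & ((1ULL << agblklog) - 1);
	}

	int main(void)
	{
		unsigned int agblklog = 20;	/* 1M blocks per AG (example only) */
		uint64_t fsbno = 0x12345678ULL;
		enum xg_type type = XG_TYPE_AG;

		/* Roughly the shape of the converted TP_printk output above. */
		printf("%sno 0x%" PRIx32 " gbno 0x%" PRIx64 "\n",
		       xg_type_name(type),
		       fsb_to_gno(fsbno, agblklog),
		       fsb_to_gbno(fsbno, agblklog));
		return 0;
	}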