diff options
Diffstat (limited to 'fs')
192 files changed, 7517 insertions, 4339 deletions
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 698c43dd5dc8..9defa12208f9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -179,16 +179,13 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb, int new); +extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern const struct netfs_request_ops v9fs_req_ops; -extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, - struct p9_fid *fid, - struct super_block *sb, int new); +extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb, + struct p9_fid *fid); /* other default globals */ #define V9FS_PORT 564 @@ -230,27 +227,9 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); + return v9fs_fid_iget_dotl(sb, fid); else - return v9fs_inode_from_fid(v9ses, fid, sb, 0); -} - -/** - * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by - * issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ -static inline struct inode * -v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) -{ - if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); - else - return v9fs_inode_from_fid(v9ses, fid, sb, 1); + return v9fs_fid_iget(sb, fid); } #endif diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 0e8418066a48..7923c3c347cb 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,13 +40,16 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, - dev_t rdev); void v9fs_set_netfs_context(struct inode *inode); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, umode_t mode, dev_t rdev); + struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); -ino_t v9fs_qid2ino(struct p9_qid *qid); +#if (BITS_PER_LONG == 32) +#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32))) +#else +#define QID2INO(q) ((ino_t) ((q)->path+2)) +#endif + void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb, unsigned int flags); void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4102759a5cb5..e0d34e4e9076 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -127,7 +127,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) } over = !dir_emit(ctx, st.name, strlen(st.name), - v9fs_qid2ino(&st.qid), dt_type(&st)); + QID2INO(&st.qid), dt_type(&st)); p9stat_free(&st); if (over) return 0; @@ -184,7 +184,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, curdirent.d_name, strlen(curdirent.d_name), - v9fs_qid2ino(&curdirent.qid), + QID2INO(&curdirent.qid), curdirent.d_type)) return 0; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 32572982f72e..360a5304ec03 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -253,9 +253,12 @@ void v9fs_set_netfs_context(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, umode_t mode, dev_t rdev) + struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev) { int err = 0; + struct v9fs_inode *v9inode = V9FS_I(inode); + + memcpy(&v9inode->qid, qid, sizeof(struct p9_qid)); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; @@ -332,36 +335,6 @@ error: } /** - * v9fs_get_inode - helper function to setup an inode - * @sb: superblock - * @mode: mode to setup inode with - * @rdev: The device numbers to set - */ - -struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) -{ - int err; - struct inode *inode; - struct v9fs_session_info *v9ses = sb->s_fs_info; - - p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode); - - inode = new_inode(sb); - if (!inode) { - pr_warn("%s (%d): Problem allocating inode\n", - __func__, task_pid_nr(current)); - return ERR_PTR(-ENOMEM); - } - err = v9fs_init_inode(v9ses, inode, mode, rdev); - if (err) { - iput(inode); - return ERR_PTR(err); - } - v9fs_set_netfs_context(inode); - return inode; -} - -/** * v9fs_evict_inode - Remove an inode from the inode cache * @inode: inode to release * @@ -384,82 +357,40 @@ void v9fs_evict_inode(struct inode *inode) #endif } -static int v9fs_test_inode(struct inode *inode, void *data) -{ - int umode; - dev_t rdev; - struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_wstat *st = (struct p9_wstat *)data; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - - umode = p9mode2unixmode(v9ses, st, &rdev); - /* don't match inode of different type */ - if (inode_wrong_type(inode, umode)) - return 0; - - /* compare qid details */ - if (memcmp(&v9inode->qid.version, - &st->qid.version, sizeof(v9inode->qid.version))) - return 0; - - if (v9inode->qid.type != st->qid.type) - return 0; - - if (v9inode->qid.path != st->qid.path) - return 0; - return 1; -} - -static int v9fs_test_new_inode(struct inode *inode, void *data) -{ - return 0; -} - -static int v9fs_set_inode(struct inode *inode, void *data) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_wstat *st = (struct p9_wstat *)data; - - memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); - return 0; -} - -static struct inode *v9fs_qid_iget(struct super_block *sb, - struct p9_qid *qid, - struct p9_wstat *st, - int new) +struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid) { dev_t rdev; int retval; umode_t umode; - unsigned long i_ino; struct inode *inode; + struct p9_wstat *st; struct v9fs_session_info *v9ses = sb->s_fs_info; - int (*test)(struct inode *inode, void *data); - if (new) - test = v9fs_test_new_inode; - else - test = v9fs_test_inode; - - i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); - if (!inode) + inode = iget_locked(sb, QID2INO(&fid->qid)); + if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; + /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ - inode->i_ino = i_ino; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto error; + } + umode = p9mode2unixmode(v9ses, st, &rdev); - retval = v9fs_init_inode(v9ses, inode, umode, rdev); + retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev); + v9fs_stat2inode(st, inode, sb, 0); + p9stat_free(st); + kfree(st); if (retval) goto error; - v9fs_stat2inode(st, inode, sb, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); @@ -470,23 +401,6 @@ error: } -struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb, int new) -{ - struct p9_wstat *st; - struct inode *inode = NULL; - - st = p9_client_stat(fid); - if (IS_ERR(st)) - return ERR_CAST(st); - - inode = v9fs_qid_iget(sb, &st->qid, st, new); - p9stat_free(st); - kfree(st); - return inode; -} - /** * v9fs_at_to_dotl_flags- convert Linux specific AT flags to * plan 9 AT flag. @@ -633,7 +547,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, /* * instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, @@ -761,10 +675,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; else if (IS_ERR(fid)) inode = ERR_CAST(fid); - else if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); /* * If we had a rename on the server and a parallel lookup * for the new name, then make sure we instantiate with @@ -1187,26 +1099,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** - * v9fs_qid2ino - convert qid into inode number - * @qid: qid to hash - * - * BUG: potential for inode number collisions? - */ - -ino_t v9fs_qid2ino(struct p9_qid *qid) -{ - u64 path = qid->path + 2; - ino_t i = 0; - - if (sizeof(ino_t) == sizeof(path)) - memcpy(&i, &path, sizeof(ino_t)); - else - i = (ino_t) (path ^ (path >> 32)); - - return i; -} - -/** * v9fs_vfs_get_link - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 3505227e1704..ef9db3e03506 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -52,78 +52,33 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) return current_fsgid(); } -static int v9fs_test_inode_dotl(struct inode *inode, void *data) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; - - /* don't match inode of different type */ - if (inode_wrong_type(inode, st->st_mode)) - return 0; - - if (inode->i_generation != st->st_gen) - return 0; - - /* compare qid details */ - if (memcmp(&v9inode->qid.version, - &st->qid.version, sizeof(v9inode->qid.version))) - return 0; - - if (v9inode->qid.type != st->qid.type) - return 0; - - if (v9inode->qid.path != st->qid.path) - return 0; - return 1; -} - -/* Always get a new inode */ -static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) -{ - return 0; -} - -static int v9fs_set_inode_dotl(struct inode *inode, void *data) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; - - memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); - inode->i_generation = st->st_gen; - return 0; -} - -static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, - struct p9_qid *qid, - struct p9_fid *fid, - struct p9_stat_dotl *st, - int new) +struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid) { int retval; - unsigned long i_ino; struct inode *inode; + struct p9_stat_dotl *st; struct v9fs_session_info *v9ses = sb->s_fs_info; - int (*test)(struct inode *inode, void *data); - - if (new) - test = v9fs_test_new_inode_dotl; - else - test = v9fs_test_inode_dotl; - i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); - if (!inode) + inode = iget_locked(sb, QID2INO(&fid->qid)); + if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; + /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ - inode->i_ino = i_ino; - retval = v9fs_init_inode(v9ses, inode, + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto error; + } + + retval = v9fs_init_inode(v9ses, inode, &fid->qid, st->st_mode, new_decode_dev(st->st_rdev)); + kfree(st); if (retval) goto error; @@ -135,6 +90,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, goto error; unlock_new_inode(inode); + return inode; error: iget_failed(inode); @@ -142,22 +98,6 @@ error: } -struct inode * -v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb, int new) -{ - struct p9_stat_dotl *st; - struct inode *inode = NULL; - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); - if (IS_ERR(st)) - return ERR_CAST(st); - - inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); - kfree(st); - return inode; -} - struct dotl_openflag_map { int open_flag; int dotl_flag; @@ -307,7 +247,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_fid_iget_dotl(dir->i_sb, fid); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -402,32 +342,17 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, } /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - v9fs_fid_add(dentry, &fid); - v9fs_set_create_acl(inode, fid, dacl, pacl); - d_instantiate(dentry, inode); - err = 0; - } else { - /* - * Not in cached mode. No need to populate - * inode with stat. We need to get an inode - * so that we can set the acl with dentry - */ - inode = v9fs_get_inode(dir->i_sb, mode, 0); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - v9fs_set_create_acl(inode, fid, dacl, pacl); - d_instantiate(dentry, inode); + inode = v9fs_fid_iget_dotl(dir->i_sb, fid); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; } + v9fs_fid_add(dentry, &fid); + v9fs_set_create_acl(inode, fid, dacl, pacl); + d_instantiate(dentry, inode); + err = 0; inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: @@ -709,14 +634,11 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; struct p9_qid qid; - struct inode *inode; struct p9_fid *dfid; struct p9_fid *fid = NULL; - struct v9fs_session_info *v9ses; name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); - v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { @@ -736,36 +658,6 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - /* Now walk from the parent so we can get an unopened fid. */ - fid = p9_client_walk(dfid, 1, &name, 1); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", - err); - goto error; - } - - /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - v9fs_fid_add(dentry, &fid); - d_instantiate(dentry, inode); - err = 0; - } else { - /* Not in cached mode. No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - d_instantiate(dentry, inode); - } error: p9_fid_put(fid); @@ -888,33 +780,17 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, err); goto error; } - - /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", - err); - goto error; - } - v9fs_set_create_acl(inode, fid, dacl, pacl); - v9fs_fid_add(dentry, &fid); - d_instantiate(dentry, inode); - err = 0; - } else { - /* - * Not in cached mode. No need to populate inode with stat. - * socket syscall returns a fd, so we need instantiate - */ - inode = v9fs_get_inode(dir->i_sb, mode, rdev); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } - v9fs_set_create_acl(inode, fid, dacl, pacl); - d_instantiate(dentry, inode); + inode = v9fs_fid_iget_dotl(dir->i_sb, fid); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; } + v9fs_set_create_acl(inode, fid, dacl, pacl); + v9fs_fid_add(dentry, &fid); + d_instantiate(dentry, inode); + err = 0; error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 941f7d0e0bfa..4236058c7bbd 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -110,7 +110,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - umode_t mode = 0777 | S_ISVTX; struct p9_fid *fid; int retval = 0; @@ -140,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); + inode = v9fs_get_inode_from_fid(v9ses, fid, sb); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; @@ -152,32 +151,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } sb->s_root = root; - if (v9fs_proto_dotl(v9ses)) { - struct p9_stat_dotl *st = NULL; - - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto release_sb; - } - d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root), 0); - kfree(st); - } else { - struct p9_wstat *st = NULL; - - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto release_sb; - } - - d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb, 0); - - p9stat_free(st); - kfree(st); - } retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; @@ -271,21 +244,6 @@ done: return res; } -static int v9fs_drop_inode(struct inode *inode) -{ - struct v9fs_session_info *v9ses; - - v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); - /* - * in case of non cached mode always drop the - * inode because we want the inode attribute - * to always match that on the server. - */ - return 1; -} - static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -320,7 +278,6 @@ static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, - .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, diff --git a/fs/Kconfig b/fs/Kconfig index ea2f77446080..a46b0cbc4d8f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -60,7 +60,6 @@ endif # BLOCK config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU - depends on !(ARM || MIPS || SPARC) depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX @@ -261,6 +260,7 @@ menuconfig HUGETLBFS depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) select MEMFD_CREATE + select PADATA if SMP help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 8a67fc427e74..67afe68972d5 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -474,16 +474,6 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, continue; } - /* Don't expose silly rename entries to userspace. */ - if (nlen > 6 && - dire->u.name[0] == '.' && - ctx->actor != afs_lookup_filldir && - ctx->actor != afs_lookup_one_filldir && - memcmp(dire->u.name, ".__afs", 6) == 0) { - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); - continue; - } - /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 700a27bc8c25..ed04bd1eeae8 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -602,6 +602,8 @@ iterate_address: goto wait_for_more_probe_results; alist = op->estate->addresses; + best_prio = -1; + addr_index = 0; for (i = 0; i < alist->nr_addrs; i++) { if (alist->addrs[i].prio > best_prio) { addr_index = i; @@ -609,9 +611,7 @@ iterate_address: } } - addr_index = READ_ONCE(alist->preferred); - if (!test_bit(addr_index, &set)) - addr_index = __ffs(set); + alist->preferred = addr_index; op->addr_index = addr_index; set_bit(addr_index, &op->addr_tried); @@ -656,12 +656,6 @@ wait_for_more_probe_results: next_server: trace_afs_rotate(op, afs_rotate_trace_next_server, 0); _debug("next"); - ASSERT(op->estate); - alist = op->estate->addresses; - if (op->call_responded && - op->addr_index != READ_ONCE(alist->preferred) && - test_bit(alist->preferred, &op->addr_tried)) - WRITE_ONCE(alist->preferred, op->addr_index); op->estate = NULL; goto pick_server; @@ -690,14 +684,7 @@ no_more_servers: failed: trace_afs_rotate(op, afs_rotate_trace_failed, 0); op->flags |= AFS_OPERATION_STOP; - if (op->estate) { - alist = op->estate->addresses; - if (op->call_responded && - op->addr_index != READ_ONCE(alist->preferred) && - test_bit(alist->preferred, &op->addr_tried)) - WRITE_ONCE(alist->preferred, op->addr_index); - op->estate = NULL; - } + op->estate = NULL; _leave(" = f [failed %d]", afs_op_error(op)); return false; } diff --git a/fs/afs/validation.c b/fs/afs/validation.c index 46b37f2cce7d..32a53fc8dfb2 100644 --- a/fs/afs/validation.c +++ b/fs/afs/validation.c @@ -122,6 +122,9 @@ bool afs_check_validity(const struct afs_vnode *vnode) const struct afs_volume *volume = vnode->volume; time64_t deadline = ktime_get_real_seconds() + 10; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return true; + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || atomic64_read(&vnode->cb_expires_at) <= deadline || volume->cb_expires_at <= deadline || @@ -389,12 +392,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) key_serial(key)); if (afs_check_validity(vnode)) - return 0; + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; ret = down_write_killable(&vnode->validate_lock); if (ret < 0) goto error; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + ret = -ESTALE; + goto error_unlock; + } + /* Validate a volume after the v_break has changed or the volume * callback expired. We only want to do this once per volume per * v_break change. The actual work will be done when parsing the @@ -448,12 +456,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->cb_ro_snapshot = cb_ro_snapshot; vnode->cb_scrub = cb_scrub; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _debug("file already deleted"); - ret = -ESTALE; - goto error_unlock; - } - /* if the vnode's data version number changed then its contents are * different */ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1a05cecda7cc..b02796c8a595 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + time_stats.o \ thread_with_file.o \ trace.o \ two_state_shared_lock.o \ @@ -90,3 +91,6 @@ bcachefs-y := \ xattr.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o + +# Silence "note: xyz changed in GCC X.X" messages +subdir-ccflags-y += $(call cc-disable-warning, psabi) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3e175d8342..c47f72f2bd58 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,6 +29,8 @@ #include <linux/sched/task.h> #include <linux/sort.h> +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); + /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -860,23 +862,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, *bucket_gen(ca, new.k->p.offset) = new_a->gen; bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + percpu_up_read(&c->mark_lock); + +#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); + if (statechange(a->data_type == BCH_DATA_need_discard) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + bucket_flushed(new_a)) + bch2_discard_one_bucket_fast(c, new.k->p); - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && + if (statechange(a->data_type == BCH_DATA_cached) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a->data_type == BCH_DATA_need_gc_gens) + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_do_gc_gens(c); - percpu_up_read(&c->mark_lock); } if ((flags & BTREE_TRIGGER_GC) && @@ -1045,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != discard_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != discard_key_type, + c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1076,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != freespace_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != freespace_key_type, + c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1108,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (a->gen != alloc_gen(k, gens_offset) && - (c->opts.reconstruct_alloc || - fsck_err(c, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); @@ -1167,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); - if (k.k->type != KEY_TYPE_set && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - " device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset))) { + if (fsck_err_on(k.k->type != KEY_TYPE_set, + c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1604,6 +1607,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +{ + int ret; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + ret = -EEXIST; + goto out; + } + + ret = darray_push(&c->discard_buckets_in_flight, bucket); +out: + mutex_unlock(&c->discard_buckets_in_flight_lock); + return ret; +} + +static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +{ + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + goto found; + } + BUG(); +found: + mutex_unlock(&c->discard_buckets_in_flight_lock); +} + struct discard_buckets_state { u64 seen; u64 open; @@ -1642,6 +1675,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; + bool discard_locked = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1709,6 +1743,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } + if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + goto out; + + discard_locked = true; + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* @@ -1740,6 +1779,8 @@ write: count_event(c, bucket_discard); s->discarded++; out: + if (discard_locked) + discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); @@ -1779,6 +1820,93 @@ void bch2_do_discards(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } +static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +{ + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + goto err; + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void bch2_do_discards_fast_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + + while (1) { + bool got_bucket = false; + struct bpos bucket; + struct bch_dev *ca; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) { + if (i->snapshot) + continue; + + ca = bch_dev_bkey_exists(c, i->inode); + + if (!percpu_ref_tryget(&ca->io_ref)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + continue; + } + + got_bucket = true; + bucket = *i; + i->snapshot = true; + break; + } + mutex_unlock(&c->discard_buckets_in_flight_lock); + + if (!got_bucket) + break; + + if (ca->mi.discard && !c->opts.nochanges) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + + int ret = bch2_trans_do(c, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, bucket)); + bch_err_fn(c, ret); + + percpu_ref_put(&ca->io_ref); + discard_in_flight_remove(c, bucket); + + if (ret) + break; + } + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + if (!percpu_ref_is_dying(&ca->io_ref) && + !discard_in_flight_add(c, bucket) && + bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && + !queue_work(c->write_ref_wq, &c->discard_fast_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -2210,9 +2338,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } +void bch2_fs_allocator_background_exit(struct bch_fs *c) +{ + darray_exit(&c->discard_buckets_in_flight); +} + void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + mutex_init(&c->discard_buckets_in_flight_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e7f7e842ee1b..052b2fac25d6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_fs_allocator_background_exit(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 633d3223b353..ca58193dd902 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; @@ -555,8 +551,7 @@ again: goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 569b97904da4..8cb35ea572cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -131,8 +131,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } @@ -478,8 +477,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; @@ -555,60 +553,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -666,62 +665,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -732,10 +675,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -763,6 +712,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -868,6 +820,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b8..f63893344f80 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 69d0d60d50e3..339dc3e1dcd3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -212,6 +212,7 @@ #include "recovery_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -266,6 +267,9 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") __printf(2, 3) +void bch2_print_opts(struct bch_opts *, const char *, ...); + +__printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); #define maybe_dev_to_fs(_c) _Generic((_c), \ @@ -504,6 +508,7 @@ enum gc_phase { GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, GC_PHASE_BTREE_rebalance_work, + GC_PHASE_BTREE_subvolume_children, GC_PHASE_PENDING_DELETE, }; @@ -593,7 +598,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct bch2_time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -663,6 +668,8 @@ struct journal_seq_blacklist_table { }; struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; struct journal_key { u64 journal_seq; u32 journal_offset; @@ -671,15 +678,13 @@ struct journal_keys { bool allocated; bool overwritten; struct bkey_i *k; - } *d; + } *data; /* * Gap buffer: instead of all the empty space in the array being at the * end of the buffer - from @nr to @size - the empty space is at @gap. * This means that sequential insertions are O(n) instead of O(n^2). */ size_t gap; - size_t nr; - size_t size; atomic_t ref; bool initial_ref_held; }; @@ -703,6 +708,7 @@ struct btree_trans_buf { x(reflink) \ x(fallocate) \ x(discard) \ + x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ @@ -919,8 +925,6 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; @@ -940,8 +944,11 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct discard_work; struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct bpos) discard_buckets_in_flight; + struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0668b682a21c..bff8750ac0d7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -189,7 +189,11 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; struct bkey { /* Size of combined key and value, in u64s */ @@ -222,7 +226,36 @@ struct bkey { __u8 pad[1]; #endif -} __packed __aligned(8); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +/* + * The big-endian version of bkey can't be compiled by rustc with the "aligned" + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. + * So for Rust compatibility, don't include this. It can be included in the LE + * version because the "packed" attr is redundant in that case. + * + * History: (quoting Kent) + * + * Specifically, when i was designing bkey, I wanted the header to be no + * bigger than necessary so that bkey_packed could use the rest. That means that + * decently offten extent keys will fit into only 8 bytes, instead of spilling over + * to 16. + * + * But packed_bkey treats the part after the header - the packed section - + * as a single multi word, variable length integer. And bkey, the unpacked + * version, is just a special case version of a bkey_packed; all the packed + * bkey code will work on keys in any packed format, the in-memory + * representation of an unpacked key also is just one type of packed key... + * + * So that constrains the key part of a bkig endian bkey to start right + * after the header. + * + * If we ever do a bkey_v2 and need to expand the hedaer by another byte for + * some reason - that will clean up this wart. + */ +__aligned(8) +#endif +; struct bkey_packed { __u64 _data[0]; @@ -840,7 +873,9 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1275,7 +1310,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1376,6 +1412,11 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1482,7 +1523,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 831be01809f2..cf23ff47bed8 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -4,7 +4,7 @@ #include <linux/bug.h> #include "bcachefs_format.h" - +#include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k) static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; - - EBUG_ON(k->u64s < ret); - return ret; + return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, @@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst, memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) \ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h new file mode 100644 index 000000000000..c9ae9e42b385 --- /dev/null +++ b/fs/bcachefs/bkey_types.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_TYPES_H +#define _BCACHEFS_BKEY_TYPES_H + +#include "bcachefs_format.h" + +/* + * bkey_i - bkey with inline value + * bkey_s - bkey with split value + * bkey_s_c - bkey with split value, const + */ + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d7c81beac14a..562561a9a510 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| @@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -711,6 +724,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -760,8 +776,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -901,7 +918,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -992,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1075,7 +1092,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_error); goto out; } @@ -1096,7 +1113,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1102995643b1..584aee7010de 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -389,7 +389,8 @@ again: have_child = dropped_children = false; bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -406,7 +407,7 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", @@ -478,7 +479,8 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -591,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); - if (!g->gen_valid && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -609,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(p.ptr.gen, g->gen) > 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -631,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -931,7 +929,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -963,7 +961,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; @@ -976,7 +975,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b false); ret = PTR_ERR_OR_ZERO(child); - if (ret == -EIO) { + if (bch2_err_matches(ret, EIO)) { bch2_topology_error(c); if (__fsck_err(c, @@ -1190,9 +1189,7 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; free_percpu(ca->usage_gc); @@ -1365,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc, *b; + struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old_convert, new; const struct bch_alloc_v4 *old; - enum bch_data_type type; int ret; old = bch2_alloc_to_v4(k, &old_convert); @@ -1377,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); + old_gc = *b; + + if ((old->data_type == BCH_DATA_sb || + old->data_type == BCH_DATA_journal) && + !bch2_dev_is_online(ca)) { + b->data_type = old->data_type; + b->dirty_sectors = old->dirty_sectors; + } /* * b->data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - if (b->data_type != type) { - struct bch_dev_usage *u; - - preempt_disable(); - u = this_cpu_ptr(ca->usage_gc); - u->d[b->data_type].buckets--; - b->data_type = type; - u->d[b->data_type].buckets++; - preempt_enable(); - } - + b->data_type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); gc = *b; percpu_up_read(&c->mark_lock); + if (gc.data_type != old_gc.data_type || + gc.dirty_sectors != old_gc.dirty_sectors) + bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + if (metadata_only && gc.data_type != BCH_DATA_sb && gc.data_type != BCH_DATA_journal && @@ -1410,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gen_after(old->gen, gc.gen)) return 0; - if (c->opts.reconstruct_alloc || - fsck_err_on(new.data_type != gc.data_type, c, + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", @@ -1422,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -1491,7 +1486,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { @@ -1585,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -1595,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, new->k.type = KEY_TYPE_deleted; else *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); @@ -1817,10 +1812,10 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only) ?: - bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only); bch2_journal_unblock(&c->journal); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index aa9b6cbe3226..624c8287deb4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -581,8 +581,7 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -840,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; + if (k->u64s < bkeyp_key_u64s(&b->format, k)) + return false; + struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); @@ -881,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - /* XXX: validate k->u64s */ + if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_u64s, + "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + goto drop_this_key; + if (!write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -1737,7 +1745,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1841,7 +1849,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3ef338df82f5..51bcdc6c6d1c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); @@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); @@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } - +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) @@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + nr * sizeof(btree_path_idx_t) + 8 + @@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(trans->paths + iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + if (btree_path_node(path, path->level)) + btree_path_set_should_be_locked(path); return 0; } @@ -2305,7 +2307,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2503,6 +2505,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } @@ -2762,6 +2765,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) @@ -3085,7 +3091,7 @@ void bch2_trans_put(struct btree_trans *trans) trans->paths = NULL; if (paths_allocated != trans->_paths_allocated) - kfree_rcu_mightsleep(paths_allocated); + kvfree_rcu_mightsleep(paths_allocated); if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 719a94a84950..50e04356d72c 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) { - return keys->d + idx_to_pos(keys, idx); + return keys->data + idx_to_pos(keys, idx); } static size_t __bch2_journal_key_search(struct journal_keys *keys, @@ -180,10 +182,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; + journal_key_cmp(&n, &keys->data[idx]) == 0) { + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; return 0; } @@ -196,17 +198,17 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - keys->d = new_keys.d; + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); + kvfree(keys->data); + keys->data = new_keys.data; keys->nr = new_keys.nr; keys->size = new_keys.size; @@ -216,11 +218,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, journal_iters_move_gap(c, keys->gap, idx); - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; + move_gap(keys, idx); keys->nr++; - keys->d[keys->gap++] = n; + keys->data[keys->gap++] = n; journal_iters_fix(c); @@ -267,10 +268,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos)) + keys->data[idx].overwritten = true; } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -284,16 +285,16 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->d + iter->idx; + struct journal_key *k = iter->keys->data + iter->idx; - while (k < iter->keys->d + iter->keys->size && + while (k < iter->keys->data + iter->keys->size && k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; + k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -334,9 +335,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; @@ -376,17 +406,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; @@ -396,15 +427,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -415,9 +446,7 @@ void bch2_journal_entries_free(struct bch_fs *c) struct genradix_iter iter; genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); + kvfree(*i); genradix_free(&c->journal_entries); } @@ -437,22 +466,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_put(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key *i; BUG_ON(atomic_read(&keys->ref) <= 0); if (!atomic_dec_and_test(&keys->ref)) return; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); - for (i = keys->d; i < keys->d + keys->nr; i++) + darray_for_each(*keys, i) if (i->allocated) kfree(i->k); - kvfree(keys->d); - keys->d = NULL; + kvfree(keys->data); + keys->data = NULL; keys->nr = keys->gap = keys->size = 0; bch2_journal_entries_free(c); @@ -460,83 +487,38 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - struct journal_key *src, *dst; + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + struct journal_key *dst = keys->data; - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - !journal_key_cmp(src, src + 1)) - src++; + darray_for_each(*keys, src) { + if (src + 1 < &darray_top(*keys) && + !journal_key_cmp(src, src + 1)) + continue; - *dst++ = *src++; + *dst++ = *src; } - keys->nr = dst - keys->d; + keys->nr = dst - keys->data; } int bch2_journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } + size_t nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; cond_resched(); for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { + struct journal_key n = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, @@ -544,6 +526,18 @@ int bch2_journal_keys_sort(struct bch_fs *c) .journal_offset = k->_data - i->j._data, }; + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + + BUG_ON(darray_push(keys, n)); + } + nr_read++; } } @@ -551,6 +545,6 @@ int bch2_journal_keys_sort(struct bch_fs *c) __journal_keys_sort(keys); keys->gap = keys->nr; - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 8ca4c100b2e3..c9d19da3ea04 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -15,6 +15,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -22,6 +23,7 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, @@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 74e52fd28abe..8a71d43444b9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct bkey_i *new_k = NULL; int ret; - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 684397442338..b9b151e693ed 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans) return; trans_for_each_path(trans, path, i) - bch2_btree_path_downgrade(trans, path); + if (path->ref) + bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4a5a64499eb7..9404d96c38f3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/rhashtable.h> +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { @@ -654,6 +660,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ @@ -727,7 +734,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c3ff365acce9..a4b40c1656a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -452,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && bkey_deleted(&i->old_k)) + if (path->cached && !i->old_btree_u64s) return flush_new_cached_update(trans, i, flags, ip); return 0; @@ -788,6 +788,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct bkey_i k; bkey_init(&k.k); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b9382b7b288b..cc7c53e83f89 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2c3..642213ef9f79 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,8 +25,7 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, @@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); @@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as, static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err; } else if (n3) { @@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree_path *path = trans->paths + path_idx; @@ -1739,7 +1733,7 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + bch2_btree_set_root(as, trans, path, n); + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, + b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, @@ -1845,8 +1892,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } @@ -1916,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; @@ -1987,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); if (ret) goto err; } else { @@ -2485,7 +2530,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c593c925d1e3..3439b03719c7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); +int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); + int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index ac7844861966..b77e7b382b66 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -574,8 +574,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) { struct journal_keys_to_wb dst; - struct jset_entry *entry; - struct bkey_i *k; int ret = 0; bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); @@ -590,7 +588,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } + spin_lock(&c->journal.lock); buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); out: bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54f7826ac498..c2f46b267b3a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans, (int) bch2_bkey_needs_rebalance(c, old); if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, mod > 0); if (ret) return ret; } @@ -1335,7 +1336,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -1345,16 +1346,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; } @@ -1397,8 +1398,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,27 +1407,21 @@ err: void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 226b39c17667..38defa19d52d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -22,12 +22,6 @@ #include <linux/slab.h> #include <linux/uaccess.h> -__must_check -static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? -EFAULT : 0; -} - /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -155,19 +149,35 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static int bch2_fsck_offline_thread_fn(void *arg) +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.thr.ret) - bch2_fs_stop(c); + if (IS_ERR(c)) + return PTR_ERR(c); - thread_with_stdio_done(&thr->thr); - return 0; + int ret = 0; + if (test_bit(BCH_FS_errors_fixed, &c->flags)) + ret |= 1; + if (test_bit(BCH_FS_error, &c->flags)) + ret |= 4; + + bch2_fs_stop(c); + + if (ret & 1) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + if (ret & 4) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + + return ret; } +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; @@ -220,9 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_offline_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); err: if (ret < 0) { if (thr) @@ -763,9 +771,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(void *arg) +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; c->stdio_filter = current; @@ -793,13 +801,16 @@ static int bch2_fsck_online_thread_fn(void *arg) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - thread_with_stdio_done(&thr->thr); - up(&c->online_fsck_mutex); bch2_ro_ref_put(c); - return 0; + return ret; } +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + static long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) { @@ -840,9 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_online_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); err: if (ret < 0) { bch_err_fn(c, ret); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3c761ad6b1c8..4701457f6381 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -558,7 +558,7 @@ got_key: return 0; } -#include "../crypto.h" +#include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 33df8cf86bd8..1410365a8891 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if (!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; @@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) + mempool_init_kvmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) return -BCH_ERR_ENOMEM_decompression_workspace_init; return 0; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bdba8507fc9..b1f147e6be4d 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4ae1e9f002a0..d37bd07afbfe 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -144,19 +144,21 @@ fsck_err: return ret; } -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> %llu type %s", - d_name.len, - d_name.name, - d.v->d_type != DT_SUBVOL - ? le64_to_cpu(d.v->d_inum) - : le32_to_cpu(d.v->d_child_subvol), - bch2_d_type_str(d.v->d_type)); + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + + if (d.v->d_type != DT_SUBVOL) + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + else + prt_printf(out, "%u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, @@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, bch_str_hash_flags_t str_hash_flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; return ret; @@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); @@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * single dirent that points to a given subvolume. + * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. */ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; @@ -456,41 +472,29 @@ out: return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - return ret; - - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) return ret; - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { - ret = -ENOTEMPTY; + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); @@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) u32 snapshot; return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 21ffeb78f02e..bee55cca2aa0 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, bch_str_hash_flags_t); @@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d503af270024..b98e2c2b8bf0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index d260ff9bbfeb..43557bebd0f8 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "errcode.h" +#include "trace.h" #include <linux/errname.h> @@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class) return err == class; } -int __bch2_err_class(int err) +int __bch2_err_class(int bch_err) { - err = -err; - BUG_ON((unsigned) err >= BCH_ERR_MAX); + int std_err = -bch_err; + BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) - err = bch2_errcode_parents[err - BCH_ERR_START]; + while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) + std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; + + trace_error_downcast(bch_err, std_err, _RET_IP_); - return -err; + return -std_err; } const char *bch2_blk_status_to_str(blk_status_t status) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8c40c2067a04..af25d8ec60f2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,10 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, mount_option) \ + x(BCH_ERR_mount_option, option_name) \ + x(BCH_ERR_mount_option, option_value) \ + x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -78,6 +82,7 @@ x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOMEM, ENOMEM_disk_accounting) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -109,6 +114,8 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ @@ -176,6 +183,9 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ + x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -225,7 +235,10 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -238,7 +251,8 @@ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) + x(BCH_ERR_nopromote, nopromote_enomem) \ + x(0, need_inode_lock) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d32c8bebe46c..043431206799 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "recovery.h" #include "super.h" #include "thread_with_file.h" @@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fec17d1353d1..94491190e09e 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -30,7 +30,7 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) \ ({ \ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6bf839d69e84..6219f2c08e4c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -43,6 +43,11 @@ enum bkey_invalid_flags; #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +#define extent_entry_next_safe(_entry, _end) \ + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ + ? extent_entry_next(_entry) \ + : _end) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ @@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (extent_entry_type(_entry)) { \ + switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ @@ -344,7 +349,7 @@ out: \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 66b945be10c2..d8153fe27037 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1c1ea0f0c692..624e6f963240 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) */ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; @@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, return ret; } +static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) +{ + struct btree_iter iter; + struct bkey_i_subvolume *s = + bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED, subvolume); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + int bch2_rename_trans(struct btree_trans *trans, subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, @@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (src_inode_u->bi_subvol && + dst_dir.subvol != src_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + dst_inode_u->bi_subvol && + src_dir.subvol != dst_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); + if (ret) + goto err; + } + + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; @@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inum)) { - ret = -ENOTEMPTY; - goto err; + if (S_ISDIR(dst_inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, dst_inum); + if (ret) + goto err; } } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 27710cdd5710..39292e7ef342 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len) + loff_t pos, unsigned len, + bool inode_locked) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); + /* + * If we're not using the inode lock, we need to lock all the folios for + * atomiticity of writes vs. other writes: + */ + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { + ret = -BCH_ERR_need_inode_lock; + goto out; + } + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) + if (end > inode->v.i_size) { + BUG_ON(!inode_locked); i_size_write(&inode->v, end); + } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; + loff_t pos; + bool inode_locked = false; + ssize_t written = 0, written2 = 0, ret = 0; + + /* + * We don't take the inode lock unless i_size will be changing. Folio + * locks provide exclusion with other writes, and the pagecache add lock + * provides exclusion with truncate and hole punching. + * + * There is one nasty corner case where atomicity would be broken + * without great care: when copying data from userspace to the page + * cache, we do that with faults disable - a page fault would recurse + * back into the filesystem, taking filesystem locks again, and + * deadlock; so it's done with faults disabled, and we fault in the user + * buffer when we aren't holding locks. + * + * If we do part of the write, but we then race and in the userspace + * buffer have been evicted and are no longer resident, then we have to + * drop our folio locks to re-fault them in, breaking write atomicity. + * + * To fix this, we restart the write from the start, if we weren't + * holding the inode lock. + * + * There is another wrinkle after that; if we restart the write from the + * start, and then get an unrecoverable error, we _cannot_ claim to + * userspace that we did not write data we actually did - so we must + * track (written2) the most we ever wrote. + */ + + if ((iocb->ki_flags & IOCB_APPEND) || + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { + inode_lock(&inode->v); + inode_locked = true; + } + + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); + if (ret) { + if (!inode_locked) { + inode_lock(&inode->v); + inode_locked = true; + ret = file_remove_privs_flags(file, 0); + } + if (ret) + goto unlock; + } + + ret = file_update_time(file); + if (ret) + goto unlock; + + pos = iocb->ki_pos; bch2_pagecache_add_get(inode); + if (!inode_locked && + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) + goto get_inode_lock; + do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1004,12 +1072,17 @@ again: } } + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) + goto get_inode_lock; + if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); + if (ret == -BCH_ERR_need_inode_lock) + goto get_inode_lock; if (unlikely(ret < 0)) break; @@ -1030,50 +1103,46 @@ again: } pos += ret; written += ret; + written2 = max(written, written2); + + if (ret != bytes && !inode_locked) + goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); + if (0) { +get_inode_lock: + bch2_pagecache_add_put(inode); + inode_lock(&inode->v); + inode_locked = true; + bch2_pagecache_add_get(inode); + + iov_iter_revert(iter, written); + pos -= written; + written = 0; + ret = 0; + } + } while (iov_iter_count(iter)); bch2_pagecache_add_put(inode); +unlock: + if (inode_locked) + inode_unlock(&inode->v); + + iocb->ki_pos += written; - return written ? written : ret; + ret = max(written, written2) ?: ret; + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); + ssize_t ret = iocb->ki_flags & IOCB_DIRECT + ? bch2_direct_write(iocb, iter) + : bch2_buffered_write(iocb, iter); - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 8cbaba6565b4..828c3d7c8f19 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -51,13 +51,10 @@ enum bch_folio_sector_state { struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - + u8 nr_replicas:4, /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; + replicas_reserved:4; + u8 state; }; struct bch_folio { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542db9..3f073845bbd7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + to_bch_ei(new_inode(c->vfs_sb))); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM); + if (ret && inode) + discard_new_inode(&inode->v); + if (ret) + return ERR_PTR(ret); } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} - unlock_new_inode(&inode->v); +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; - return &inode->v; + struct btree_trans *trans = bch2_trans_get(c); + + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); + + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -293,7 +336,6 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +346,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... - */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -352,23 +365,78 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + if (ret) + return ERR_PTR(ret); + + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (bch2_err_matches(ret, ENOENT)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s points to missing inode", buf.buf); + printbuf_exit(&buf); + } + if (ret) + goto err; + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, @@ -1372,6 +1440,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); if (BCH_SUBVOLUME_SNAP(subvol)) @@ -1572,7 +1641,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1651,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; @@ -1805,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) + if (ret) { + ret = bch2_err_class(ret); return ERR_PTR(ret); + } if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); @@ -1882,6 +1949,7 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a760777bafb..f48033be3f6b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -100,8 +100,8 @@ err: } static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -142,34 +142,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -280,7 +252,7 @@ create_lostfound: goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, @@ -303,30 +275,47 @@ static int reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 dirent_snapshot = inode_snapshot; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); if (ret) return ret; if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = __write_inode(trans, &lostfound, U32_MAX); + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } dir_hash = bch2_hash_info_init(trans->c, &lostfound); - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, + &name, + inode->bi_subvol ?: inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -334,7 +323,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return __write_inode(trans, inode, inode_snapshot); + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -353,6 +342,27 @@ static int remove_backpointer(struct btree_trans *trans, return ret; } +static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + ret = remove_backpointer(trans, &inode); + bch_err_msg(c, ret, "removing dirent"); + if (ret) + return ret; + + ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + return ret; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -592,13 +602,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -609,20 +618,24 @@ found: if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; new.snapshot = snapshot; new.count = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -633,21 +646,21 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { + } else if (bkey_cmp(w->last_pos, k.k->p)) { darray_for_each(w->inodes, i) i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } static int __get_visible_inodes(struct btree_trans *trans, @@ -722,7 +735,7 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, + bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, @@ -795,16 +808,93 @@ fsck_err: goto out; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } + + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k); - if (ret) + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, + struct bch_inode_unpacked *inode, + u32 inode_snapshot, bool *write_inode) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - bch2_trans_iter_exit(trans, &iter); - return k.k->type == KEY_TYPE_set; + if (fsck_err_on(ret, + c, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + c, inode_points_to_wrong_dirent, + "inode points to dirent that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. + */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; + *write_inode = true; + } + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; } static int check_inode(struct btree_trans *trans, @@ -861,7 +951,8 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -876,7 +967,7 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; @@ -950,8 +1041,49 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + if (ret) + goto err; + } + + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + c, inode_bi_parent_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; + do_update = true; + } + + if (u.bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + c, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } + } + if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -982,28 +1114,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1032,7 +1142,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1312,7 +1422,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; @@ -1481,7 +1591,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1491,86 +1601,106 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_dirent_inode_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target, target_snapshot); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; + struct btree_iter bp_iter = { NULL }; + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - bool backpointer_exists = !ret; - ret = 0; + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + goto out; + } - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + c, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } - } + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), c, dirent_d_type_wrong, @@ -1586,6 +1716,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) @@ -1593,33 +1729,134 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); +/* find a subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; + struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 parent_snapshot; + u64 parent_inum; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + c, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", + parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + u32 new_parent_subvol; + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); if (ret) goto err; - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) + goto err; - ret = bch2_trans_update(trans, iter, &n->k_i, 0); + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret) { + if (fsck_err(c, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; + } + + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + c, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - d = dirent_i_to_s_c(n); + n->v.fs_path_parent = cpu_to_le32(parent_subvol); } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + c, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", + target_inum, + subvol_root.bi_parent_subvol, parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &subvol_iter); printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -1661,7 +1898,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1707,50 +1944,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { @@ -1776,12 +1970,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - } - - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + } out: err: fsck_err: @@ -1832,7 +2025,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -1919,7 +2112,7 @@ static int check_root_trans(struct btree_trans *trans) 0, NULL); root_inode.bi_inum = inum; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); bch_err_msg(c, ret, "writing root inode"); } err: @@ -1936,6 +2129,107 @@ int bch2_check_root(struct bch_fs *c) return ret; } +typedef DARRAY(u32) darray_u32; + +static bool darray_u32_has(darray_u32 *d, u32 v) +{ + darray_for_each(*d, i) + if (*i == v) + return true; + return false; +} + +/* + * We've checked that inode backpointers point to valid dirents; here, it's + * sufficient to check that the subvolume root has a dirent: + */ +static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + return inode.bi_dir != 0; +} + +static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter parent_iter = {}; + darray_u32 subvol_path = {}; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) + goto err; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + ret = subvol_has_dirent(trans, s); + if (ret < 0) + break; + + if (fsck_err_on(!ret, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + + if (darray_u32_has(&subvol_path, parent)) { + if (fsck_err(c, subvol_loop, "subvolume loop")) + ret = reattach_subvol(trans, s); + break; + } + + bch2_trans_iter_exit(trans, &parent_iter); + bch2_trans_iter_init(trans, &parent_iter, + BTREE_ID_subvolumes, POS(0, parent), 0); + k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + } +fsck_err: +err: + printbuf_exit(&buf); + darray_exit(&subvol_path); + bch2_trans_iter_exit(trans, &parent_iter); + return ret; +} + +int bch2_check_subvolume_structure(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_path(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + struct pathbuf_entry { u64 inum; u32 snapshot; @@ -1952,89 +2246,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, pathbuf *p, - u64 inum, u32 snapshot) -{ - int ret = darray_push(p, ((struct pathbuf_entry) { - .inum = inum, - .snapshot = snapshot, - })); - - if (ret) - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - p->size); - return ret; -} - /* - * Check that a given inode is reachable from the root: + * Check that a given inode is reachable from its subvolume root - we already + * verified subvolume connectivity: * * XXX: we should also be verifying that inodes are in the right subvolumes */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + BUG_ON(bch2_inode_unpack(inode_k, &inode)); + + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && !dirent_points_to_inode(d, &inode)) { bch2_trans_iter_exit(trans, &dirent_iter); ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); - break; + ret = 0; + if (fsck_err(c, inode_unreachable, + "unreachable inode\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, inode_k), + buf.buf))) + ret = reattach_inode(trans, &inode, snapshot); + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) + if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode->bi_inum, snapshot); - if (ret) { - bch_err(c, "memory allocation failure"); + ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); + if (ret) return ret; - } snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2042,30 +2318,32 @@ static int check_path(struct btree_trans *trans, break; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + snapshot = inode_k.k->p.snapshot; + + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); - - if (!fsck_err(c, dir_loop, "directory structure loop")) - return 0; + pr_err("%llu:%u", inode.bi_inum, snapshot); - ret = remove_backpointer(trans, inode); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (fsck_err(c, dir_loop, "directory structure loop")) { + ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); - if (ret) - break; + if (ret) + break; - ret = reattach_inode(trans, inode, snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + ret = reattach_inode(trans, &inode, snapshot); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } break; } } +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -2077,7 +2355,6 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; @@ -2090,12 +2367,10 @@ int bch2_check_directory_structure(struct bch_fs *c) if (!bkey_is_inode(k.k)) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path(trans, &path, k); }))); darray_exit(&path); @@ -2291,7 +2566,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); } fsck_err: return ret; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index da991e8cf27e..a4ef94271784 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); +int bch2_check_subvolume_structure(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 086f0090b03a..2b5e06770ab3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, +int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; @@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans, bool old_deleted = bkey_is_deleted_inode(old); bool new_deleted = bkey_is_deleted_inode(new.s_c); if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, new_deleted); if (ret) return ret; } @@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + c, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1141,7 +1171,7 @@ fsck_err: bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } @@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bool need_another_pass; int ret; again: + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * alway need a write buffer flush + */ + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + need_another_pass = false; /* @@ -1183,12 +1222,8 @@ again: ret; })); - if (!ret && need_another_pass) { - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; + if (!ret && need_another_pass) goto again; - } err: bch2_trans_put(trans); return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b63f312581cf..056298050550 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); +int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); @@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, @@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c574d8873a1..8a556e6d1ab6 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 2c098ac017b3..f137252bccc5 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif @@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1067,7 +1068,8 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1449,7 +1452,9 @@ err: bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + "%s(): %s error: %s", __func__, + op->flags & BCH_WRITE_MOVE ? "move" : "user", + bch2_err_str(ret)); op->error = ret; break; } @@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write) bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "misaligned write"); + "%s write error: misaligned write", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); op->error = -EIO; goto err; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bc890776eb57..f314b2e78ec3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_printf(out, "seq:"); + prt_str(out, "seq:"); prt_tab(out); prt_printf(out, "%llu", seq); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:"); + prt_str(out, "refcount:"); prt_tab(out); prt_printf(out, "%u", journal_state_count(s, i)); prt_newline(out); - prt_printf(out, "size:"); + prt_str(out, "size:"); prt_tab(out); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_printf(out, "expires"); + prt_str(out, "expires:"); prt_tab(out); prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + prt_str(out, "flags:"); + prt_tab(out); + if (buf->noflush) + prt_str(out, "noflush "); + if (buf->must_flush) + prt_str(out, "must_flush "); + if (buf->separate_flush) + prt_str(out, "separate_flush "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->write_started) + prt_str(out, "write_started "); + if (buf->write_allocated) + prt_str(out, "write allocated "); + if (buf->write_done) + prt_str(out, "write done"); + prt_newline(out); + printbuf_indent_sub(out, 2); } @@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + bch2_journal_do_writes(j); } /* @@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(c->io_complete_wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } @@ -473,33 +511,32 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; + } - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check once more in case somebody else shut things down... */ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); + if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; } + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -515,30 +552,30 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -674,7 +711,7 @@ recheck_need_open: return ret; seq = res.seq; - buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { @@ -692,8 +729,8 @@ recheck_need_open: } /* - * if write was kicked off without a flush, flush the next sequence - * number instead + * if write was kicked off without a flush, or if we promised it + * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { @@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + /* journal flush already in flight, or flush requseted */ + if (buf->must_flush) goto out; buf->noflush = true; @@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); @@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); @@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } @@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } @@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + kvfree(j->buf[i].data); free_fifo(&j->pin); } int bch2_fs_journal_init(struct journal *j) { static struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } @@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "blocked:\t\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); @@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) list_for_each_entry(pin, &pin_list->list[i], list) { prt_printf(out, "\t%px %ps", pin, pin->flush); prt_newline(out); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4544ce24bb8a..7c7528f839c5 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 47805193f18c..d76c3c0c203f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,37 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + u64 offset; + + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); + + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); + + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) { - i->ignore = true; + if (blacklisted) + i->ignore_blacklisted = true; + else + i->ignore_not_dirty = true; if (!c->opts.read_entire_journal) __journal_replay_free(c, i); @@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; /* Is this entry older than the range we need? */ @@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (le64_to_cpu(i->j.seq) >= last_seq) break; - journal_replay_free(c, i); + + journal_replay_free(c, i, false); } } @@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; + + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bch2_journal_replay_to_text(&buf, c, dup); - if (!dup->csum_good) + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); + + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); + + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; - i->csum_good = entry_ptr.csum_good; - i->ignore = false; + darray_init(&i->ptrs); + i->csum_good = entry_ptr.csum_good; + i->ignore_blacklisted = false; + i->ignore_not_dirty = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; - return 0; -found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - } - - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -374,7 +398,6 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { - struct bkey_i *k; bool first = true; jset_entry_for_each_key(entry, k) { @@ -741,6 +764,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -913,11 +967,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -1102,16 +1156,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!r) continue; - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + darray_for_each(r->ptrs, i) + if (i->dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, i->sector) + vstruct_sectors(&r->j, c->block_bits); - ja->cur_idx = r->ptrs[i].bucket; + ja->cur_idx = i->bucket; ja->sectors_free = ca->mi.bucket_size - wrote; goto found; } - } } found: mutex_unlock(&jlist->lock); @@ -1144,7 +1197,7 @@ found: ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1155,27 +1208,6 @@ err: goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - unsigned i; - - for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); - u64 offset; - - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); - - if (i) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, @@ -1228,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c, i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - i->ignore = true; + i->ignore_blacklisted = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - i->ignore = true; + i->ignore_blacklisted = true; continue; } @@ -1280,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); if (seq < *last_seq) { - journal_replay_free(c, i); + journal_replay_free(c, i, false); continue; } @@ -1293,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err_on(!JSET_NO_FLUSH(&i->j), c, jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); - i->ignore = true; + i->ignore_blacklisted = true; } } @@ -1302,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); @@ -1353,32 +1385,31 @@ int bch2_journal_read(struct bch_fs *c, .e.data_type = BCH_DATA_journal, .e.nr_required = 1, }; - unsigned ptr; i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; bch2_replicas_entry_sort(&replicas.e); @@ -1547,7 +1578,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1558,7 +1589,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) @@ -1568,12 +1599,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1593,63 +1624,68 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; + + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if (!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + closure_wake_up(&w->wait); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - closure_wake_up(&w->wait); - journal_wake(j); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1659,46 +1695,46 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } static CLOSURE_CALLBACK(do_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct journal_device *ja = &ca->journal; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); @@ -1708,7 +1744,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1727,11 +1763,10 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1782,7 +1817,6 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (!wb.wb) bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - struct bkey_i *k; jset_entry_for_each_key(i, k) { ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); if (ret) { @@ -1798,12 +1832,20 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (wb.wb) bch2_journal_keys_to_write_buffer_end(c, &wb); + + spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1893,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * j->nr_noflush_writes++; } else { + w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); @@ -1903,20 +1946,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; - struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; + for_each_rw_member(c, ca) + nr_rw_members++; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); + BUG_ON(w->write_allocated); + BUG_ON(w->write_done); j->write_start_time = local_clock(); spin_lock(&j->lock); + if (nr_rw_members > 1) + w->separate_flush = true; + ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); if (ret) @@ -1956,12 +2007,14 @@ CLOSURE_CALLBACK(bch2_journal_write) * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1969,12 +2022,6 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(c, ca) - nr_rw_members++; - - if (nr_rw_members > 1) - w->separate_flush = true; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. @@ -1985,25 +2032,29 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (!JSET_NO_FLUSH(w->data)) + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); } } - continue_at(cl, do_journal_write, c->io_complete_wq); + continue_at(cl, do_journal_write, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index c035e7c108e1..4f1e763ab506 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,26 +2,35 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include "darray.h" + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; - bool ignore; + bool ignore_blacklisted; + bool ignore_not_dirty; /* must be last: */ struct jset j; }; +static inline bool journal_replay_ignore(struct journal_replay *i) +{ + return !i || i->ignore_blacklisted || i->ignore_not_dirty; +} + static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { @@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, } #define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ + for (struct jset_entry *entry = (jset)->start; \ (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) #define jset_entry_for_each_key(_e, _k) \ - for (_k = (_e)->start; \ + for (struct bkey_i *_k = (_e)->start; \ _k < vstruct_last(_e); \ _k = bkey_next(_k)) @@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c33dca641575..ab811c0dad26 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0200e299cfbb..b5303874fc35 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr) return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } -static struct bch_sb_field_journal_seq_blacklist * -blacklist_entry_try_merge(struct bch_fs *c, - struct bch_sb_field_journal_seq_blacklist *bl, - unsigned i) -{ - unsigned nr = blacklist_nr_entries(bl); - - if (le64_to_cpu(bl->start[i].end) >= - le64_to_cpu(bl->start[i + 1].start)) { - bl->start[i].end = bl->start[i + 1].end; - --nr; - memmove(&bl->start[i], - &bl->start[i + 1], - sizeof(bl->start[0]) * (nr - i)); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr)); - BUG_ON(!bl); - } - - return bl; -} - -static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, - u64 start, u64 end) -{ - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -} - int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i, nr; + unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); - for (i = 0; i < nr; i++) { + while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; - if (bl_entry_contig_or_overlaps(e, start, end)) { - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; + if (end < le64_to_cpu(e->start)) + break; + + if (start > le64_to_cpu(e->end)) { + i++; + continue; } + + /* + * Entry is contiguous or overlapping with new entry: merge it + * with new entry, and delete: + */ + + start = min(start, le64_to_cpu(e->start)); + end = max(end, le64_to_cpu(e->end)); + array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, @@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) goto out; } - bl->start[nr].start = cpu_to_le64(start); - bl->start[nr].end = cpu_to_le64(end); -out_write_sb: + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), + .end = cpu_to_le64(end), + })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); @@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) if (!bl) return 0; - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, - GFP_KERNEL); + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 38817c7a0851..8c053cb64ca5 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -18,6 +18,7 @@ * the journal that are being staged or in flight. */ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -33,10 +34,14 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; /* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; + u8 idx; }; /* @@ -134,6 +139,7 @@ enum journal_flags { /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ @@ -149,6 +155,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -203,8 +216,8 @@ struct journal { wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -274,11 +287,6 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *flush_seq_time; @@ -313,7 +321,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 7a4ca5a28b3e..26569043e368 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } @@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (c->opts.reconstruct_alloc || - fsck_err(c, lru_entry_bad, + if (fsck_err(c, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index bf0ef668fd38..0ea9f30803a2 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 64df11ab422b..4fcf062dd22c 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 019583c3ca0e..db63b3f3b338 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test) static void mean_and_variance_weighted_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 2 }; + struct mean_and_variance_weighted s = { }; - mean_and_variance_weighted_update(&s, 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, 10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, 20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, 20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, 30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, 30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - s = (struct mean_and_variance_weighted) { .weight = 2 }; + s = (struct mean_and_variance_weighted) { }; - mean_and_variance_weighted_update(&s, -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, -10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, -20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, -20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, -30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, -30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); } static void mean_and_variance_weighted_advanced_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 8 }; + struct mean_and_variance_weighted s = { }; + bool initted = false; s64 i; - for (i = 10; i <= 100; i += 10) - mean_and_variance_weighted_update(&s, i); + for (i = 10; i <= 100; i += 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - s = (struct mean_and_variance_weighted) { .weight = 8 }; + s = (struct mean_and_variance_weighted) { }; + initted = false; - for (i = -10; i >= -100; i -= 10) - mean_and_variance_weighted_update(&s, i); + for (i = -10; i >= -100; i -= 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); } static void do_mean_and_variance_test(struct kunit *test, @@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test, s64 *weighted_stddev) { struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { .weight = weight }; + struct mean_and_variance_weighted vw = { }; for (unsigned i = 0; i < initial_n; i++) { mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value); + mean_and_variance_weighted_update(&vw, initial_value, false, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); } for (unsigned i = 0; i < n; i++) { mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i]); + mean_and_variance_weighted_update(&vw, data[i], true, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); } KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5623cee3ef86..69098eeb5d48 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -132,10 +132,8 @@ retry: ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index b1ed0b9a20d3..08ea0cfc4aef 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0 || (*res != 0 && *res != 1)) { if (err) prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; + return ret < 0 ? ret : -BCH_ERR_option_not_bool; } break; case BCH_OPT_UINT: @@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) - return -1; + return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { @@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, bad_opt: pr_err("Bad mount option %s", name); - ret = -1; + ret = -BCH_ERR_option_name; goto out; bad_val: pr_err("Invalid mount option %s", err.buf); - ret = -1; + ret = -BCH_ERR_option_value; goto out; out: kfree(copied_opts_start); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a4b7faa3765..136083c11f3a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -290,6 +290,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't kick drives out when splitbrain detected")\ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ @@ -332,6 +337,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 22d1017aa49b..56336f3dd1d0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) u64 now = atomic64_read(&c->io_clock[WRITE].now); prt_str(out, "io wait duration: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); prt_str(out, "io wait remaining: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); prt_str(out, "duration waited: "); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 21e13bb4335b..2af219aedfdb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void drop_alloc_keys(struct journal_keys *keys) +static void do_reconstruct_alloc(struct bch_fs *c) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); + + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + struct journal_keys *keys = &c->journal_keys; size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->d[src].btree_id)) - keys->d[dst++] = keys->d[src]; - + if (!btree_id_is_alloc(keys->data[src].btree_id)) + keys->data[dst++] = keys->data[src]; keys->nr = dst; } @@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys) */ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { - struct journal_key *i; - - for (i = keys->d; i < keys->d + keys->nr; i++) + darray_for_each(*keys, i) if (i->k->k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } @@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (ret) goto out; + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } + /* Must be checked with btree locked: */ if (k->overwritten) goto out; @@ -166,11 +208,9 @@ static int bch2_journal_replay(struct bch_fs *c) * efficient - better locality of btree access - but some might fail if * that would cause a journal deadlock. */ - for (size_t i = 0; i < keys->nr; i++) { + darray_for_each(*keys, k) { cond_resched(); - struct journal_key *k = keys->d + i; - /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, @@ -264,7 +304,7 @@ static int journal_replay_entry_early(struct bch_fs *c, bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { - r->error = -EIO; + r->error = -BCH_ERR_btree_node_read_error; } r->alive = true; break; @@ -359,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c, genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; vstruct_for_each(&i->j, entry) { @@ -388,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && - c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - } if (r->error) { __fsck_err(c, @@ -524,8 +561,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) * setting journal_key->overwritten: it will be accessed by multiple * threads */ - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -862,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto out; genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i && !(*i)->ignore) { + if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } @@ -887,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c) genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i) { last_journal_entry = &(*i)->j; - (*i)->ignore = false; + (*i)->ignore_blacklisted = false; + (*i)->ignore_not_dirty= false; /* * This was probably a NO_FLUSH entry, * so last_seq was garbage - but we know @@ -923,10 +960,8 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - drop_alloc_keys(&c->journal_keys); - } + if (c->opts.reconstruct_alloc) + do_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -950,7 +985,7 @@ use_clean: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { - bch_err(c, "error creating new journal seq blacklist entry"); + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; } } @@ -961,9 +996,6 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(c, "dropping alloc info"); - /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index fa0c8efd2a1b..1361e34d4e64 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -34,6 +34,7 @@ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ @@ -43,6 +44,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index b6bf0ebe7e84..5980ba2563fe 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -171,22 +171,6 @@ fsck_err: return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 441dcb1bf160..e4396cb0bacb 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -45,7 +45,13 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) #define DOWNGRADE_TABLE() @@ -253,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi if (e < BCH_SB_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + __set_bit_le64(e, ext->errors_silent); } } } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index c08aacdfd073..5178bf579f7c 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -231,7 +231,7 @@ x(dirent_name_dot_or_dotdot, 223) \ x(dirent_name_has_slash, 224) \ x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ + x(inode_bi_parent_wrong, 226) \ x(dirent_in_missing_dir_inode, 227) \ x(dirent_in_non_dir_inode, 228) \ x(dirent_to_missing_inode, 229) \ @@ -250,7 +250,22 @@ x(hash_table_key_duplicate, 242) \ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fcaa5a888744..3976f80721bf 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, @@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans, struct bkey_i *insert, bch_str_hash_flags_t str_hash_flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7c67c28d3ef8..ce7aed121942 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -13,13 +13,26 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + c, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + c, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + + struct bch_inode_unpacked inode; + struct btree_iter inode_iter = {}; + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + 0); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode_iter.k.p.snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans, SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - +err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); + printbuf_exit(&buf); return ret; } @@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_fs *c = trans->c; + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + c, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } +} + +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? -BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; } static __always_inline int @@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a6f56f66e27c..903c05162c06 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -8,17 +8,22 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); @@ -30,8 +35,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index af79134b07d6..e029df7ba89f 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -19,8 +19,8 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; - __le32 pad; + __le32 creation_parent; + __le32 fs_path_parent; bch_le128 otime; }; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bd64eb68e84a..bceac29f3d86 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; } + if (rw == WRITE && + bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), + le64_to_cpu(sb->seq)); + return -BCH_ERR_invalid_sb_members_missing; + } + return 0; } @@ -717,6 +725,7 @@ retry: if (IS_ERR(sb->s_bdev_file)) { ret = PTR_ERR(sb->s_bdev_file); + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } sb->bdev = file_bdev(sb->s_bdev_file); @@ -743,9 +752,9 @@ retry: prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - printk(KERN_INFO "%s", err2.buf); + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); else - printk(KERN_ERR "%s", err2.buf); + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); printbuf_exit(&err2); printbuf_reset(&err); @@ -803,21 +812,20 @@ got_super: goto err; } - ret = 0; sb->have_layout = true; ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); goto err_no_print; } out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b23e11825e6..233f864ed8b0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -56,6 +56,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> @@ -86,6 +87,23 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) +{ + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + } + va_end(args); +} + void __bch2_print(struct bch_fs *c, const char *fmt, ...) { struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -95,16 +113,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) if (likely(!stdio)) { vprintk(fmt, args); } else { - unsigned long flags; - if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } @@ -576,7 +588,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -715,7 +727,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -818,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -862,13 +874,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -882,8 +894,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -1061,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) } static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb) + struct bch_sb_handle *sb, + struct bch_opts *opts) { if (fs == sb) return 0; @@ -1102,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); - prt_printf(&buf, "Not using older sb"); + if (!opts->no_splitbrain_check) + prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); @@ -1124,17 +1140,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); + + if (!opts->no_splitbrain_check) { + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + } pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } return 0; @@ -1168,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1260,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); @@ -1597,27 +1618,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); - bch_err(ca, "journal error"); + bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; @@ -1835,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(&c->disk_sb, &sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -2023,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb); + ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cee80c47feea..c86a93a8d8fc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -930,10 +930,10 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 9220d7de10db..940db15d6a93 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" -#include "printbuf.h" #include "thread_with_file.h" #include <linux/anon_inodes.h> @@ -10,6 +9,7 @@ #include <linux/kthread.h> #include <linux/pagemap.h> #include <linux/poll.h> +#include <linux/sched/sysctl.h> void bch2_thread_with_file_exit(struct thread_with_file *thr) { @@ -65,68 +65,82 @@ err: return ret; } -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +/* stdio_redirect */ + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) { - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; + return stdio->input.buf.nr || stdio->done; } -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) { - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - size_t copied = 0, b; - int ret = 0; + return stdio->output.buf.nr || stdio->done; +} - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) - return -EAGAIN; +#define STDIO_REDIRECT_BUFSIZE 4096 - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} - if (thr->thr.done) - return 0; +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); +/* thread_with_stdio */ - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct stdio_buf *buf = &thr->stdio.output; + size_t copied = 0, b; + int ret = 0; - b = min(len, thr->output2.nr); - if (!b) - break; + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) + return -EAGAIN; - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { + while (len && buf->buf.nr) { + if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - copied += b; - buf += b; - len -= b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, buf->buf.nr); + + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { + ubuf += b; + len -= b; + copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + } + spin_unlock_irq(&buf->lock); } return copied ?: ret; @@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + thread_with_stdio_done(thr); bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); - thr->exit(thr); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); + thr->ops->exit(thr); return 0; } -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; + struct stdio_buf *buf = &thr->stdio.input; size_t copied = 0; ssize_t ret = 0; @@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); - - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; + spin_lock(&buf->lock); + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) + darray_make_room_gfp(&buf->buf, + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); + b = min(len, darray_room(buf->buf)); + + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; } - spin_unlock(&thr->stdio.input_lock); + spin_unlock(&buf->lock); if (b) { - wake_up(&thr->stdio.input_wait); + wake_up(&buf->wait); } else { if ((file->f_flags & O_NONBLOCK)) { ret = -EAGAIN; break; } - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); if (ret) break; } @@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); __poll_t mask = 0; - if (thread_with_stdio_has_output(thr)) + if (stdio_redirect_has_output(&thr->stdio)) mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) + if (stdio_redirect_has_input_space(&thr->stdio)) mask |= EPOLLOUT; if (thr->thr.done) mask |= EPOLLHUP|EPOLLERR; return mask; } +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, &thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, + .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, +}; + +static const struct file_operations thread_with_stdout_fops = { .llseek = no_llseek, + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; +static int thread_with_stdio_fn(void *arg) +{ + struct thread_with_stdio *thr = arg; + + thr->thr.ret = thr->ops->fn(thr); + + thread_with_stdio_done(thr); + return 0; +} + int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) + const struct thread_with_stdio_ops *ops) { - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); +} - darray_init(&thr->output2); - thr->exit = exit; +int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); } +EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); + spin_lock(&buf->lock); + int ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); - wake_up(&stdio->input_wait); + wake_up(&buf->wait); return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); - - if (stdio->done) - return -1; + struct stdio_buf *buf = &stdio->input; + size_t copied = 0; + ssize_t ret = 0; +again: + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + + if (stdio->done) { + ret = -1; + goto out; + } - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); + spin_lock(&buf->lock); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); + b = min_t(size_t, b, n + 1 - buf->buf.data); + buf->buf.nr -= b; + memcpy(ubuf, buf->buf.data, b); + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + ubuf += b; + len -= b; + copied += b; + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + + if (!n && len) + goto again; +out: + return copied ?: ret; +} + +__printf(3, 0) +static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + ssize_t ret; + + do { + va_list args2; + size_t len; + + va_copy(args2, args); + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } + + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; +} + +ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + ssize_t ret; + +again: + spin_lock_irqsave(&buf->lock, flags); + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + + wake_up(&buf->wait); + return ret; +} + +ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ + va_list args; + ssize_t ret; + + va_start(args, fmt); + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); + return ret; } diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 05879c5048c8..af54ea8f5b0f 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -4,6 +4,38 @@ #include "thread_with_file_types.h" +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. + * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. + * + * To write, use stdio_redirect_printf() + * To read, use stdio_redirect_read() or stdio_redirect_readline() + */ + struct task_struct; struct thread_with_file { @@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *, const struct file_operations *, int (*fn)(void *)); +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); +}; + struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - DARRAY(char) output2; - void (*exit)(struct thread_with_stdio *); + const struct thread_with_stdio_ops *ops; }; -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); -} - int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)); + const struct thread_with_stdio_ops *); +int bch2_run_thread_with_stdout(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); +__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); + #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 90b5e645e98c..e0daf4eec341 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,14 +2,21 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; +}; + struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; + struct stdio_buf input; + struct stdio_buf output; spinlock_t input_lock; wait_queue_head_t input_wait; - struct printbuf input_buf; + darray_char input_buf; bool done; }; diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c new file mode 100644 index 000000000000..4508e9dcbee2 --- /dev/null +++ b/fs/bcachefs/time_stats.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/time.h> +#include <linux/spinlock.h> + +#include "eytzinger.h" +#include "time_stats.h" + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ + { "eon", U64_MAX }, +}; + +const struct time_unit *bch2_pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + bool initted = stats->last_event != 0; + + if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (quantiles) + quantiles_update(quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + +static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __bch2_time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h new file mode 100644 index 000000000000..5df61403744b --- /dev/null +++ b/fs/bcachefs/time_stats.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * bch2_time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analagous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus. average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _BCACHEFS_TIME_STATS_H +#define _BCACHEFS_TIME_STATS_H + +#include <linux/sched/clock.h> +#include <linux/spinlock_types.h> +#include <linux/string.h> + +#include "mean_and_variance.h" + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *bch2_pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't + * use in new code. + */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[31]; +}; + +struct bch2_time_stats { + spinlock_t lock; + bool have_quantiles; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + + struct mean_and_variance duration_stats; + struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + +struct bch2_time_stats_quantiles { + struct bch2_time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) +{ + return stats->have_quantiles + ? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles + : NULL; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - bch2_time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end duration of the event will be the current time + */ +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - bch2_time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked. + */ +static inline bool track_event_change(struct bch2_time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + bch2_time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_exit(&statq->stats); +} +static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_init(&statq->stats); + statq->stats.have_quantiles = true; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + +#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 293b90d704fb..6aa81d1e6d36 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3a32faa86b5c..216fadf16928 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -/* time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); prt_tab_rjust(out); @@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { - const struct time_unit *u; + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; @@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + if (quantiles) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + bch2_pick_time_units(quantiles->entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } } } -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} /* ratelimit: */ @@ -1007,28 +850,6 @@ void sort_cmp_size(void *base, size_t num, size_t size, } } -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b414736d59a5..7ffbddb80400 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -21,6 +21,7 @@ #include "mean_and_variance.h" #include "darray.h" +#include "time_stats.h" struct closure; @@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); -} - -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - #define HEAP(type) \ struct { \ size_t size, used; \ @@ -97,13 +66,13 @@ struct { \ ({ \ (heap)->used = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) @@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); - #define ewma_add(ewma, val, weight) \ ({ \ typeof(ewma) _ewma = (ewma); \ @@ -788,8 +681,12 @@ static inline void __move_gap(void *array, size_t element_size, } /* Move the gap in a gap buffer: */ -#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ - __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) +#define move_gap(_d, _new_gap) \ +do { \ + __move_gap((_d)->data, sizeof((_d)->data[0]), \ + (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ + (_d)->gap = _new_gap; \ +} while (0) #define bubble_sort(_base, _nr, _cmp) \ do { \ @@ -876,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) void bch2_darray_str_exit(darray_str *); int bch2_split_devs(const char *, darray_str *); +#ifdef __KERNEL__ + +__must_check +static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + +__must_check +static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n) ? -EFAULT : 0; +} + +#endif + +static inline void __set_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9c0d2316031b..754f17bba68e 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - return ret; + goto err_class_exit; ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) - return ret; + goto err_class_exit; s.v = v + 1; s.defined = true; @@ -595,6 +595,7 @@ err: (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); +err_class_exit: return bch2_err_class(ret); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7669d154c05e..e57054bdc5fd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4111,10 +4111,10 @@ insert_hole: * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated - * if create == 0 and these are pre-allocated blocks + * if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * @@ -4218,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * requested block isn't allocated yet; - * we couldn't try to create block if create flag is zero + * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t len; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2ccf3b5e3a7c..537803250ca9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -465,9 +465,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. if - * create==0 and the blocks are pre-allocated and unwritten, the resulting @map - * is marked as unwritten. If the create == 1, it will mark @map as mapped. + * On success, it returns the number of blocks being mapped or allocated. + * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are + * pre-allocated and unwritten, the resulting @map is marked as unwritten. + * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to @@ -589,8 +590,7 @@ found: * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. + * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index f94901fd3835..044ca5238f41 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -5,6 +5,7 @@ #include <kunit/test.h> #include <kunit/static_stub.h> +#include <linux/random.h> #include "ext4.h" @@ -20,41 +21,135 @@ struct mbt_ctx { }; struct mbt_ext4_super_block { - struct super_block sb; + struct ext4_super_block es; + struct ext4_sb_info sbi; struct mbt_ctx mbt_ctx; }; -#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx)) +#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi)) +#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) +static const struct super_operations mbt_sops = { +}; + +static void mbt_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type mbt_fs_type = { + .name = "mballoc test", + .kill_sb = mbt_kill_sb, +}; + +static int mbt_mb_init(struct super_block *sb) +{ + ext4_fsblk_t block; + int ret; + + /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL); + if (sb->s_bdev == NULL) + return -ENOMEM; + + sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL); + if (sb->s_bdev->bd_queue == NULL) { + kfree(sb->s_bdev); + return -ENOMEM; + } + + /* + * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache = + * new_inode(sb); + */ + INIT_LIST_HEAD(&sb->s_inodes); + sb->s_op = &mbt_sops; + + ret = ext4_mb_init(sb); + if (ret != 0) + goto err_out; + + block = ext4_count_free_clusters(sb); + ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block, + GFP_KERNEL); + if (ret != 0) + goto err_mb_release; + + ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (ret != 0) + goto err_freeclusters; + + return 0; + +err_freeclusters: + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); +err_mb_release: + ext4_mb_release(sb); +err_out: + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); + return ret; +} + +static void mbt_mb_release(struct super_block *sb) +{ + percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter); + percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter); + ext4_mb_release(sb); + kfree(sb->s_bdev->bd_queue); + kfree(sb->s_bdev); +} + +static int mbt_set(struct super_block *sb, void *data) +{ + return 0; +} + static struct super_block *mbt_ext4_alloc_super_block(void) { - struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL); - struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + struct mbt_ext4_super_block *fsb; + struct super_block *sb; + struct ext4_sb_info *sbi; + + fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + if (fsb == NULL) + return NULL; - if (fsb == NULL || sbi == NULL || es == NULL) + sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL); + if (IS_ERR(sb)) goto out; - sbi->s_es = es; - fsb->sb.s_fs_info = sbi; - return &fsb->sb; + sbi = &fsb->sbi; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) + goto out_deactivate; + + bgl_lock_init(sbi->s_blockgroup_lock); + + sbi->s_es = &fsb->es; + sb->s_fs_info = sbi; + + up_write(&sb->s_umount); + return sb; +out_deactivate: + deactivate_locked_super(sb); out: kfree(fsb); - kfree(sbi); - kfree(es); return NULL; } static void mbt_ext4_free_super_block(struct super_block *sb) { - struct mbt_ext4_super_block *fsb = - container_of(sb, struct mbt_ext4_super_block, sb); + struct mbt_ext4_super_block *fsb = MBT_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); - kfree(sbi->s_es); - kfree(sbi); + kfree(sbi->s_blockgroup_lock); + deactivate_super(sb); kfree(fsb); } @@ -82,6 +177,9 @@ static void mbt_init_sb_layout(struct super_block *sb, sbi->s_clusters_per_group = layout->blocks_per_group >> layout->cluster_bits; sbi->s_desc_size = layout->desc_size; + sbi->s_desc_per_block_bits = + sb->s_blocksize_bits - (fls(layout->desc_size) - 1); + sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits; es->s_first_data_block = cpu_to_le32(0); es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * @@ -91,9 +189,13 @@ static void mbt_init_sb_layout(struct super_block *sb, static int mbt_grp_ctx_init(struct super_block *sb, struct mbt_grp_ctx *grp_ctx) { + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); if (grp_ctx->bitmap_bh.b_data == NULL) return -ENOMEM; + mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max); + ext4_free_group_clusters_set(sb, &grp_ctx->desc, max); return 0; } @@ -112,6 +214,13 @@ static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); } +static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + return grp_ctx->bitmap_bh.b_data; +} + /* called after mbt_init_sb_layout */ static int mbt_ctx_init(struct super_block *sb) { @@ -133,6 +242,8 @@ static int mbt_ctx_init(struct super_block *sb) * block which will fail ext4_sb_block_valid check. */ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc, + EXT4_CLUSTERS_PER_GROUP(sb) - 1); return 0; out: @@ -167,6 +278,13 @@ static int ext4_wait_block_bitmap_stub(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { + /* + * real ext4_wait_block_bitmap will set these flags and + * functions like ext4_mb_init_cache will verify the flags. + */ + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + set_buffer_verified(bh); return 0; } @@ -232,6 +350,14 @@ static int mbt_kunit_init(struct kunit *test) kunit_activate_static_stub(test, ext4_mb_mark_context, ext4_mb_mark_context_stub); + + /* stub function will be called in mbt_mb_init->ext4_mb_init */ + if (mbt_mb_init(sb) != 0) { + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); + return -ENOMEM; + } + return 0; } @@ -239,6 +365,7 @@ static void mbt_kunit_exit(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; + mbt_mb_release(sb); mbt_ctx_release(sb); mbt_ext4_free_super_block(sb); } @@ -246,14 +373,19 @@ static void mbt_kunit_exit(struct kunit *test) static void test_new_blocks_simple(struct kunit *test) { struct super_block *sb = (struct super_block *)test->priv; - struct inode inode = { .i_sb = sb, }; + struct inode *inode; struct ext4_allocation_request ar; ext4_group_t i, goal_group = TEST_GOAL_GROUP; int err = 0; ext4_fsblk_t found; struct ext4_sb_info *sbi = EXT4_SB(sb); - ar.inode = &inode; + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + + inode->i_sb = sb; + ar.inode = inode; /* get block at goal */ ar.goal = ext4_group_first_block_no(sb, goal_group); @@ -297,6 +429,436 @@ static void test_new_blocks_simple(struct kunit *test) "unexpectedly get block when no block is available"); } +#define TEST_RANGE_COUNT 8 + +struct test_range { + ext4_grpblk_t start; + ext4_grpblk_t len; +}; + +static void +mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges, + int count) +{ + ext4_grpblk_t start, len, max; + int i; + + max = EXT4_CLUSTERS_PER_GROUP(sb) / count; + for (i = 0; i < count; i++) { + start = get_random_u32() % max; + len = get_random_u32() % max; + len = min(len, max - start); + + ranges[i].start = start + i * max; + ranges[i].len = len; + } +} + +static void +validate_free_blocks_simple(struct kunit *test, struct super_block *sb, + ext4_group_t goal_group, ext4_grpblk_t start, + ext4_grpblk_t len) +{ + void *bitmap; + ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + + for (i = 0; i < ext4_get_groups_count(sb); i++) { + if (i == goal_group) + continue; + + bitmap = mbt_ctx_bitmap(sb, i); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ_MSG(test, bit, max, + "free block on unexpected group %d", i); + } + + bitmap = mbt_ctx_bitmap(sb, goal_group); + bit = mb_find_next_zero_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, bit, start); + + bit = mb_find_next_bit(bitmap, max, bit + 1); + KUNIT_ASSERT_EQ(test, bit, start + len); +} + +static void +test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group, + ext4_grpblk_t start, ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode; + ext4_fsblk_t block; + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + if (len == 0) + return; + + block = ext4_group_first_block_no(sb, goal_group) + + EXT4_C2B(sbi, start); + ext4_free_blocks_simple(inode, block, len); + validate_free_blocks_simple(test, sb, goal_group, start, len); + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); +} + +static void test_free_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_group_t i; + struct test_range ranges[TEST_RANGE_COUNT]; + + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, max); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_free_blocks_simple_range(test, TEST_GOAL_GROUP, + ranges[i].start, ranges[i].len); +} + +static void +test_mark_diskspace_used_range(struct kunit *test, + struct ext4_allocation_context *ac, + ext4_grpblk_t start, + ext4_grpblk_t len) +{ + struct super_block *sb = (struct super_block *)test->priv; + int ret; + void *bitmap; + ext4_grpblk_t i, max; + + /* ext4_mb_mark_diskspace_used will BUG if len is 0 */ + if (len == 0) + return; + + ac->ac_b_ex.fe_group = TEST_GOAL_GROUP; + ac->ac_b_ex.fe_start = start; + ac->ac_b_ex.fe_len = len; + + bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP); + memset(bitmap, 0, sb->s_blocksize); + ret = ext4_mb_mark_diskspace_used(ac, NULL, 0); + KUNIT_ASSERT_EQ(test, ret, 0); + + max = EXT4_CLUSTERS_PER_GROUP(sb); + i = mb_find_next_bit(bitmap, max, 0); + KUNIT_ASSERT_EQ(test, i, start); + i = mb_find_next_zero_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, i, start + len); + i = mb_find_next_bit(bitmap, max, i + 1); + KUNIT_ASSERT_EQ(test, max, i); +} + +static void test_mark_diskspace_used(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode *inode; + struct ext4_allocation_context ac; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + + inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL); + if (!inode) + return; + inode->i_sb = sb; + + ac.ac_status = AC_STATUS_FOUND; + ac.ac_sb = sb; + ac.ac_inode = inode; + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mark_diskspace_used_range(test, &ac, ranges[i].start, + ranges[i].len); +} + +static void mbt_generate_buddy(struct super_block *sb, void *buddy, + void *bitmap, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + uint32_t order, off; + void *bb, *bb_h; + int max; + + memset(buddy, 0xff, sb->s_blocksize); + memset(grp, 0, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)])); + + bb = bitmap; + max = EXT4_CLUSTERS_PER_GROUP(sb); + bb_h = buddy + sbi->s_mb_offsets[1]; + + off = mb_find_next_zero_bit(bb, max, 0); + grp->bb_first_free = off; + while (off < max) { + grp->bb_counters[0]++; + grp->bb_free++; + + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + grp->bb_free++; + grp->bb_counters[0]--; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[1]++; + grp->bb_largest_free_order = 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + + for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) { + bb = buddy + sbi->s_mb_offsets[order]; + bb_h = buddy + sbi->s_mb_offsets[order + 1]; + max = max >> 1; + off = mb_find_next_zero_bit(bb, max, 0); + + while (off < max) { + if (!(off & 1) && !mb_test_bit(off + 1, bb)) { + mb_set_bits(bb, off, 2); + grp->bb_counters[order] -= 2; + mb_clear_bit(off >> 1, bb_h); + grp->bb_counters[order + 1]++; + grp->bb_largest_free_order = order + 1; + off++; + } + + off = mb_find_next_zero_bit(bb, max, off + 1); + } + } + + max = EXT4_CLUSTERS_PER_GROUP(sb); + off = mb_find_next_zero_bit(bitmap, max, 0); + while (off < max) { + grp->bb_fragments++; + + off = mb_find_next_bit(bitmap, max, off + 1); + if (off + 1 >= max) + break; + + off = mb_find_next_zero_bit(bitmap, max, off + 1); + } +} + +static void +mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1, + struct ext4_group_info *grp2) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + KUNIT_ASSERT_EQ(test, grp1->bb_first_free, + grp2->bb_first_free); + KUNIT_ASSERT_EQ(test, grp1->bb_fragments, + grp2->bb_fragments); + KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free); + KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order, + grp2->bb_largest_free_order); + + for (i = 1; i < MB_NUM_ORDERS(sb); i++) { + KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i], + grp2->bb_counters[i], + "bb_counters[%d] diffs, expected %d, generated %d", + i, grp1->bb_counters[i], + grp2->bb_counters[i]); + } +} + +static void +do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap, + void *mbt_buddy, struct ext4_group_info *mbt_grp, + void *ext4_buddy, struct ext4_group_info *ext4_grp) +{ + int i; + + mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp); + + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + ext4_grp->bb_counters[i] = 0; + /* needed by validation in ext4_mb_generate_buddy */ + ext4_grp->bb_free = mbt_grp->bb_free; + memset(ext4_buddy, 0xff, sb->s_blocksize); + ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP, + ext4_grp); + + KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, mbt_grp, ext4_grp); +} + +static void test_mb_generate_buddy(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *expected_bb, *generate_bb; + struct ext4_group_info *expected_grp, *generate_grp; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb); + generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb); + expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp); + generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP); + KUNIT_ASSERT_NOT_NULL(test, generate_grp); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) { + mb_set_bits(bitmap, ranges[i].start, ranges[i].len); + do_test_generate_buddy(test, sb, bitmap, expected_bb, + expected_grp, generate_bb, generate_grp); + } +} + +static void +test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int i; + + /* mb_mark_used only accepts non-zero len */ + if (len == 0) + return; + + ex.fe_start = start; + ex.fe_len = len; + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + mb_set_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free -= len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); +} + +static void test_mb_mark_used(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_mark_used_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + +static void +test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b, + ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap, + void *buddy, struct ext4_group_info *grp) +{ + struct super_block *sb = (struct super_block *)test->priv; + int i; + + /* mb_free_blocks will WARN if len is 0 */ + if (len == 0) + return; + + ext4_lock_group(sb, e4b->bd_group); + mb_free_blocks(NULL, e4b, start, len); + ext4_unlock_group(sb, e4b->bd_group); + + mb_clear_bits(bitmap, start, len); + /* bypass bb_free validatoin in ext4_mb_generate_buddy */ + grp->bb_free += len; + memset(buddy, 0xff, sb->s_blocksize); + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + grp->bb_counters[i] = 0; + ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp); + + KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize), + 0); + mbt_validate_group_info(test, grp, e4b->bd_info); + +} + +static void test_mb_free_blocks(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + void *bitmap, *buddy; + struct ext4_group_info *grp; + struct ext4_free_extent ex; + int ret; + int i; + struct test_range ranges[TEST_RANGE_COUNT]; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap); + buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); + grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, + bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_start = 0; + ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb); + ex.fe_group = TEST_GOAL_GROUP; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + + grp->bb_free = 0; + memset(bitmap, 0xff, sb->s_blocksize); + + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + for (i = 0; i < TEST_RANGE_COUNT; i++) + test_mb_free_blocks_range(test, &e4b, ranges[i].start, + ranges[i].len, bitmap, buddy, grp); + + ext4_mb_unload_buddy(&e4b); +} + static const struct mbt_ext4_block_layout mbt_test_layouts[] = { { .blocksize_bits = 10, @@ -334,6 +896,11 @@ KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); static struct kunit_case mbt_test_cases[] = { KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), + KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), {} }; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e4f7cf9d89c4..12b3f196010b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3015,8 +3015,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err, buddy_loaded = 0; + int i, err; + char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, @@ -3043,23 +3043,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); + seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } - buddy_loaded = 1; + ext4_mb_unload_buddy(&e4b); } + /* + * We care only about free space counters in the group info and + * these are safe to access even after the buddy has been unloaded + */ memcpy(&sg, grinfo, i); - - if (buddy_loaded) - ext4_mb_unload_buddy(&e4b); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg.info.bb_counters[i] : 0); - seq_puts(seq, " ]\n"); + seq_puts(seq, " ]"); + if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) + seq_puts(seq, " Block bitmap corrupted!"); + seq_puts(seq, "\n"); return 0; } @@ -3829,8 +3832,7 @@ void ext4_mb_release(struct super_block *sb) } static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count, - struct bio **biop) + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; @@ -3839,13 +3841,8 @@ static inline int ext4_issue_discard(struct super_block *sb, count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); - if (biop) { - return __blkdev_issue_discard(sb->s_bdev, - (sector_t)discard_block << (sb->s_blocksize_bits - 9), - (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, biop); - } else - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); + + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, @@ -5169,10 +5166,16 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); + loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); - /* we can't allocate as much as normalizer wants. - * so, found space must get proper lstart - * to cover original request */ + /* + * We can't allocate as much as normalizer wants, so we try + * to get proper lstart to cover the original request, except + * when the goal doesn't cover the original request as below: + * + * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 + * best_ex:0/200(200) -> adjusted: 1848/2048(200) + */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); @@ -5184,7 +5187,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and - * still cover original start + * still cover original end * 3. Else, keep the best ex at start of original request. */ ex.fe_len = ac->ac_b_ex.fe_len; @@ -5194,7 +5197,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; - if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; @@ -5202,7 +5205,6 @@ adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); - BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } @@ -6487,8 +6489,14 @@ do_more: } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, - count_clusters, NULL); - if (err && err != -EOPNOTSUPP) + count_clusters); + /* + * Ignore EOPNOTSUPP error. This is consistent with + * what happens when using journal. + */ + if (err == -EOPNOTSUPP) + err = 0; + if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, @@ -6738,7 +6746,7 @@ __acquires(bitlock) */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ret = ext4_issue_discard(sb, group, start, count, NULL); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 4d4a5a32e310..0ba9837d65ca 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1602,7 +1602,8 @@ exit_journal: int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); - int meta_bg = ext4_has_feature_meta_bg(sb); + int meta_bg = ext4_has_feature_meta_bg(sb) && + gdb_num >= le32_to_cpu(es->s_first_meta_bg); sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - ext4_group_first_block_no(sb, 0); @@ -2084,7 +2085,7 @@ retry: } } - if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f5e5a44778cf..cfb8449c731f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4421,22 +4421,6 @@ static int ext4_handle_clustersize(struct super_block *sb) } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - return -EINVAL; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - return -EINVAL; - } } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, @@ -4450,9 +4434,21 @@ static int ext4_handle_clustersize(struct super_block *sb) sbi->s_blocks_per_group); return -EINVAL; } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; sbi->s_cluster_bits = 0; } + sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + return -EINVAL; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { + ext4_msg(sb, KERN_ERR, + "blocks per group (%lu) and clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, sbi->s_clusters_per_group); + return -EINVAL; + } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ @@ -6864,6 +6860,10 @@ static int ext4_write_dquot(struct dquot *dquot) if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to commit dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -6880,6 +6880,10 @@ static int ext4_acquire_dquot(struct dquot *dquot) if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to acquire dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -6899,6 +6903,10 @@ static int ext4_release_dquot(struct dquot *dquot) return PTR_ERR(handle); } ret = dquot_release(dquot); + if (ret < 0) + ext4_error_err(dquot->dq_sb, -ret, + "Failed to release dquot type %d", + dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82dc5e673d5c..b67a176bfcf9 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1565,46 +1565,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, /* * Add value of the EA in an inode. */ -static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, - const void *value, size_t value_len, - struct inode **ret_inode) +static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, + struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; + /* Account inode & space to quota even if sharing... */ + err = ext4_xattr_inode_alloc_quota(inode, value_len); + if (err) + return ERR_PTR(err); + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); - if (err) { - iput(ea_inode); - return err; - } - - *ret_inode = ea_inode; - return 0; + if (err) + goto out_err; + return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); - if (IS_ERR(ea_inode)) - return PTR_ERR(ea_inode); + if (IS_ERR(ea_inode)) { + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ea_inode; + } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); - iput(ea_inode); - return err; + goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); - - *ret_inode = ea_inode; - return 0; + return ea_inode; +out_err: + iput(ea_inode); + ext4_xattr_inode_free_quota(inode, NULL, value_len); + return ERR_PTR(err); } /* @@ -1712,16 +1715,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, if (i->value && in_inode) { WARN_ON_ONCE(!i->value_len); - ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); - if (ret) - goto out; - - ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, - i->value_len, - &new_ea_inode); - if (ret) { + new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(new_ea_inode)) { + ret = PTR_ERR(new_ea_inode); new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } @@ -2160,17 +2158,6 @@ getblk_failed: ENTRY(header(s->base)+1)); if (error) goto getblk_failed; - if (ea_inode) { - /* Drop the extra ref on ea_inode. */ - error = ext4_xattr_inode_dec_ref(handle, - ea_inode); - if (error) - ext4_warning_inode(ea_inode, - "dec ref error=%d", - error); - iput(ea_inode); - ea_inode = NULL; - } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b0597a539fc5..eac698b8dd38 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -154,49 +154,47 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, if (unlikely(f2fs_cp_error(sbi))) return exist; - if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { - f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", - blkaddr, exist); - set_sbi_flag(sbi, SBI_NEED_FSCK); - return exist; - } + if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) || + (!exist && type == DATA_GENERIC_ENHANCE)) + goto out_err; + if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE) + goto out_handle; + return exist; - if (!exist && type == DATA_GENERIC_ENHANCE) { - f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", - blkaddr, exist); - set_sbi_flag(sbi, SBI_NEED_FSCK); - dump_stack(); - } +out_err: + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + dump_stack(); +out_handle: + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return exist; } -bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, +static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (time_to_inject(sbi, FAULT_BLKADDR)) - return false; - switch (type) { case META_NAT: break; case META_SIT: if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) - return false; + goto err; break; case META_SSA: if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || blkaddr < SM_I(sbi)->ssa_blkaddr)) - return false; + goto err; break; case META_CP: if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || blkaddr < __start_cp_addr(sbi))) - return false; + goto err; break; case META_POR: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) - return false; + goto err; break; case DATA_GENERIC: case DATA_GENERIC_ENHANCE: @@ -213,7 +211,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); dump_stack(); - return false; + goto err; } else { return __is_bitmap_valid(sbi, blkaddr, type); } @@ -221,13 +219,30 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case META_GENERIC: if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || blkaddr >= MAIN_BLKADDR(sbi))) - return false; + goto err; break; default: BUG(); } return true; +err: + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return false; +} + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY)) + return false; + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); +} + +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); } /* @@ -889,7 +904,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); - if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { + if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; @@ -1324,7 +1339,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason & CP_UMOUNT) { if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { + NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && @@ -1527,7 +1542,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_ver |= ((__u64)crc32 << 32); *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); - blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); @@ -1587,8 +1602,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) */ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || f2fs_sb_has_compression(sbi)) - invalidate_mapping_pages(META_MAPPING(sbi), - MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); + f2fs_bug_on(sbi, + invalidate_inode_pages2_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1)); f2fs_release_ino_entry(sbi, false); @@ -1730,9 +1746,9 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) im->ino_num = 0; } - sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS - NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * - F2FS_ORPHANS_PER_BLOCK; + F2FS_ORPHANS_PER_BLOCK; } int __init f2fs_create_checkpoint_caches(void) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 531517dac079..8892c8262141 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc) ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { - printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n", - KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "lzo-rle compress failed, ret:%d", ret); return -EIO; } return 0; @@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (provided != calculated) { if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); - printk_ratelimited( - "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", - KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + f2fs_info_ratelimited(sbi, + "checksum invalid, nid = %lu, %x vs %x", + dic->inode->i_ino, provided, calculated); } set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) struct f2fs_sb_info *sbi = bio->bi_private; struct compress_io_ctx *cic = (struct compress_io_ctx *)page_private(page); + enum count_type type = WB_DATA_TYPE(page, + f2fs_is_compressed_page(page)); int i; if (unlikely(bio->bi_status)) @@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) f2fs_compress_free_page(page); - dec_page_count(sbi, F2FS_WB_DATA); + dec_page_count(sbi, type); if (atomic_dec_return(&cic->pending_pages)) return; @@ -1441,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) } static int f2fs_write_raw_pages(struct compress_ctx *cc, - int *submitted, + int *submitted_p, struct writeback_control *wbc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret, i; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + int submitted, compr_blocks, i; + int ret = 0; compr_blocks = f2fs_compressed_blocks(cc); @@ -1461,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (compr_blocks < 0) return compr_blocks; + /* overwrite compressed cluster w/ normal cluster */ + if (compr_blocks > 0) + f2fs_lock_op(sbi); + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; @@ -1485,7 +1493,7 @@ continue_unlock: if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; - ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + ret = f2fs_write_single_data_page(cc->rpages[i], &submitted, NULL, NULL, wbc, io_type, compr_blocks, false); if (ret) { @@ -1493,26 +1501,29 @@ continue_unlock: unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { + ret = 0; /* * for quota file, just redirty left pages to * avoid deadlock caused by cluster update race * from foreground operation. */ if (IS_NOQUOTA(cc->inode)) - return 0; - ret = 0; + goto out; f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } - return ret; + goto out; } - *submitted += _submitted; + *submitted_p += submitted; } - f2fs_balance_fs(F2FS_M_SB(mapping), true); +out: + if (compr_blocks > 0) + f2fs_unlock_op(sbi); - return 0; + f2fs_balance_fs(sbi, true); + return ret; } int f2fs_write_multi_pages(struct compress_ctx *cc, @@ -1806,16 +1817,18 @@ void f2fs_put_page_dic(struct page *page, bool in_task) * check whether cluster blocks are contiguous, and add extent cache entry * only if cluster blocks are logically and physically contiguous. */ -unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, + unsigned int ofs_in_node) { - bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + bool compressed = data_blkaddr(dn->inode, dn->node_page, + ofs_in_node) == COMPRESS_ADDR; int i = compressed ? 1 : 0; block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, - dn->ofs_in_node + i); + ofs_in_node + i); for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, - dn->ofs_in_node + i); + ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) break; @@ -1878,12 +1891,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, set_page_private_data(cpage, ino); - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) - goto out; - memcpy(page_address(cpage), page_address(page), PAGE_SIZE); SetPageUptodate(cpage); -out: f2fs_put_page(cpage, 1); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 26e317696b33..d9494b5fc7c1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static bool __is_cp_guaranteed(struct page *page) +bool f2fs_is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; @@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page) S_ISDIR(inode->i_mode)) return true; - if (f2fs_is_compressed_page(page)) - return false; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; @@ -338,18 +336,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page); - - if (page_private_dummy(page)) { - clear_page_private_dummy(page); - unlock_page(page); - mempool_free(page, sbi->write_io_dummy); - - if (unlikely(bio->bi_status)) - f2fs_stop_checkpoint(sbi, true, - STOP_CP_REASON_WRITE_FAIL); - continue; - } + enum count_type type = WB_DATA_TYPE(page, false); fscrypt_finalize_bounce_page(&page); @@ -524,50 +511,13 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, submit_bio(bio); } -static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio) -{ - unsigned int start = - (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi); - - if (start == 0) - return; - - /* fill dummy pages */ - for (; start < F2FS_IO_SIZE(sbi); start++) { - struct page *page = - mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_NOFAIL); - f2fs_bug_on(sbi, !page); - - lock_page(page); - - zero_user_segment(page, 0, PAGE_SIZE); - set_page_private_dummy(page); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) - f2fs_bug_on(sbi, 1); - } -} - static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(is_read_io(bio_op(bio))); - if (type == DATA || type == NODE) { - if (f2fs_lfs_mode(sbi) && current->plug) - blk_finish_plug(current->plug); - - if (F2FS_IO_ALIGNED(sbi)) { - f2fs_align_write_bio(sbi, bio); - /* - * In the NODE case, we lose next block address chain. - * So, we need to do checkpoint in f2fs_sync_file. - */ - if (type == NODE) - set_sbi_flag(sbi, SBI_NEED_CP); - } - } + if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type)) + blk_finish_plug(current->plug); trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); @@ -740,10 +690,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? - META_GENERIC : DATA_GENERIC_ENHANCE))) { - f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); + META_GENERIC : DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; - } trace_f2fs_submit_page_bio(page, fio); @@ -762,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? - __read_io_type(page) : WB_DATA_TYPE(fio->page)); + __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); @@ -796,16 +744,6 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { - if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) { - unsigned int filled_blocks = - F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size); - unsigned int io_size = F2FS_IO_SIZE(sbi); - unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt; - - /* IOs in bio is aligned and left space of vectors is not enough */ - if (!(filled_blocks % io_size) && left_vecs < io_size) - return false; - } if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) return false; return io_type_is_mergeable(io, fio); @@ -948,10 +886,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, - __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) { - f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; - } trace_f2fs_submit_page_bio(page, fio); @@ -973,7 +909,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); - inc_page_count(fio->sbi, WB_DATA_TYPE(page)); + inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1007,11 +943,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; + enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); - +next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { wait_for_completion_io(&io->zone_wait); @@ -1021,7 +958,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) } #endif -next: if (fio->in_list) { spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { @@ -1046,7 +982,8 @@ next: /* set submitted = true as a return value */ fio->submitted = 1; - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + type = WB_DATA_TYPE(bio_page, fio->compressed_page); + inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, @@ -1056,13 +993,6 @@ next: __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - if (F2FS_IO_ALIGNED(sbi) && - (fio->type == DATA || fio->type == NODE) && - fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - fio->retry = 1; - goto skip; - } io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, bio_page->index, fio, GFP_NOIO); @@ -1080,10 +1010,6 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; trace_f2fs_submit_page_write(fio->page, fio); -skip: - if (fio->in_list) - goto next; -out: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { @@ -1096,6 +1022,9 @@ out: __submit_merged_bio(io); } #endif + if (fio->in_list) + goto next; +out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); @@ -1218,7 +1147,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + err = inc_valid_block_count(sbi, dn->inode, &count, true); + if (unlikely(err)) return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, @@ -1285,8 +1215,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; - f2fs_handle_error(F2FS_I_SB(inode), - ERROR_INVALID_BLKADDR); goto put_err; } goto got_it; @@ -1312,8 +1240,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; - f2fs_handle_error(F2FS_I_SB(inode), - ERROR_INVALID_BLKADDR); goto put_err; } got_it: @@ -1475,15 +1401,18 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr == NULL_ADDR) { - err = inc_valid_block_count(sbi, dn->inode, &count); + err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; } set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; - f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, - &sum, seg_type, NULL); + err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr, + &dn->data_blkaddr, &sum, seg_type, NULL); + if (err) + return err; + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(sbi, old_blkaddr); @@ -1641,7 +1570,6 @@ next_block: if (!is_hole && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto sync_out; } @@ -2165,8 +2093,6 @@ got_it: if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; - f2fs_handle_error(F2FS_I_SB(inode), - ERROR_INVALID_BLKADDR); goto out; } } else { @@ -2668,8 +2594,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) if (fio) { if (page_private_gcing(fio->page)) return true; - if (page_private_dummy(fio->page)) - return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; @@ -2706,11 +2630,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_read_extent_cache_block(inode, page->index, &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, - DATA_GENERIC_ENHANCE)) { - f2fs_handle_error(fio->sbi, - ERROR_INVALID_BLKADDR); + DATA_GENERIC_ENHANCE)) return -EFSCORRUPTED; - } ipu_force = true; fio->need_lock = LOCK_DONE; @@ -2738,7 +2659,6 @@ got_it: !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; - f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR); goto out_writepage; } @@ -2838,7 +2758,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, - .need_lock = LOCK_RETRY, + .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY, .post_read = f2fs_post_read_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, @@ -2919,6 +2839,7 @@ write: if (err == -EAGAIN) { err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { + f2fs_bug_on(sbi, compr_blocks); fio.need_lock = LOCK_REQ; err = f2fs_do_write_data_page(&fio); } @@ -3704,7 +3625,6 @@ repeat: if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } err = f2fs_submit_page_read(use_cow ? @@ -3905,26 +3825,36 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blkofs; unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int end_blk = start_blk + blkcnt - 1; unsigned int secidx = start_blk / blk_per_sec; - unsigned int end_sec = secidx + blkcnt / blk_per_sec; + unsigned int end_sec; int ret = 0; + if (!blkcnt) + return 0; + end_sec = end_blk / blk_per_sec; + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); set_inode_flag(inode, FI_OPU_WRITE); - for (; secidx < end_sec; secidx++) { + for (; secidx <= end_sec; secidx++) { + unsigned int blkofs_end = secidx == end_sec ? + end_blk % blk_per_sec : blk_per_sec - 1; + f2fs_down_write(&sbi->pin_sem); - f2fs_lock_op(sbi); - f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); - f2fs_unlock_op(sbi); + ret = f2fs_allocate_pinning_section(sbi); + if (ret) { + f2fs_up_write(&sbi->pin_sem); + break; + } set_inode_flag(inode, FI_SKIP_WRITES); - for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + for (blkofs = 0; blkofs <= blkofs_end; blkofs++) { struct page *page; unsigned int blkidx = secidx * blk_per_sec + blkofs; @@ -4013,27 +3943,34 @@ retry: nr_pblocks = map.m_len; if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || - nr_pblocks & sec_blks_mask) { + nr_pblocks & sec_blks_mask || + !f2fs_valid_pinned_area(sbi, pblock)) { + bool last_extent = false; + not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); if (cur_lblock + nr_pblocks > sis->max) nr_pblocks -= blks_per_sec; + /* this extent is last one */ if (!nr_pblocks) { - /* this extent is last one */ - nr_pblocks = map.m_len; - f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); - goto next; + nr_pblocks = last_lblock - cur_lblock; + last_extent = true; } ret = f2fs_migrate_blocks(inode, cur_lblock, nr_pblocks); - if (ret) + if (ret) { + if (ret == -ENOENT) + ret = -EINVAL; goto out; - goto retry; + } + + if (!last_extent) + goto retry; } -next: + if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -4071,17 +4008,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + if (f2fs_readonly(sbi->sb)) return -EROFS; - if (f2fs_lfs_mode(F2FS_I_SB(inode))) { - f2fs_err(F2FS_I_SB(inode), - "Swapfile not supported in LFS mode"); + if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Swapfile not supported in LFS mode"); return -EINVAL; } @@ -4092,6 +4029,10 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + return ret; + f2fs_precache_extents(inode); ret = check_swap_activate(sis, file, span); @@ -4100,7 +4041,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + f2fs_update_time(sbi, REQ_TIME); return ret; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fdbf994f1271..8b0e1e71b667 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -41,7 +41,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = CAP_BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -135,7 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->cur_ckpt_time = sbi->cprc_info.cur_time; si->peak_ckpt_time = sbi->cprc_info.peak_time; spin_unlock(&sbi->cprc_info.stat_lock); - si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->total_count = BLKS_TO_SEGS(sbi, (int)sbi->user_block_count); si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); @@ -176,11 +176,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + si->util_free = (int)(BLKS_TO_SEGS(sbi, free_user_blocks(sbi))) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; - si->util_valid = (int)(written_block_count(sbi) >> - sbi->log_blocks_per_seg) + si->util_valid = (int)(BLKS_TO_SEGS(sbi, written_block_count(sbi))) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; @@ -208,7 +207,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) if (!blks) continue; - if (blks == sbi->blocks_per_seg) + if (blks == BLKS_PER_SEG(sbi)) si->full_seg[type]++; else si->dirty_seg[type]++; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 042593aed1ec..02c9355176d3 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -830,13 +830,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, return err; } -int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, + struct f2fs_filename *fname) { struct page *page; int err = 0; f2fs_down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, NULL, NULL); + page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -995,9 +996,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { if (found_valid_dirent || !bit_pos) { - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, + f2fs_warn_ratelimited(sbi, + "invalid namelen(0), ino:%u, run fsck to fix.", le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ad8dfac73bd4..48048fa36427 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -43,7 +43,6 @@ bool sanity_check_extent_cache(struct inode *inode) if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) || !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, DATA_GENERIC_ENHANCE)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", __func__, inode->i_ino, ei->blk, ei->fofs, ei->len); @@ -856,10 +855,8 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, goto out; if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { - f2fs_bug_on(sbi, 1); + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) return -EINVAL; - } out: /* * init block age with zero, this can happen when the block age extent diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ff428bee958..fced2b7652f4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -61,7 +61,9 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, - FAULT_BLKADDR, + FAULT_BLKADDR_VALIDITY, + FAULT_BLKADDR_CONSISTENCE, + FAULT_NO_SEGMENT, FAULT_MAX, }; @@ -76,6 +78,11 @@ struct f2fs_fault_info { extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) + +/* maximum retry count for injected failure */ +#define DEFAULT_FAILURE_RETRY_COUNT 8 +#else +#define DEFAULT_FAILURE_RETRY_COUNT 1 #endif /* @@ -143,7 +150,6 @@ struct f2fs_rwsem { struct f2fs_mount_info { unsigned int opt; - int write_io_size_bits; /* Write IO size bits */ block_t root_reserved_blocks; /* root reserved blocks */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ @@ -1081,7 +1087,8 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ -#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) +#define WB_DATA_TYPE(p, f) \ + (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -1111,6 +1118,7 @@ enum count_type { * ... Only can be used with META. */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) +#define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE) enum page_type { DATA = 0, NODE = 1, /* should not change this */ @@ -1205,7 +1213,6 @@ struct f2fs_io_info { unsigned int submitted:1; /* indicate IO submission */ unsigned int in_list:1; /* indicate fio is in io_list */ unsigned int is_por:1; /* indicate IO is from recovery or not */ - unsigned int retry:1; /* need to reallocate block address */ unsigned int encrypted:1; /* indicate file is encrypted */ unsigned int post_read:1; /* require post read */ enum iostat_type io_type; /* io type */ @@ -1407,18 +1414,16 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr); * Layout A: lowest bit should be 1 * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | * bit 0 PAGE_PRIVATE_NOT_POINTER - * bit 1 PAGE_PRIVATE_DUMMY_WRITE - * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION - * bit 3 PAGE_PRIVATE_INLINE_INODE - * bit 4 PAGE_PRIVATE_REF_RESOURCE - * bit 5- f2fs private data + * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 2 PAGE_PRIVATE_INLINE_INODE + * bit 3 PAGE_PRIVATE_REF_RESOURCE + * bit 4- f2fs private data * * Layout B: lowest bit should be 0 * page.private is a wrapped pointer. */ enum { PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ - PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ @@ -1565,7 +1570,6 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; - mempool_t *write_io_dummy; /* Dummy pages */ pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ @@ -1811,6 +1815,37 @@ struct f2fs_sb_info { #endif }; +/* Definitions to access f2fs_sb_info */ +#define SEGS_TO_BLKS(sbi, segs) \ + ((segs) << (sbi)->log_blocks_per_seg) +#define BLKS_TO_SEGS(sbi, blks) \ + ((blks) >> (sbi)->log_blocks_per_seg) + +#define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg) +#define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec)) +#define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec) + +__printf(3, 4) +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...); + +#define f2fs_err(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_notice(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__) +#define f2fs_info(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__) +#define f2fs_debug(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__) + +#define f2fs_err_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_info_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__) + #ifdef CONFIG_F2FS_FAULT_INJECTION #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ __builtin_return_address(0)) @@ -1828,9 +1863,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", - KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type], - func, parent_func); + f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", + f2fs_fault_name[type], func, parent_func); return true; } return false; @@ -2250,9 +2284,30 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; } +static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool cap) +{ + block_t avail_user_block_count; + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; + + if (!__allow_reserved_blocks(sbi, inode, cap)) + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (avail_user_block_count > sbi->unusable_block_count) + avail_user_block_count -= sbi->unusable_block_count; + else + avail_user_block_count = 0; + } + + return avail_user_block_count; +} + static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t *count) + struct inode *inode, blkcnt_t *count, bool partial) { blkcnt_t diff = 0, release = 0; block_t avail_user_block_count; @@ -2275,23 +2330,14 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - - sbi->current_reserved_blocks; - - if (!__allow_reserved_blocks(sbi, inode, true)) - avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; - - if (F2FS_IO_ALIGNED(sbi)) - avail_user_block_count -= sbi->blocks_per_seg * - SM_I(sbi)->additional_reserved_segments; + avail_user_block_count = get_available_block_count(sbi, inode, true); - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { - if (avail_user_block_count > sbi->unusable_block_count) - avail_user_block_count -= sbi->unusable_block_count; - else - avail_user_block_count = 0; - } if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + if (!partial) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } + diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) diff = *count; @@ -2319,20 +2365,6 @@ release_quota: return -ENOSPC; } -__printf(2, 3) -void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...); - -#define f2fs_err(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__) -#define f2fs_warn(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__) -#define f2fs_notice(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__) -#define f2fs_info(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__) -#define f2fs_debug(sbi, fmt, ...) \ - f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__) - #define PAGE_PRIVATE_GET_FUNC(name, flagname) \ static inline bool page_private_##name(struct page *page) \ { \ @@ -2361,17 +2393,14 @@ static inline void clear_page_private_##name(struct page *page) \ PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); static inline unsigned long get_page_private_data(struct page *page) { @@ -2505,11 +2534,8 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; - unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> - sbi->log_blocks_per_seg; - - return segs / sbi->segs_per_sec; + return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1, + BLKS_PER_SEC(sbi)); } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -2573,7 +2599,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 2) - start_addr += sbi->blocks_per_seg; + start_addr += BLKS_PER_SEG(sbi); return start_addr; } @@ -2582,7 +2608,7 @@ static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); if (sbi->cur_cp_pack == 1) - start_addr += sbi->blocks_per_seg; + start_addr += BLKS_PER_SEG(sbi); return start_addr; } @@ -2601,7 +2627,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count, user_block_count; + unsigned int valid_node_count; + unsigned int avail_user_block_count; int err; if (is_inode) { @@ -2621,21 +2648,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + - sbi->current_reserved_blocks + 1; - - if (!__allow_reserved_blocks(sbi, inode, false)) - valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + valid_block_count = sbi->total_valid_block_count + 1; + avail_user_block_count = get_available_block_count(sbi, inode, false); - if (F2FS_IO_ALIGNED(sbi)) - valid_block_count += sbi->blocks_per_seg * - SM_I(sbi)->additional_reserved_segments; - - user_block_count = sbi->user_block_count; - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - user_block_count -= sbi->unusable_block_count; - - if (unlikely(valid_block_count > user_block_count)) { + if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } @@ -3022,6 +3038,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_INLINE_DOTS: case FI_PIN_FILE: case FI_COMPRESS_RELEASED: + case FI_ATOMIC_COMMITTED: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -3445,7 +3462,7 @@ static inline __le32 *get_dnode_addr(struct inode *inode, sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ -#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) +#define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) @@ -3454,11 +3471,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, static inline void verify_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", blkaddr, type); - f2fs_bug_on(sbi, 1); - } } static inline bool __is_valid_data_blkaddr(block_t blkaddr) @@ -3560,7 +3575,8 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode); -int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, + struct f2fs_filename *fname); bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) @@ -3675,15 +3691,14 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); -void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); -void f2fs_get_new_segment(struct f2fs_sb_info *sbi, - unsigned int *newseg, bool new_sec, int dir); -void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); +int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi); +int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3704,7 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); @@ -3754,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, @@ -3794,6 +3811,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); +bool f2fs_is_cp_guaranteed(struct page *page); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -3857,6 +3875,9 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); +int f2fs_gc_range(struct f2fs_sb_info *sbi, + unsigned int start_seg, unsigned int end_seg, + bool dry_run, unsigned int dry_run_sections); int f2fs_resize_fs(struct file *filp, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); @@ -4277,7 +4298,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task); void f2fs_put_page_dic(struct page *page, bool in_task); -unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, + unsigned int ofs_in_node); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -4334,7 +4356,8 @@ static inline void f2fs_put_page_dic(struct page *page, bool in_task) { WARN_ON_ONCE(1); } -static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline unsigned int f2fs_cluster_blocks_are_contiguous( + struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; } static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } @@ -4391,15 +4414,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - if (!f2fs_compressed_file(inode)) + f2fs_down_write(&F2FS_I(inode)->i_sem); + + if (!f2fs_compressed_file(inode)) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return true; - if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + } + if (f2fs_is_mmap_file(inode) || + (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return false; + } fi->i_flags &= ~F2FS_COMPR_FL; stat_dec_compr_inode(inode); clear_inode_flag(inode, FI_COMPRESSED_FILE); f2fs_mark_inode_dirty_sync(inode, true); + + f2fs_up_write(&F2FS_I(inode)->i_sem); return true; } @@ -4502,6 +4534,17 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } +static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + if (f2fs_sb_has_blkzoned(sbi)) { + int devi = f2fs_target_device_index(sbi, blkaddr); + + return !bdev_is_zoned(FDEV(devi).bdev); + } + return true; +} + static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; @@ -4603,10 +4646,36 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); } +static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int cnt) +{ + bool need_submit = false; + int i = 0; + + do { + struct page *page; + + page = find_get_page(META_MAPPING(sbi), blkaddr + i); + if (page) { + if (PageWriteback(page)) + need_submit = true; + f2fs_put_page(page, 0); + } + } while (++i < cnt && !need_submit); + + if (need_submit) + f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, + NULL, 0, DATA); + + truncate_inode_pages_range(META_MAPPING(sbi), + F2FS_BLK_TO_BYTES((loff_t)blkaddr), + F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); +} + static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, block_t blkaddr) { - invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr); + f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1); f2fs_invalidate_compress_page(sbi, blkaddr); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b58ab1157b7e..1761ad125f97 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -39,6 +39,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); + vm_flags_t flags = vmf->vma->vm_flags; vm_fault_t ret; ret = filemap_fault(vmf); @@ -46,7 +47,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_MAPPED_READ_IO, F2FS_BLKSIZE); - trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret); + trace_f2fs_filemap_fault(inode, vmf->pgoff, flags, ret); return ret; } @@ -394,9 +395,20 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return f2fs_do_sync_file(file, start, end, datasync, false); } -static bool __found_offset(struct address_space *mapping, block_t blkaddr, - pgoff_t index, int whence) +static bool __found_offset(struct address_space *mapping, + struct dnode_of_data *dn, pgoff_t index, int whence) { + block_t blkaddr = f2fs_data_blkaddr(dn); + struct inode *inode = mapping->host; + bool compressed_cluster = false; + + if (f2fs_compressed_file(inode)) { + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size)); + + compressed_cluster = first_blkaddr == COMPRESS_ADDR; + } + switch (whence) { case SEEK_DATA: if (__is_valid_data_blkaddr(blkaddr)) @@ -404,8 +416,12 @@ static bool __found_offset(struct address_space *mapping, block_t blkaddr, if (blkaddr == NEW_ADDR && xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) return true; + if (compressed_cluster) + return true; break; case SEEK_HOLE: + if (compressed_cluster) + return false; if (blkaddr == NULL_ADDR) return true; break; @@ -474,7 +490,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; } - if (__found_offset(file->f_mapping, blkaddr, + if (__found_offset(file->f_mapping, &dn, pgofs, whence)) { f2fs_put_dnode(&dn); goto found; @@ -590,8 +606,10 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) f2fs_set_data_blkaddr(dn, NULL_ADDR); if (__is_valid_data_blkaddr(blkaddr)) { - if (!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE)) + if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE)) + continue; + if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) continue; if (compressed_cluster) valid_blocks++; @@ -818,8 +836,6 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) */ if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE)) return true; - if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi)) - return true; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) return true; @@ -1192,7 +1208,6 @@ next_dnode: !f2fs_is_valid_blkaddr(sbi, *blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; } @@ -1478,7 +1493,6 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); break; } @@ -1662,10 +1676,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) } filemap_invalidate_unlock(mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (ret) + return ret; /* write out all moved pages, if possible */ filemap_invalidate_lock(mapping); - filemap_write_and_wait_range(mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); filemap_invalidate_unlock(mapping); @@ -1731,9 +1747,11 @@ next_alloc: f2fs_down_write(&sbi->pin_sem); - f2fs_lock_op(sbi); - f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); - f2fs_unlock_op(sbi); + err = f2fs_allocate_pinning_section(sbi); + if (err) { + f2fs_up_write(&sbi->pin_sem); + goto out_err; + } map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO); @@ -2066,7 +2084,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) inode_lock(inode); - if (!f2fs_disable_compressed_file(inode)) { + if (!f2fs_disable_compressed_file(inode) || + f2fs_is_pinned_file(inode)) { ret = -EINVAL; goto out; } @@ -2243,8 +2262,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) + if (ret) { + if (ret == -EIO) + ret = 0; goto out; + } f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: @@ -2260,6 +2282,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); + if (ret == -EIO) + ret = 0; goto out; default: ret = -EINVAL; @@ -2578,7 +2602,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, .m_may_create = false }; struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; - unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; block_t blk_end = 0; bool fragmented = false; @@ -2687,7 +2710,8 @@ do_map: set_inode_flag(inode, FI_SKIP_WRITES); idx = map.m_lblk; - while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + while (idx < map.m_lblk + map.m_len && + cnt < BLKS_PER_SEG(sbi)) { struct page *page; page = f2fs_get_lock_data_page(inode, idx, true); @@ -2707,7 +2731,7 @@ do_map: map.m_lblk = idx; check: - if (map.m_lblk < pg_end && cnt < blk_per_seg) + if (map.m_lblk < pg_end && cnt < BLKS_PER_SEG(sbi)) goto do_map; clear_inode_flag(inode, FI_SKIP_WRITES); @@ -2976,8 +3000,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || __is_large_section(sbi)) { - f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1", - range.dev_num, sbi->s_ndevs, sbi->segs_per_sec); + f2fs_warn(sbi, "Can't flush %u in %d for SEGS_PER_SEC %u != 1", + range.dev_num, sbi->s_ndevs, SEGS_PER_SEC(sbi)); return -EINVAL; } @@ -3183,6 +3207,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 pin; int ret = 0; @@ -3192,7 +3217,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + if (f2fs_readonly(sbi->sb)) return -EROFS; ret = mnt_want_write_file(filp); @@ -3205,9 +3230,18 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; + } else if (f2fs_is_pinned_file(inode)) { + goto done; } - if (f2fs_should_update_outplace(inode, NULL)) { + if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) { + ret = -EFBIG; + goto out; + } + + /* Let's allow file pinning on zoned device. */ + if (!f2fs_sb_has_blkzoned(sbi) && + f2fs_should_update_outplace(inode, NULL)) { ret = -EINVAL; goto out; } @@ -3229,7 +3263,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) set_inode_flag(inode, FI_PIN_FILE); ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + f2fs_update_time(sbi, REQ_TIME); out: inode_unlock(inode); mnt_drop_write_file(filp); @@ -3438,10 +3472,8 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) { - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; - } } while (count) { @@ -3588,10 +3620,10 @@ out: return ret; } -static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, + unsigned int *reserved_blocks) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - unsigned int reserved_blocks = 0; int cluster_size = F2FS_I(dn->inode)->i_cluster_size; block_t blkaddr; int i; @@ -3603,10 +3635,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) if (!__is_valid_data_blkaddr(blkaddr)) continue; if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, - DATA_GENERIC_ENHANCE))) { - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; - } } while (count) { @@ -3614,40 +3644,53 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count) blkcnt_t reserved; int ret; - for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { - blkaddr = f2fs_data_blkaddr(dn); + for (i = 0; i < cluster_size; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); if (i == 0) { - if (blkaddr == COMPRESS_ADDR) - continue; - dn->ofs_in_node += cluster_size; - goto next; + if (blkaddr != COMPRESS_ADDR) { + dn->ofs_in_node += cluster_size; + goto next; + } + continue; } - if (__is_valid_data_blkaddr(blkaddr)) { + /* + * compressed cluster was not released due to it + * fails in release_compress_blocks(), so NEW_ADDR + * is a possible case. + */ + if (blkaddr == NEW_ADDR || + __is_valid_data_blkaddr(blkaddr)) { compr_blocks++; continue; } - - f2fs_set_data_blkaddr(dn, NEW_ADDR); } reserved = cluster_size - compr_blocks; - ret = inc_valid_block_count(sbi, dn->inode, &reserved); - if (ret) + + /* for the case all blocks in cluster were reserved */ + if (reserved == 1) + goto next; + + ret = inc_valid_block_count(sbi, dn->inode, &reserved, false); + if (unlikely(ret)) return ret; - if (reserved != cluster_size - compr_blocks) - return -ENOSPC; + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + if (f2fs_data_blkaddr(dn) == NULL_ADDR) + f2fs_set_data_blkaddr(dn, NEW_ADDR); + } f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); - reserved_blocks += reserved; + *reserved_blocks += reserved; next: count -= cluster_size; } - return reserved_blocks; + return 0; } static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) @@ -3671,9 +3714,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) - goto out; - f2fs_balance_fs(sbi, true); inode_lock(inode); @@ -3683,6 +3723,9 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) goto unlock_inode; } + if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) + goto unlock_inode; + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); @@ -3708,7 +3751,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, F2FS_I(inode)->i_cluster_size); - ret = reserve_compress_blocks(&dn, count); + ret = reserve_compress_blocks(&dn, count, &reserved_blocks); f2fs_put_dnode(&dn); @@ -3716,23 +3759,21 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; page_idx += count; - reserved_blocks += ret; } filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (ret >= 0) { + if (!ret) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); } unlock_inode: inode_unlock(inode); -out: mnt_drop_write_file(filp); - if (ret >= 0) { + if (!ret) { ret = put_user(reserved_blocks, (u64 __user *)arg); } else if (reserved_blocks && atomic_read(&F2FS_I(inode)->i_compr_blocks)) { @@ -3877,8 +3918,6 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) DATA_GENERIC_ENHANCE)) { ret = -EFSCORRUPTED; f2fs_put_dnode(&dn); - f2fs_handle_error(sbi, - ERROR_INVALID_BLKADDR); goto out; } @@ -3981,16 +4020,20 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) sizeof(option))) return -EFAULT; - if (!f2fs_compressed_file(inode) || - option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || - option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || - option.algorithm >= COMPRESS_MAX) + if (option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) return -EINVAL; file_start_write(filp); inode_lock(inode); f2fs_down_write(&F2FS_I(inode)->i_sem); + if (!f2fs_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { ret = -EBUSY; goto out; @@ -4066,7 +4109,6 @@ static int f2fs_ioc_decompress_file(struct file *filp) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; - unsigned int blk_per_seg = sbi->blocks_per_seg; int cluster_size = fi->i_cluster_size; int count, ret; @@ -4110,7 +4152,7 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) { + if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) { ret = filemap_fdatawrite(inode->i_mapping); if (ret < 0) break; @@ -4145,7 +4187,6 @@ static int f2fs_ioc_compress_file(struct file *filp) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t page_idx = 0, last_idx; - unsigned int blk_per_seg = sbi->blocks_per_seg; int cluster_size = F2FS_I(inode)->i_cluster_size; int count, ret; @@ -4188,7 +4229,7 @@ static int f2fs_ioc_compress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) { + if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) { ret = filemap_fdatawrite(inode->i_mapping); if (ret < 0) break; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a079eebfb080..8852814dab7f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -259,7 +259,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = 1; } else { p->gc_mode = select_gc_type(sbi, gc_type); - p->ofs_unit = sbi->segs_per_sec; + p->ofs_unit = SEGS_PER_SEC(sbi); if (__is_large_section(sbi)) { p->dirty_bitmap = dirty_i->dirty_secmap; p->max_search = count_bits(p->dirty_bitmap, @@ -280,11 +280,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - /* let's select beginning hot/small space first in no_heap mode*/ + /* let's select beginning hot/small space first. */ if (f2fs_need_rand_seg(sbi)) - p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); - else if (test_opt(sbi, NOHEAP) && - (type == CURSEG_HOT_DATA || IS_NODESEG(type))) + p->offset = get_random_u32_below(MAIN_SECS(sbi) * + SEGS_PER_SEC(sbi)); + else if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; @@ -295,13 +295,13 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); else if (p->alloc_mode == AT_SSR) return UINT_MAX; /* LFS */ if (p->gc_mode == GC_GREEDY) - return 2 * sbi->blocks_per_seg * p->ofs_unit; + return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit); else if (p->gc_mode == GC_CB) return UINT_MAX; else if (p->gc_mode == GC_AT) @@ -348,7 +348,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) mtime = div_u64(mtime, usable_segs_per_sec); vblocks = div_u64(vblocks, usable_segs_per_sec); - u = (vblocks * 100) >> sbi->log_blocks_per_seg; + u = BLKS_TO_SEGS(sbi, vblocks * 100); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) @@ -496,9 +496,9 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, return; } - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < SEGS_PER_SEC(sbi); i++) mtime += get_seg_entry(sbi, start + i)->mtime; - mtime = div_u64(mtime, sbi->segs_per_sec); + mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) @@ -599,7 +599,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, unsigned long long age; unsigned long long max_mtime = sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; - unsigned int seg_blocks = sbi->blocks_per_seg; unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * @@ -629,7 +628,7 @@ next_node: f2fs_bug_on(sbi, !vblocks); /* rare case */ - if (vblocks == seg_blocks) + if (vblocks == BLKS_PER_SEG(sbi)) goto skip_node; iter++; @@ -755,7 +754,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, int ret = 0; mutex_lock(&dirty_i->seglist_lock); - last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec; + last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi); p.alloc_mode = alloc_mode; p.age = age; @@ -896,7 +895,7 @@ next: else sm->last_victim[p.gc_mode] = segno + p.ofs_unit; sm->last_victim[p.gc_mode] %= - (MAIN_SECS(sbi) * sbi->segs_per_sec); + (MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); break; } } @@ -1184,7 +1183,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index) .op_flags = 0, .encrypted_page = NULL, .in_list = 0, - .retry = 0, }; int err; @@ -1197,7 +1195,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } goto got_it; @@ -1216,7 +1213,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto put_page; } got_it: @@ -1273,7 +1269,6 @@ static int move_data_block(struct inode *inode, block_t bidx, .op_flags = 0, .encrypted_page = NULL, .in_list = 0, - .retry = 0, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -1364,8 +1359,13 @@ static int move_data_block(struct inode *inode, block_t bidx, set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* allocate block address */ - f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); + if (err) { + f2fs_put_page(mpage, 1); + /* filesystem should shutdown, no need to recovery block */ + goto up_out; + } fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -1393,18 +1393,12 @@ static int move_data_block(struct inode *inode, block_t bidx, fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); - if (fio.retry) { - err = -EAGAIN; - if (PageWriteback(fio.encrypted_page)) - end_page_writeback(fio.encrypted_page); - goto put_page_out; - } f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); -put_page_out: + f2fs_put_page(fio.encrypted_page, 1); recover_block: if (err) @@ -1678,7 +1672,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum; struct blk_plug plug; unsigned int segno = start_segno; - unsigned int end_segno = start_segno + sbi->segs_per_sec; + unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi); int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -1686,7 +1680,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, int submitted = 0; if (__is_large_section(sbi)) - end_segno = rounddown(end_segno, sbi->segs_per_sec); + end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi)); /* * zone-capacity can be less than zone-size in zoned devices, @@ -1694,7 +1688,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * calculate the end segno in the zone which can be garbage collected */ if (f2fs_sb_has_blkzoned(sbi)) - end_segno -= sbi->segs_per_sec - + end_segno -= SEGS_PER_SEC(sbi) - f2fs_usable_segs_in_sec(sbi, segno); sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); @@ -1983,10 +1977,43 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) init_atgc_management(sbi); } +int f2fs_gc_range(struct f2fs_sb_info *sbi, + unsigned int start_seg, unsigned int end_seg, + bool dry_run, unsigned int dry_run_sections) +{ + unsigned int segno; + unsigned int gc_secs = dry_run_sections; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) { + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + + do_garbage_collect(sbi, segno, &gc_list, FG_GC, + dry_run_sections == 0); + put_gc_inode(&gc_list); + + if (!dry_run && get_valid_blocks(sbi, segno, true)) + return -EAGAIN; + if (dry_run && dry_run_sections && + !get_valid_blocks(sbi, segno, true) && --gc_secs == 0) + break; + + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + } + + return 0; +} + static int free_segment_range(struct f2fs_sb_info *sbi, - unsigned int secs, bool gc_only) + unsigned int secs, bool dry_run) { - unsigned int segno, next_inuse, start, end; + unsigned int next_inuse, start, end; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; int gc_mode, gc_type; int err = 0; @@ -1994,7 +2021,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, /* Force block allocation for GC */ MAIN_SECS(sbi) -= secs; - start = MAIN_SECS(sbi) * sbi->segs_per_sec; + start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi); end = MAIN_SEGS(sbi) - 1; mutex_lock(&DIRTY_I(sbi)->seglist_lock); @@ -2008,29 +2035,15 @@ static int free_segment_range(struct f2fs_sb_info *sbi, mutex_unlock(&DIRTY_I(sbi)->seglist_lock); /* Move out cursegs from the target range */ - for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) - f2fs_allocate_segment_for_resize(sbi, type, start, end); - - /* do GC to move out valid blocks in the range */ - for (segno = start; segno <= end; segno += sbi->segs_per_sec) { - struct gc_inode_list gc_list = { - .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), - }; - - do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); - put_gc_inode(&gc_list); - - if (!gc_only && get_valid_blocks(sbi, segno, true)) { - err = -EAGAIN; - goto out; - } - if (fatal_signal_pending(current)) { - err = -ERESTARTSYS; + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) { + err = f2fs_allocate_segment_for_resize(sbi, type, start, end); + if (err) goto out; - } } - if (gc_only) + + /* do GC to move out valid blocks in the range */ + err = f2fs_gc_range(sbi, start, end, dry_run, 0); + if (err || dry_run) goto out; stat_inc_cp_call_count(sbi, TOTAL_CALL); @@ -2056,7 +2069,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) int segment_count; int segment_count_main; long long block_count; - int segs = secs * sbi->segs_per_sec; + int segs = secs * SEGS_PER_SEC(sbi); f2fs_down_write(&sbi->sb_lock); @@ -2069,7 +2082,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->segment_count = cpu_to_le32(segment_count + segs); raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); raw_sb->block_count = cpu_to_le64(block_count + - (long long)segs * sbi->blocks_per_seg); + (long long)SEGS_TO_BLKS(sbi, segs)); if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; int dev_segs = @@ -2084,8 +2097,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) { - int segs = secs * sbi->segs_per_sec; - long long blks = (long long)segs * sbi->blocks_per_seg; + int segs = secs * SEGS_PER_SEC(sbi); + long long blks = SEGS_TO_BLKS(sbi, segs); long long user_block_count = le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); @@ -2127,7 +2140,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count) int last_dev = sbi->s_ndevs - 1; __u64 last_segs = FDEV(last_dev).total_segments; - if (block_count + last_segs * sbi->blocks_per_seg <= + if (block_count + SEGS_TO_BLKS(sbi, last_segs) <= old_block_count) return -EINVAL; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 28a00942802c..9c0d06c4d19a 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -96,7 +96,7 @@ static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) if (f2fs_sb_has_blkzoned(sbi)) return free_segs_blk_count_zoned(sbi); - return free_segments(sbi) << sbi->log_blocks_per_seg; + return SEGS_TO_BLKS(sbi, free_segments(sbi)); } static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) @@ -104,7 +104,7 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) block_t free_blks, ovp_blks; free_blks = free_segs_blk_count(sbi); - ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + ovp_blks = SEGS_TO_BLKS(sbi, overprovision_segments(sbi)); if (free_blks < ovp_blks) return 0; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f7f63a567d86..e54f8c08bda8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -851,7 +851,7 @@ out: static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode, bool is_whiteout, - struct inode **new_inode) + struct inode **new_inode, struct f2fs_filename *fname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -879,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out; - err = f2fs_do_tmpfile(inode, dir); + err = f2fs_do_tmpfile(inode, dir, fname); if (err) goto release_out; @@ -930,22 +930,24 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL); + err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL, NULL); return finish_open_simple(file, err); } static int f2fs_create_whiteout(struct mnt_idmap *idmap, - struct inode *dir, struct inode **whiteout) + struct inode *dir, struct inode **whiteout, + struct f2fs_filename *fname) { - return __f2fs_tmpfile(idmap, dir, NULL, - S_IFCHR | WHITEOUT_MODE, true, whiteout); + return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, + true, whiteout, fname); } int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct inode **new_inode) { - return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode); + return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, + false, new_inode, NULL); } static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, @@ -989,7 +991,14 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(idmap, old_dir, &whiteout); + struct f2fs_filename fname; + + err = f2fs_setup_filename(old_dir, &old_dentry->d_name, + 0, &fname); + if (err) + return err; + + err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname); if (err) return err; } @@ -1104,14 +1113,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, iput(whiteout); } - if (old_is_dir) { - if (old_dir_entry) - f2fs_set_link(old_inode, old_dir_entry, - old_dir_page, new_dir); - else - f2fs_put_page(old_dir_page, 0); + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + if (old_is_dir) f2fs_i_links_write(old_dir, false); - } + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); if (S_ISDIR(old_inode->i_mode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9b546fd21010..b3de6d6cdb02 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -852,21 +852,29 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && f2fs_sb_has_readonly(sbi)) { - unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + unsigned int ofs_in_node = dn->ofs_in_node; + pgoff_t fofs = index; + unsigned int c_len; block_t blkaddr; + /* should align fofs and ofs_in_node to cluster_size */ + if (fofs % cluster_size) { + fofs = round_down(fofs, cluster_size); + ofs_in_node = round_down(ofs_in_node, cluster_size); + } + + c_len = f2fs_cluster_blocks_are_contiguous(dn, ofs_in_node); if (!c_len) goto out; - blkaddr = f2fs_data_blkaddr(dn); + blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node); if (blkaddr == COMPRESS_ADDR) blkaddr = data_blkaddr(dn->inode, dn->node_page, - dn->ofs_in_node + 1); + ofs_in_node + 1); f2fs_update_read_extent_tree_range_compressed(dn->inode, - index, blkaddr, - F2FS_I(dn->inode)->i_cluster_size, - c_len); + fofs, blkaddr, cluster_size, c_len); } out: return 0; @@ -1919,7 +1927,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) for (i = 0; i < nr_folios; i++) { struct page *page = &fbatch.folios[i]->page; - if (!IS_DNODE(page)) + if (!IS_INODE(page)) continue; lock_page(page); @@ -2841,7 +2849,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, int i, idx, last_offset, nrpages; /* scan the node segment */ - last_offset = sbi->blocks_per_seg; + last_offset = BLKS_PER_SEG(sbi); addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; @@ -3158,7 +3166,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return 0; - nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { struct page *page; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 5bd16a95eef8..6aea13024ac1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -208,10 +208,10 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (block_off << 1) - - (block_off & (sbi->blocks_per_seg - 1))); + (block_off & (BLKS_PER_SEG(sbi) - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) - block_addr += sbi->blocks_per_seg; + block_addr += BLKS_PER_SEG(sbi); return block_addr; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d0f24ccbd1ac..e7bf15b8240a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -354,7 +354,7 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, if (blkaddr + 1 == next_blkaddr) ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, ra_blocks * 2); - else if (next_blkaddr % sbi->blocks_per_seg) + else if (next_blkaddr % BLKS_PER_SEG(sbi)) ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, ra_blocks / 2); return ra_blocks; @@ -611,6 +611,19 @@ truncate_out: return 0; } +static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn) +{ + int i, err = 0; + + for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) { + err = f2fs_reserve_new_block(dn); + if (!err) + break; + } + + return err; +} + static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page) { @@ -680,14 +693,12 @@ retry_dn: if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } if (__is_valid_data_blkaddr(dest) && !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { err = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto err; } @@ -712,14 +723,8 @@ retry_dn: */ if (dest == NEW_ADDR) { f2fs_truncate_data_blocks_range(&dn, 1); - do { - err = f2fs_reserve_new_block(&dn); - if (err == -ENOSPC) { - f2fs_bug_on(sbi, 1); - break; - } - } while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + + err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; continue; @@ -727,16 +732,8 @@ retry_dn: /* dest is valid block, try to recover from src to dest */ if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { - if (src == NULL_ADDR) { - do { - err = f2fs_reserve_new_block(&dn); - if (err == -ENOSPC) { - f2fs_bug_on(sbi, 1); - break; - } - } while (err && - IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)); + err = f2fs_reserve_new_block_retry(&dn); if (err) goto err; } @@ -756,8 +753,6 @@ retry_prev: f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; - f2fs_handle_error(sbi, - ERROR_INVALID_BLKADDR); goto err; } @@ -852,7 +847,7 @@ next: f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } if (!err) - f2fs_allocate_new_segments(sbi); + err = f2fs_allocate_new_segments(sbi); return err; } @@ -864,7 +859,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; - bool fix_curseg_write_pointer = false; if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); @@ -895,8 +889,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) else f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: - fix_curseg_write_pointer = !check_only || list_empty(&inode_list); - destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); @@ -914,11 +906,13 @@ skip: * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ - if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && - f2fs_sb_has_blkzoned(sbi)) { - err = f2fs_fix_curseg_write_pointer(sbi); - if (!err) - err = f2fs_check_write_pointer(sbi); + if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) { + int err2 = f2fs_fix_curseg_write_pointer(sbi); + + if (!err2) + err2 = f2fs_check_write_pointer(sbi); + if (err2) + err = err2; ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e1065ba70207..4fd76e867e0a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,6 +192,9 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; + if (clean) + truncate_inode_pages_final(inode->i_mapping); + release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); @@ -201,7 +204,6 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) F2FS_I(inode)->atomic_write_task = NULL; if (clean) { - truncate_inode_pages_final(inode->i_mapping); f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } @@ -248,7 +250,7 @@ retry: } else { blkcnt_t count = 1; - err = inc_valid_block_count(sbi, inode, &count); + err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; @@ -334,8 +336,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode) DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; - f2fs_handle_error(sbi, - ERROR_INVALID_BLKADDR); goto out; } @@ -400,6 +400,9 @@ int f2fs_commit_atomic_write(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (f2fs_cp_error(sbi)) + return; + if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); @@ -448,8 +451,8 @@ static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); - unsigned int threshold = sbi->blocks_per_seg * factor * - DEFAULT_DIRTY_THRESHOLD; + unsigned int threshold = + SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); unsigned int global_threshold = threshold * 3 / 2; if (dents >= threshold || qdata >= threshold || @@ -872,7 +875,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); - block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg; + block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t holes[2] = {0, 0}; /* DATA and NODE */ block_t unusable; @@ -901,11 +904,16 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); + + if (F2FS_OPTION(sbi).unusable_cap_perc == 100) + return 0; if (unusable > F2FS_OPTION(sbi).unusable_cap) return -EAGAIN; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; + if (has_not_enough_free_secs(sbi, 0, 0)) + return -EAGAIN; return 0; } @@ -1132,8 +1140,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, struct seg_entry *sentry; unsigned int segno; block_t blk = start; - unsigned long offset, size, max_blocks = sbi->blocks_per_seg; - unsigned long *map; + unsigned long offset, size, *map; while (blk < end) { segno = GET_SEGNO(sbi, blk); @@ -1143,7 +1150,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, if (end < START_BLOCK(sbi, segno + 1)) size = GET_BLKOFF_FROM_SEG0(sbi, end); else - size = max_blocks; + size = BLKS_PER_SEG(sbi); map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); @@ -2048,7 +2055,6 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - int max_blocks = sbi->blocks_per_seg; struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; @@ -2060,8 +2066,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || - !f2fs_block_unit_discard(sbi)) + if (se->valid_blocks == BLKS_PER_SEG(sbi) || + !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) return false; if (!force) { @@ -2078,13 +2085,14 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { - start = __find_rev_next_bit(dmap, max_blocks, end + 1); - if (start >= max_blocks) + start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); + if (start >= BLKS_PER_SEG(sbi)) break; - end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); - if (force && start && end != max_blocks - && (end - start) < cpc->trim_minlen) + end = __find_rev_next_zero_bit(dmap, + BLKS_PER_SEG(sbi), start + 1); + if (force && start && end != BLKS_PER_SEG(sbi) && + (end - start) < cpc->trim_minlen) continue; if (check_only) @@ -2166,8 +2174,8 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, start + 1); if (section_alignment) { - start = rounddown(start, sbi->segs_per_sec); - end = roundup(end, sbi->segs_per_sec); + start = rounddown(start, SEGS_PER_SEC(sbi)); + end = roundup(end, SEGS_PER_SEC(sbi)); } for (i = start; i < end; i++) { @@ -2186,7 +2194,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, if (!f2fs_sb_has_blkzoned(sbi) && (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), - (end - start) << sbi->log_blocks_per_seg); + SEGS_TO_BLKS(sbi, end - start)); continue; } next: @@ -2195,9 +2203,9 @@ next: if (!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), - sbi->segs_per_sec << sbi->log_blocks_per_seg); + BLKS_PER_SEC(sbi)); - start = start_segno + sbi->segs_per_sec; + start = start_segno + SEGS_PER_SEC(sbi); if (start < end) goto next; else @@ -2216,7 +2224,7 @@ next: find_next: if (is_valid) { next_pos = find_next_zero_bit_le(entry->discard_map, - sbi->blocks_per_seg, cur_pos); + BLKS_PER_SEG(sbi), cur_pos); len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || @@ -2228,13 +2236,13 @@ find_next: total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, - sbi->blocks_per_seg, cur_pos); + BLKS_PER_SEG(sbi), cur_pos); } skip: cur_pos = next_pos; is_valid = !is_valid; - if (cur_pos < sbi->blocks_per_seg) + if (cur_pos < BLKS_PER_SEG(sbi)) goto find_next; release_discard_addr(entry); @@ -2251,6 +2259,12 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int err = 0; + if (f2fs_sb_has_readonly(sbi)) { + f2fs_info(sbi, + "Skip to start discard thread for readonly image"); + return 0; + } + if (!f2fs_realtime_discard_enable(sbi)) return 0; @@ -2283,7 +2297,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) - dcc->discard_granularity = sbi->blocks_per_seg; + dcc->discard_granularity = BLKS_PER_SEG(sbi); else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEC(sbi); @@ -2297,7 +2311,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; @@ -2405,6 +2419,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) #endif segno = GET_SEGNO(sbi, blkaddr); + if (segno == NULL_SEGNO) + return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; @@ -2546,7 +2562,7 @@ static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int typ struct curseg_info *curseg = CURSEG_I(sbi, type); if (sbi->ckpt->alloc_type[type] == SSR) - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); return curseg->next_blkoff; } @@ -2634,7 +2650,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); - if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } @@ -2643,54 +2659,51 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ -static void get_new_segment(struct f2fs_sb_info *sbi, - unsigned int *newseg, bool new_sec, int dir) +static int get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); - unsigned int left_start = hint; bool init = true; - int go_left = 0; int i; + int ret = 0; spin_lock(&free_i->segmap_lock); - if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { + if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { + ret = -ENOSPC; + goto out_unlock; + } + + if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } + + /* + * If we format f2fs on zoned storage, let's try to get pinned sections + * from beginning of the storage, which should be a conventional one. + */ + if (f2fs_sb_has_blkzoned(sbi)) { + segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg); + hint = GET_SEC_FROM_SEG(sbi, segno); + } + find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { - if (dir == ALLOC_RIGHT) { - secno = find_first_zero_bit(free_i->free_secmap, + secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); - f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); - } else { - go_left = 1; - left_start = hint - 1; + if (secno >= MAIN_SECS(sbi)) { + ret = -ENOSPC; + goto out_unlock; } } - if (go_left == 0) - goto skip_left; - - while (test_bit(left_start, free_i->free_secmap)) { - if (left_start > 0) { - left_start--; - continue; - } - left_start = find_first_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi)); - f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); - break; - } - secno = left_start; -skip_left: segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2701,21 +2714,13 @@ skip_left: goto got_it; if (zoneno == old_zoneno) goto got_it; - if (dir == ALLOC_LEFT) { - if (!go_left && zoneno + 1 >= total_zones) - goto got_it; - if (go_left && zoneno == 0) - goto got_it; - } for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ - if (go_left) - hint = zoneno * sbi->secs_per_zone - 1; - else if (zoneno + 1 >= total_zones) + if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; @@ -2725,9 +2730,23 @@ skip_left: got_it: /* set it as dirty segment in free segmap */ f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); + + /* no free section in conventional zone */ + if (new_sec && pinning && + !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) { + ret = -EAGAIN; + goto out_unlock; + } __set_inuse(sbi, segno); *newseg = segno; +out_unlock: spin_unlock(&free_i->segmap_lock); + + if (ret == -ENOSPC) { + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); + f2fs_bug_on(sbi, 1); + } + return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) @@ -2736,6 +2755,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; unsigned short seg_type = curseg->seg_type; + /* only happen when get_new_segment() fails */ + if (curseg->next_segno == NULL_SEGNO) + return; + curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); @@ -2761,9 +2784,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); - /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) return curseg->segno; @@ -2774,8 +2796,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; - if (test_opt(sbi, NOHEAP) && - (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))) + if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) @@ -2792,30 +2813,31 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ -static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned short seg_type = curseg->seg_type; unsigned int segno = curseg->segno; - int dir = ALLOC_LEFT; + bool pinning = type == CURSEG_COLD_DATA_PINNED; + int ret; if (curseg->inited) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, segno)); - if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA) - dir = ALLOC_RIGHT; - - if (test_opt(sbi, NOHEAP)) - dir = ALLOC_RIGHT; + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); segno = __get_next_segno(sbi, type); - get_new_segment(sbi, &segno, new_sec, dir); + ret = get_new_segment(sbi, &segno, new_sec, pinning); + if (ret) { + if (ret == -ENOSPC) + curseg->segno = NULL_SEGNO; + return ret; + } + curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); + return 0; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2831,7 +2853,7 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, @@ -2842,14 +2864,14 @@ static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { - return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; + return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type) +static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2874,21 +2896,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) if (IS_ERR(sum_page)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); - return; + return PTR_ERR(sum_page); } sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); + return 0; } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age); -static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, +static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, int target_type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); + int ret = 0; curseg->seg_type = target_type; @@ -2896,38 +2920,41 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type); + ret = change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; - new_curseg(sbi, type, true); + ret = new_curseg(sbi, type, true); } stat_inc_seg_type(sbi, curseg); + return ret; } -static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) +static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + int ret = 0; if (!sbi->am.atgc_enabled) - return; + return 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); - get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); + ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, + CURSEG_COLD_DATA, SSR, 0); up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); - + return ret; } -void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) { - __f2fs_init_atgc_curseg(sbi); + return __f2fs_init_atgc_curseg(sbi); } static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) @@ -3055,11 +3082,12 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type) return false; } -void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, +int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; + int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); @@ -3070,9 +3098,9 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type); + ret = change_curseg(sbi, type); else - new_curseg(sbi, type, true); + ret = new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); @@ -3086,45 +3114,84 @@ unlock: mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); + return ret; } -static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, +static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; + int err = 0; + + if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) + goto allocate; if (!force && curseg->inited && !curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, new_sec) && !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) - return; + return 0; +allocate: old_segno = curseg->segno; - new_curseg(sbi, type, true); + err = new_curseg(sbi, type, true); + if (err) + return err; stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); + return 0; } -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) +int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { + int ret; + f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_segment(sbi, type, true, force); + ret = __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); + + return ret; } -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) +{ + int err; + bool gc_required = true; + +retry: + f2fs_lock_op(sbi); + err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1); + f2fs_up_write(&sbi->gc_lock); + + gc_required = false; + if (!err) + goto retry; + } + + return err; +} + +int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + int err = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i, false, false); + err += __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); + + return err; } bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, @@ -3242,8 +3309,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); if (need_align) { - start_segno = rounddown(start_segno, sbi->segs_per_sec); - end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); + end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } cpc.reason = CP_DISCARD; @@ -3416,7 +3483,14 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = NULL_SEGNO; + curseg->next_segno = 0; +} + +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) @@ -3427,12 +3501,18 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; bool segment_full = false; + int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); + if (curseg->segno == NULL_SEGNO) { + ret = -ENOSPC; + goto out_err; + } + if (from_gc) { f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); @@ -3441,7 +3521,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg); + f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); f2fs_wait_discard_bio(sbi, *new_blkaddr); @@ -3470,25 +3550,35 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, * since SSR needs latest valid block information. */ update_sit_entry(sbi, *new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, old_blkaddr, -1); /* * If the current segment is full, flush it out and replace it with a * new segment. */ if (segment_full) { + if (type == CURSEG_COLD_DATA_PINNED && + !((curseg->segno + 1) % sbi->segs_per_sec)) { + reset_curseg_fields(curseg); + goto skip_new_segment; + } + if (from_gc) { - get_atssr_segment(sbi, type, se->type, + ret = get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); } else { if (need_new_seg(sbi, type)) - new_curseg(sbi, type, false); + ret = new_curseg(sbi, type, false); else - change_curseg(sbi, type); + ret = change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } + + if (ret) + goto out_err; } + +skip_new_segment: /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous @@ -3497,12 +3587,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - if (IS_DATASEG(type)) + if (IS_DATASEG(curseg->seg_type)) atomic64_inc(&sbi->allocated_data_blocks); up_write(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) { + if (page && IS_NODESEG(curseg->seg_type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); f2fs_inode_chksum_set(sbi, page); @@ -3511,9 +3601,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (fio) { struct f2fs_bio_info *io; - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = 0; - INIT_LIST_HEAD(&fio->list); fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; @@ -3523,8 +3610,15 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); - f2fs_up_read(&SM_I(sbi)->curseg_lock); + return 0; +out_err: + *new_blkaddr = NULL_ADDR; + up_write(&sit_i->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + f2fs_up_read(&SM_I(sbi)->curseg_lock); + return ret; + } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, @@ -3561,21 +3655,25 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); -reallocate: - f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio); + + if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio)) { + if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) + fscrypt_finalize_bounce_page(&fio->encrypted_page); + if (PageWriteback(fio->page)) + end_page_writeback(fio->page); + if (f2fs_in_warm_node_list(fio->sbi, fio->page)) + f2fs_del_fsync_node_entry(fio->sbi, fio->page); + goto out; + } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); - if (fio->retry) { - fio->old_blkaddr = fio->new_blkaddr; - goto reallocate; - } f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - +out: if (keep_order) f2fs_up_read(&fio->sbi->io_order_lock); } @@ -3659,8 +3757,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) } if (fio->post_read) - invalidate_mapping_pages(META_MAPPING(sbi), - fio->new_blkaddr, fio->new_blkaddr); + f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); @@ -3749,7 +3846,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type); + if (change_curseg(sbi, type)) + goto out_unlock; } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3775,12 +3873,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type); + if (change_curseg(sbi, type)) + goto out_unlock; } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; } +out_unlock: up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_write(&SM_I(sbi)->curseg_lock); @@ -3850,7 +3950,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); - invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1); + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) @@ -3892,7 +3992,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) - blk_off = sbi->blocks_per_seg; + blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; @@ -3960,7 +4060,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) struct f2fs_summary *ns = &sum->entries[0]; int i; - for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { + for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } @@ -4466,7 +4566,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) #endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); - sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; + sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); sit_i->written_valid_blocks = 0; sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; @@ -4539,9 +4639,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) array[i].seg_type = CURSEG_COLD_DATA; else if (i == CURSEG_ALL_DATA_ATGC) array[i].seg_type = CURSEG_COLD_DATA; - array[i].segno = NULL_SEGNO; - array[i].next_blkoff = 0; - array[i].inited = false; + reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } @@ -4593,21 +4691,20 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; - if (f2fs_block_unit_discard(sbi)) { - /* build discard map only one time */ - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, + if (!f2fs_block_unit_discard(sbi)) + goto init_discard_map_done; + + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, + goto init_discard_map_done; + } + memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - + sbi->discard_blks += BLKS_PER_SEG(sbi) - se->valid_blocks; - } - } - +init_discard_map_done: if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; @@ -4747,7 +4844,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) return; mutex_lock(&dirty_i->seglist_lock); - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); @@ -4846,7 +4943,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) if (curseg->alloc_type == SSR) continue; - for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) { + for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) { if (!f2fs_test_bit(blkofs, se->cur_valid_map)) continue; out: @@ -4862,6 +4959,16 @@ out: } #ifdef CONFIG_BLK_DEV_ZONED +static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = { + [BLK_ZONE_COND_NOT_WP] = "NOT_WP", + [BLK_ZONE_COND_EMPTY] = "EMPTY", + [BLK_ZONE_COND_IMP_OPEN] = "IMPLICIT_OPEN", + [BLK_ZONE_COND_EXP_OPEN] = "EXPLICIT_OPEN", + [BLK_ZONE_COND_CLOSED] = "CLOSED", + [BLK_ZONE_COND_READONLY] = "READONLY", + [BLK_ZONE_COND_FULL] = "FULL", + [BLK_ZONE_COND_OFFLINE] = "OFFLINE", +}; static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, @@ -4883,14 +4990,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ - if (zone_segno >= MAIN_SEGS(sbi) || - IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) + if (zone_segno >= MAIN_SEGS(sbi)) return 0; /* * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); + if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { + f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", + zone_segno, valid_block_cnt, + f2fs_zone_status[zone->cond]); + return 0; + } if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) @@ -4898,8 +5010,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, if (!valid_block_cnt) { f2fs_notice(sbi, "Zone without valid block has non-zero write " - "pointer. Reset the write pointer: cond[0x%x]", - zone->cond); + "pointer. Reset the write pointer: cond[%s]", + f2fs_zone_status[zone->cond]); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) @@ -4916,8 +5028,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, * selected for write operation until it get discarded. */ f2fs_notice(sbi, "Valid blocks are not aligned with write " - "pointer: valid block[0x%x,0x%x] cond[0x%x]", - zone_segno, valid_block_cnt, zone->cond); + "pointer: valid block[0x%x,0x%x] cond[%s]", + zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, @@ -5128,7 +5240,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( unsigned int secno; if (!sbi->unusable_blocks_per_sec) - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); secno = GET_SEC_FROM_SEG(sbi, segno); seg_start = START_BLOCK(sbi, segno); @@ -5143,10 +5255,10 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( */ if (seg_start >= sec_cap_blkaddr) return 0; - if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr) + if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr) return sec_cap_blkaddr - seg_start; - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); } #else int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) @@ -5172,7 +5284,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi)) return f2fs_usable_zone_blks_in_seg(sbi, segno); - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); } unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, @@ -5181,7 +5293,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi)) return CAP_SEGS_PER_SEC(sbi); - return sbi->segs_per_sec; + return SEGS_PER_SEC(sbi); } /* @@ -5196,14 +5308,14 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = ULLONG_MAX; - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { unsigned int i; unsigned long long mtime = 0; - for (i = 0; i < sbi->segs_per_sec; i++) + for (i = 0; i < SEGS_PER_SEC(sbi); i++) mtime += get_seg_entry(sbi, segno + i)->mtime; - mtime = div_u64(mtime, sbi->segs_per_sec); + mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; @@ -5242,7 +5354,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - sm_info->min_seq_blocks = sbi->blocks_per_seg; + sm_info->min_seq_blocks = BLKS_PER_SEG(sbi); sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8129be788bd5..e1c0f418aa11 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -48,21 +48,21 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ - (sbi)->segs_per_sec) || \ + SEGS_PER_SEC(sbi)) || \ ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ - (sbi)->segs_per_sec)) + SEGS_PER_SEC(sbi))) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ @@ -77,40 +77,37 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define TOTAL_SEGS(sbi) \ (SM_I(sbi) ? SM_I(sbi)->segment_count : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi))) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) #define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) + (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno)))) #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) + (BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr))) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1)) #define GET_SEGNO(sbi, blk_addr) \ ((!__is_valid_data_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define BLKS_PER_SEC(sbi) \ - ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define CAP_BLKS_PER_SEC(sbi) \ - ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ - (sbi)->unusable_blocks_per_sec) + (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec) #define CAP_SEGS_PER_SEC(sbi) \ - ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ - (sbi)->log_blocks_per_seg)) + (SEGS_PER_SEC(sbi) - \ + BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec)) #define GET_SEC_FROM_SEG(sbi, segno) \ - (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi)) #define GET_SEG_FROM_SEC(sbi, secno) \ - ((secno) * (sbi)->segs_per_sec) + ((secno) * SEGS_PER_SEC(sbi)) #define GET_ZONE_FROM_SEC(sbi, secno) \ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ @@ -139,16 +136,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* - * indicate a block allocation direction: RIGHT and LEFT. - * RIGHT means allocating new sections towards the end of volume. - * LEFT means the opposite direction. - */ -enum { - ALLOC_RIGHT = 0, - ALLOC_LEFT -}; - -/* * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. @@ -364,7 +351,7 @@ static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, unsigned int blocks = 0; int i; - for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { struct seg_entry *se = get_seg_entry(sbi, start_segno); blocks += se->ckpt_valid_blocks; @@ -449,7 +436,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) free_i->free_segments++; next = find_next_bit(free_i->free_segmap, - start_segno + sbi->segs_per_sec, start_segno); + start_segno + SEGS_PER_SEC(sbi), start_segno); if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; @@ -485,7 +472,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, - start_segno + sbi->segs_per_sec, start_segno); + start_segno + SEGS_PER_SEC(sbi), start_segno); if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; @@ -573,23 +560,22 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int segno, left_blocks; + unsigned segno, left_blocks; int i; - /* check current node segment */ + /* check current node sections in the worst case. */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; - left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; - + left_blocks = CAP_BLKS_PER_SEC(sbi) - + get_ckpt_valid_blocks(sbi, segno, true); if (node_blocks > left_blocks) return false; } - /* check current data segment */ + /* check current data section for dentry blocks. */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; - left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - - get_seg_entry(sbi, segno)->ckpt_valid_blocks; + left_blocks = CAP_BLKS_PER_SEC(sbi) - + get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) return false; return true; @@ -638,7 +624,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (free_secs > upper_secs) return false; - else if (free_secs <= lower_secs) + if (free_secs <= lower_secs) return true; return !curseg_space; } @@ -793,10 +779,10 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (usable_blks_per_seg < sbi->blocks_per_seg) + if (usable_blks_per_seg < BLKS_PER_SEG(sbi)) f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, - sbi->blocks_per_seg, - usable_blks_per_seg) != sbi->blocks_per_seg); + BLKS_PER_SEG(sbi), + usable_blks_per_seg) != BLKS_PER_SEG(sbi)); /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg @@ -915,9 +901,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) return 0; if (type == DATA) - return sbi->blocks_per_seg; + return BLKS_PER_SEG(sbi); else if (type == NODE) - return 8 * sbi->blocks_per_seg; + return SEGS_TO_BLKS(sbi, 8); else if (type == META) return 8 * BIO_MAX_VECS; else @@ -969,3 +955,13 @@ wake_up: dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } + +static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi) +{ + int devi; + + for (devi = 0; devi < sbi->s_ndevs; devi++) + if (bdev_is_zoned(FDEV(devi).bdev)) + return GET_SEGNO(sbi, FDEV(devi).start_blk); + return 0; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f6ffbfe75653..a6867f26f141 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,24 +44,26 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION const char *f2fs_fault_name[FAULT_MAX] = { - [FAULT_KMALLOC] = "kmalloc", - [FAULT_KVMALLOC] = "kvmalloc", - [FAULT_PAGE_ALLOC] = "page alloc", - [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_NID] = "alloc nid", - [FAULT_ORPHAN] = "orphan", - [FAULT_BLOCK] = "no more block", - [FAULT_DIR_DEPTH] = "too big dir depth", - [FAULT_EVICT_INODE] = "evict_inode fail", - [FAULT_TRUNCATE] = "truncate fail", - [FAULT_READ_IO] = "read IO error", - [FAULT_CHECKPOINT] = "checkpoint error", - [FAULT_DISCARD] = "discard error", - [FAULT_WRITE_IO] = "write IO error", - [FAULT_SLAB_ALLOC] = "slab alloc", - [FAULT_DQUOT_INIT] = "dquot initialize", - [FAULT_LOCK_OP] = "lock_op", - [FAULT_BLKADDR] = "invalid blkaddr", + [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", + [FAULT_READ_IO] = "read IO error", + [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", + [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", + [FAULT_NO_SEGMENT] = "no free segment", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -137,7 +139,6 @@ enum { Opt_resgid, Opt_resuid, Opt_mode, - Opt_io_size_bits, Opt_fault_injection, Opt_fault_type, Opt_lazytime, @@ -216,7 +217,6 @@ static match_table_t f2fs_tokens = { {Opt_resgid, "resgid=%u"}, {Opt_resuid, "resuid=%u"}, {Opt_mode, "mode=%s"}, - {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_fault_type, "fault_type=%u"}, {Opt_lazytime, "lazytime"}, @@ -263,7 +263,8 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; -void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -274,8 +275,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...) level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cF2FS-fs (%s): %pV\n", - KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + if (limit_rate) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); va_end(args); } @@ -343,46 +348,6 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } -static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) -{ - unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; - unsigned int avg_vblocks; - unsigned int wanted_reserved_segments; - block_t avail_user_block_count; - - if (!F2FS_IO_ALIGNED(sbi)) - return 0; - - /* average valid block count in section in worst case */ - avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); - - /* - * we need enough free space when migrating one section in worst case - */ - wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * - reserved_segments(sbi); - wanted_reserved_segments -= reserved_segments(sbi); - - avail_user_block_count = sbi->user_block_count - - sbi->current_reserved_blocks - - F2FS_OPTION(sbi).root_reserved_blocks; - - if (wanted_reserved_segments * sbi->blocks_per_seg > - avail_user_block_count) { - f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", - wanted_reserved_segments, - avail_user_block_count >> sbi->log_blocks_per_seg); - return -ENOSPC; - } - - SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; - - f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", - wanted_reserved_segments); - - return 0; -} - static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -663,7 +628,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) #ifdef CONFIG_F2FS_FS_ZSTD static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) { - unsigned int level; + int level; int len = 4; if (strlen(str) == len) { @@ -677,9 +642,15 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); return -EINVAL; } - if (kstrtouint(str + 1, 10, &level)) + if (kstrtoint(str + 1, 10, &level)) return -EINVAL; + /* f2fs does not support negative compress level now */ + if (level < 0) { + f2fs_info(sbi, "do not support negative compress level: %d", level); + return -ERANGE; + } + if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { f2fs_info(sbi, "invalid zstd compress level: %d", level); return -EINVAL; @@ -763,10 +734,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) clear_opt(sbi, DISCARD); break; case Opt_noheap: - set_opt(sbi, NOHEAP); - break; case Opt_heap: - clear_opt(sbi, NOHEAP); + f2fs_warn(sbi, "heap/no_heap options were deprecated"); break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: @@ -913,16 +882,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; - case Opt_io_size_bits: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { - f2fs_warn(sbi, "Not support %ld, larger than %d", - BIT(arg), BIO_MAX_VECS); - return -EINVAL; - } - F2FS_OPTION(sbi).write_io_size_bits = arg; - break; #ifdef CONFIG_F2FS_FAULT_INJECTION case Opt_fault_injection: if (args->from && match_int(args, &arg)) @@ -1392,12 +1351,6 @@ default_check: } #endif - if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", - F2FS_IO_SIZE_KB(sbi)); - return -EINVAL; - } - if (test_opt(sbi, INLINE_XATTR_SIZE)) { int min_size, max_size; @@ -1718,7 +1671,6 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_page_array_cache(sbi); f2fs_destroy_xattr_caches(sbi); - mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); @@ -2009,10 +1961,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) } else { seq_puts(seq, ",nodiscard"); } - if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap"); - else - seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -2078,9 +2026,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resgid)); - if (F2FS_IO_SIZE_BITS(sbi)) - seq_printf(seq, ",io_bits=%u", - F2FS_OPTION(sbi).write_io_size_bits); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", @@ -2192,7 +2137,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, NOHEAP); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; @@ -2247,6 +2191,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .init_gc_type = FG_GC, .should_migrate_blocks = false, .err_gc_skipped = true, + .no_bg_gc = true, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); @@ -2332,7 +2277,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); - bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); @@ -2440,12 +2384,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { - err = -EINVAL; - f2fs_warn(sbi, "switch io_bits option is not allowed"); - goto restore_opts; - } - if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch compress_cache option is not allowed"); @@ -3706,7 +3644,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } main_segs = le32_to_cpu(raw_super->segment_count_main); - blocks_per_seg = sbi->blocks_per_seg; + blocks_per_seg = BLKS_PER_SEG(sbi); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || @@ -3818,9 +3756,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); - sbi->total_node_count = - (le32_to_cpu(raw_super->segment_count_nat) / 2) - * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; + sbi->total_node_count = SEGS_TO_BLKS(sbi, + ((le32_to_cpu(raw_super->segment_count_nat) / 2) * + NAT_ENTRY_PER_BLOCK)); F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); @@ -3829,7 +3767,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - sbi->migration_granularity = sbi->segs_per_sec; + sbi->migration_granularity = SEGS_PER_SEC(sbi); sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; @@ -3930,11 +3868,6 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; zone_sectors = bdev_zone_sectors(bdev); - if (!is_power_of_2(zone_sectors)) { - f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); - return -EINVAL; - } - if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; @@ -4090,7 +4023,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) f2fs_up_write(&sbi->sb_lock); if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err); + f2fs_err_ratelimited(sbi, + "f2fs_commit_super fails to record stop_reason, err:%d", + err); } void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) @@ -4133,8 +4068,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) err = f2fs_commit_super(sbi, false); if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d", - error, err); + f2fs_err_ratelimited(sbi, + "f2fs_commit_super fails to record errors:%u, err:%d", + error, err); out_unlock: f2fs_up_write(&sbi->sb_lock); } @@ -4259,14 +4195,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (i == 0) { FDEV(i).start_blk = 0; FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); + SEGS_TO_BLKS(sbi, + FDEV(i).total_segments) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); } else { FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; + SEGS_TO_BLKS(sbi, + FDEV(i).total_segments) - 1; FDEV(i).bdev_file = bdev_file_open_by_path( FDEV(i).path, mode, sbi->sb, NULL); } @@ -4305,8 +4241,6 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } - f2fs_info(sbi, - "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -4519,19 +4453,10 @@ try_onemore: if (err) goto free_iostat; - if (F2FS_IO_ALIGNED(sbi)) { - sbi->write_io_dummy = - mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); - if (!sbi->write_io_dummy) { - err = -ENOMEM; - goto free_percpu; - } - } - /* init per sbi slab cache */ err = f2fs_init_xattr_caches(sbi); if (err) - goto free_io_dummy; + goto free_percpu; err = f2fs_init_page_array_cache(sbi); if (err) goto free_xattr_cache; @@ -4619,10 +4544,6 @@ try_onemore: goto free_nm; } - err = adjust_reserved_segment(sbi); - if (err) - goto free_nm; - /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); @@ -4749,13 +4670,20 @@ reset_checkpoint: * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. */ - if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { - err = f2fs_check_write_pointer(sbi); - if (err) - goto free_meta; + if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) { + int err2; + + f2fs_notice(sbi, "Checking entire write pointers"); + err2 = f2fs_check_write_pointer(sbi); + if (err2) + err = err2; } + if (err) + goto free_meta; - f2fs_init_inmem_curseg(sbi); + err = f2fs_init_inmem_curseg(sbi); + if (err) + goto sync_free_meta; /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -4854,8 +4782,6 @@ free_page_array_cache: f2fs_destroy_page_array_cache(sbi); free_xattr_cache: f2fs_destroy_xattr_caches(sbi); -free_io_dummy: - mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); free_iostat: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a7ec55c7bb20..a568ce96cf56 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -493,8 +493,8 @@ out: spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - F2FS_OPTION(sbi).root_reserved_blocks - - sbi->blocks_per_seg * - SM_I(sbi)->additional_reserved_segments)) { + SEGS_TO_BLKS(sbi, + SM_I(sbi)->additional_reserved_segments))) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -551,7 +551,7 @@ out: } if (!strcmp(a->attr.name, "migration_granularity")) { - if (t == 0 || t > sbi->segs_per_sec) + if (t == 0 || t > SEGS_PER_SEC(sbi)) return -EINVAL; } @@ -1492,6 +1492,50 @@ static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused disk_map_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + + seq_printf(seq, "Address Layout : %5luB Block address (# of Segments)\n", + F2FS_BLKSIZE); + seq_printf(seq, " SB : %12s\n", "0/1024B"); + seq_printf(seq, " seg0_blkaddr : 0x%010x\n", SEG0_BLKADDR(sbi)); + seq_printf(seq, " Checkpoint : 0x%010x (%10d)\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2); + seq_printf(seq, " SIT : 0x%010x (%10d)\n", + SIT_I(sbi)->sit_base_addr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit)); + seq_printf(seq, " NAT : 0x%010x (%10d)\n", + NM_I(sbi)->nat_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat)); + seq_printf(seq, " SSA : 0x%010x (%10d)\n", + SM_I(sbi)->ssa_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa)); + seq_printf(seq, " Main : 0x%010x (%10d)\n", + SM_I(sbi)->main_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); + seq_printf(seq, " # of Sections : %12d\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); + seq_printf(seq, " Segs/Sections : %12d\n", + SEGS_PER_SEC(sbi)); + seq_printf(seq, " Section size : %12d MB\n", + SEGS_PER_SEC(sbi) << 1); + + if (!f2fs_is_multi_device(sbi)) + return 0; + + seq_puts(seq, "\nDisk Map for multi devices:\n"); + for (i = 0; i < sbi->s_ndevs; i++) + seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n", + i, bdev_is_zoned(FDEV(i).bdev), + FDEV(i).start_blk, FDEV(i).end_blk, + FDEV(i).path); + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1573,6 +1617,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, discard_plist_seq_show, sb); + proc_create_single_data("disk_map", 0444, sbi->s_proc, + disk_map_seq_show, sb); return 0; put_feature_list_kobj: kobject_put(&sbi->s_feature_list_kobj); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 4fc95f353a7a..f7bb0c54502c 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -258,21 +258,23 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (!page || !PageUptodate(page)) { + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); - if (page) - put_page(page); + if (!IS_ERR(folio)) + folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); - page = read_mapping_page(inode->i_mapping, index, NULL); + folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - return page; + return folio_file_page(folio, index); } static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index c52e63e10d35..509eea96a457 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, fid->parent_i_gen = parent->i_generation; type = FILEID_FAT_WITH_PARENT; *lenp = FAT_FID_SIZE_WITH_PARENT; + } else { + /* + * We need to initialize this field because the fh is actually + * 12 bytes long + */ + fid->parent_i_pos_hi = 0; } return type; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5..8674dbfbe59d 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,14 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_PASSTHROUGH + bool "FUSE passthrough operations support" + default y + depends on FUSE_FS + select FS_STACK + help + This allows bypassing FUSE server by mapping specific FUSE operations + to be performed directly on a backing file. + + If you want to allow passthrough operations, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d..6e0228c6d0cb 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 284a35006462..97ac994ff78f 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - down_read(&fc->killsb); - spin_lock(&fc->bg_lock); - fc->congestion_threshold = val; - spin_unlock(&fc->bg_lock); - up_read(&fc->killsb); + WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1a8f82f478cb..3ec8bb5e68ff 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1775,6 +1775,61 @@ copy_finish: return err; } +/* + * Resending all processing queue requests. + * + * During a FUSE daemon panics and failover, it is possible for some inflight + * requests to be lost and never returned. As a result, applications awaiting + * replies would become stuck forever. To address this, we can use notification + * to trigger resending of these pending requests to the FUSE daemon, ensuring + * they are properly processed again. + * + * Please note that this strategy is applicable only to idempotent requests or + * if the FUSE daemon takes careful measures to avoid processing duplicated + * non-idempotent requests. + */ +static void fuse_resend(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + struct fuse_req *req, *next; + struct fuse_iqueue *fiq = &fc->iq; + LIST_HEAD(to_queue); + unsigned int i; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], &to_queue); + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + list_for_each_entry_safe(req, next, &to_queue, list) { + __set_bit(FR_PENDING, &req->flags); + /* mark the request as resend request */ + req->in.h.unique |= FUSE_UNIQUE_RESEND; + } + + spin_lock(&fiq->lock); + /* iq and pq requests are both oldest to newest */ + list_splice(&to_queue, &fiq->pending); + fiq->ops->wake_pending_and_unlock(fiq); +} + +static int fuse_notify_resend(struct fuse_conn *fc) +{ + fuse_resend(fc); + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1800,6 +1855,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); + case FUSE_NOTIFY_RESEND: + return fuse_notify_resend(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2251,43 +2309,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; + if (get_user(oldfd, argp)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + + fdput(f); + return res; +} + +static long fuse_dev_ioctl_backing_open(struct file *file, + struct fuse_backing_map __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_backing_map map; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (copy_from_user(&map, argp, sizeof(map))) + return -EFAULT; + + return fuse_backing_open(fud->fc, &map); +} + +static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + int backing_id; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (get_user(backing_id, argp)) + return -EFAULT; + + return fuse_backing_close(fud->fc, backing_id); +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + switch (cmd) { case FUSE_DEV_IOC_CLONE: - if (get_user(oldfd, (__u32 __user *)arg)) - return -EFAULT; + return fuse_dev_ioctl_clone(file, argp); - f = fdget(oldfd); - if (!f.file) - return -EINVAL; + case FUSE_DEV_IOC_BACKING_OPEN: + return fuse_dev_ioctl_backing_open(file, argp); + + case FUSE_DEV_IOC_BACKING_CLOSE: + return fuse_dev_ioctl_backing_close(file, argp); - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); - - res = -EINVAL; - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fdput(f); - break; default: - res = -ENOTTY; - break; + return -ENOTTY; } - return res; } const struct file_operations fuse_dev_operations = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d19cbf34c634..4a6df591add6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; + if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { + pr_warn_once("root generation should be zero\n"); + outarg->generation = 0; + } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), @@ -615,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; - struct fuse_open_out outopen; + struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; @@ -630,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; @@ -659,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; - args.out_args[1].size = sizeof(outopen); - args.out_args[1].value = &outopen; + /* Store outarg for fuse_finish_open() */ + outopenp = &ff->args->open_outarg; + args.out_args[1].size = sizeof(*outopenp); + args.out_args[1].value = outopenp; err = get_create_ext(&args, dir, entry, mode); if (err) @@ -676,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_invalid_attr(&outentry.attr)) goto out_free_ff; - ff->fh = outopen.fh; + ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; + ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { @@ -692,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) @@ -1210,7 +1218,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file, if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { - make_bad_inode(inode); + fuse_make_bad(inode); return -EIO; } @@ -1485,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission - * modell. + * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this @@ -1630,7 +1638,30 @@ out_err: static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c007b0f0c3a7..a56e7bffd000 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -20,6 +20,7 @@ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/splice.h> +#include <linux/task_io_accounting_ops.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); - mutex_destroy(&ff->readdir.lock); + kfree(ff->args); kfree(ff); } @@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { - struct fuse_open_out outarg; + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); @@ -274,10 +289,13 @@ out_inode_unlock: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; + + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -634,7 +653,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, for (i = 0; i < ap->num_pages; i++) { if (should_dirty) set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + if (ap->args.is_pinned) + unpin_user_page(ap->pages[i]); } } @@ -925,7 +945,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, put_page(page); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1299,13 +1319,93 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return false; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. + */ + if (fuse_io_past_eof(iocb, from) || + fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_file_uncached_io_end(inode, ff); + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; - ssize_t err; + ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -1327,10 +1427,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; + task_io_account_write(count); + err = file_remove_privs(file); if (err) goto out; @@ -1392,10 +1494,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + struct page **pt_pages; + + pt_pages = &ap->pages[ap->num_pages]; + ret = iov_iter_extract_pages(ii, &pt_pages, + *nbytesp - nbytes, + max_pages - ap->num_pages, + 0, &start); if (ret < 0) break; @@ -1412,6 +1517,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; @@ -1558,51 +1664,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. - */ - if (fuse_direct_write_extending_i_size(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { + task_io_account_write(res); if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { res = fuse_direct_IO(iocb, from); } else { @@ -1611,10 +1683,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -1631,10 +1700,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1649,10 +1721,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else return fuse_cache_write_iter(iocb, from); +} + +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); else - return fuse_direct_write_iter(iocb, from); + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) @@ -1667,7 +1767,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) __free_page(ap->pages[i]); if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1909,7 +2009,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -1947,26 +2047,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } -static int fuse_writepage_locked(struct page *page) +static int fuse_writepage_locked(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; + struct folio *tmp_folio; int error = -ENOMEM; - set_page_writeback(page); + folio_start_writeback(folio); wpa = fuse_writepage_args_alloc(); if (!wpa) goto err; ap = &wpa->ia.ap; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto err_free; error = -EIO; @@ -1975,21 +2075,21 @@ static int fuse_writepage_locked(struct page *page) goto err_nofile; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0); - copy_highpage(tmp_page, page); + folio_copy(tmp_folio, folio); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = tmp_page; + ap->pages[0] = &tmp_folio->page; ap->descs[0].offset = 0; ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -1997,48 +2097,20 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); + folio_end_writeback(folio); return 0; err_nofile: - __free_page(tmp_page); + folio_put(tmp_folio); err_free: kfree(wpa); err: - mapping_set_error(page->mapping, error); - end_page_writeback(page); + mapping_set_error(folio->mapping, error); + folio_end_writeback(folio); return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; @@ -2307,7 +2379,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: @@ -2401,7 +2473,7 @@ static int fuse_launder_folio(struct folio *folio) /* Serialize with pending writeback for the same page */ fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(&folio->page); + err = fuse_writepage_locked(folio); if (!err) fuse_wait_on_page_writeback(inode, folio->index); } @@ -2462,13 +2534,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED + /* + * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) @@ -2476,7 +2565,19 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) invalidate_inode_pages2(file->f_mapping); - return generic_file_mmap(file, vma); + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + */ + rc = fuse_file_cached_io_start(inode, ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -2580,10 +2681,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); @@ -3213,8 +3310,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3225,10 +3322,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, @@ -3245,7 +3342,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bcbe34488862..b24084b60864 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -76,6 +76,16 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** Container for data related to mapping to backing file */ +struct fuse_backing { + struct file *file; + struct cred *cred; + + /** refcount */ + refcount_t count; + struct rcu_head rcu; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -111,7 +121,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -123,9 +133,15 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -173,6 +189,10 @@ struct fuse_inode { #endif /** Submount specific lookup tracking */ struct fuse_submount_lookup *submount_lookup; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct fuse_backing *fb; +#endif }; /** FUSE inode state bits */ @@ -187,19 +207,21 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; struct fuse_mount; -struct fuse_release_args; +union fuse_file_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_mount *fm; - /* Argument space reserved for release */ - struct fuse_release_args *release_args; + /* Argument space reserved for open/release */ + union fuse_file_args *args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -221,12 +243,6 @@ struct fuse_file { /* Readdir related */ struct { - /* - * Protects below fields against (crazy) parallel readdir on - * same open file. Uncontended in the normal case. - */ - struct mutex lock; - /* Dir stream position */ loff_t pos; @@ -244,6 +260,15 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct file *passthrough; + const struct cred *cred; +#endif + /** Has flock been performed on this file? */ bool flock:1; }; @@ -283,6 +308,7 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; + bool is_pinned:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); @@ -295,6 +321,19 @@ struct fuse_args_pages { unsigned int num_pages; }; +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +union fuse_file_args { + /* Used during open() */ + struct fuse_open_out open_outarg; + /* Used during release() */ + struct fuse_release_args release_args; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -818,6 +857,12 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /** Passthrough support for read/write IO */ + unsigned int passthrough:1; + + /** Maximum stack depth for passthrough backing files */ + int max_stack_depth; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -867,6 +912,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** IDR for backing files ids */ + struct idr backing_files_map; +#endif }; /* @@ -940,7 +990,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation, static inline void fuse_make_bad(struct inode *inode) { - remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } @@ -1032,14 +1081,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); @@ -1349,11 +1393,82 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); + +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +#ifdef CONFIG_FUSE_PASSTHROUGH +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); +void fuse_backing_put(struct fuse_backing *fb); +#else + +static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + return NULL; +} + +static inline void fuse_backing_put(struct fuse_backing *fb) +{ +} +#endif + +void fuse_backing_files_init(struct fuse_conn *fc); +void fuse_backing_files_free(struct fuse_conn *fc); +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); +int fuse_backing_close(struct fuse_conn *fc, int backing_id); + +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id); +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); + +static inline struct file *fuse_file_passthrough(struct fuse_file *ff) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return ff->passthrough; +#else + return NULL; +#endif +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 516ea2979a90..3a5d88878335 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + return &fi->inode; out_free_forget: @@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode) #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + kmem_cache_free(fuse_inode_cachep, fi); } @@ -469,8 +475,11 @@ retry: } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); - iput(inode); - goto retry; + if (inode != d_inode(sb->s_root)) { + remove_inode_hash(inode); + iput(inode); + goto retry; + } } fi = get_fuse_inode(inode); spin_lock(&fi->lock); @@ -924,6 +933,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; @@ -954,6 +966,8 @@ void fuse_conn_put(struct fuse_conn *fc) WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); call_rcu(&fc->rcu, delayed_release); } } @@ -974,7 +988,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); } struct fuse_inode_handle { @@ -1117,6 +1131,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -1291,6 +1310,26 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1337,7 +1376,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1346,6 +1386,8 @@ void fuse_send_init(struct fuse_mount *fm) #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; @@ -1496,8 +1538,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, + .uid = __kuid_val(fi->inode.i_uid), + .gid = __kgid_val(fi->inode.i_gid), .rdev = fi->inode.i_rdev, .blksize = 1u << fi->inode.i_blkbits, }; @@ -1534,6 +1576,7 @@ static int fuse_fill_super_submount(struct super_block *sb, sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 000000000000..c653ddcf0578 --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> + +/* + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); +} + +/* + * Start cached io mode. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. + */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* There are no io modes if server does not implement open */ + if (!ff->args) + return 0; + + spin_lock(&fi->lock); + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); + } + + /* + * Check if inode entered passthrough io mode while waiting for parallel + * dio write completion. + */ + if (fuse_inode_backing(fi)) { + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + return -ETXTBSY; + } + + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } + spin_unlock(&fi->lock); + return 0; +} + +static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb; + int err = 0; + + spin_lock(&fi->lock); + /* deny conflicting backing files on same fuse inode */ + oldfb = fuse_inode_backing(fi); + if (oldfb && oldfb != fb) { + err = -EBUSY; + goto unlock; + } + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode != IOM_NONE); + fi->iocachectr--; + ff->iomode = IOM_UNCACHED; + + /* fuse inode holds a single refcount of backing file */ + if (!oldfb) { + oldfb = fuse_inode_backing_set(fi, fb); + WARN_ON_ONCE(oldfb != NULL); + } else { + fuse_backing_put(fb); + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb = NULL; + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fi->iocachectr++; + if (!fi->iocachectr) { + wake_up(&fi->direct_io_waitq); + oldfb = fuse_inode_backing_set(fi, NULL); + } + spin_unlock(&fi->lock); + if (oldfb) + fuse_backing_put(oldfb); +} + +/* + * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. + * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write + * operations go directly to the server, but mmap is done on the backing file. + * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode + * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. + */ +#define FOPEN_PASSTHROUGH_MASK \ + (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ + FOPEN_NOFLUSH) + +static int fuse_file_passthrough_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_backing *fb; + int err; + + /* Check allowed conditions for file open in passthrough mode */ + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || + (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) + return -EINVAL; + + fb = fuse_passthrough_open(file, inode, + ff->args->open_outarg.backing_id); + if (IS_ERR(fb)) + return PTR_ERR(fb); + + /* First passthrough file open denies caching inode io mode */ + err = fuse_file_uncached_io_start(inode, ff, fb); + if (!err) + return 0; + + fuse_passthrough_release(ff, fb); + fuse_backing_put(fb); + + return err; +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->args) + return 0; + + /* + * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode + * which is already open for passthrough. + */ + err = -EINVAL; + if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) + goto fail; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First passthrough file open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if ((ff->open_flags & FOPEN_DIRECT_IO) && + !(ff->open_flags & FOPEN_PASSTHROUGH)) + return 0; + + if (ff->open_flags & FOPEN_PASSTHROUGH) + err = fuse_file_passthrough_open(inode, file); + else + err = fuse_file_cached_io_start(inode, ff); + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + /* + * Last parallel dio close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_end(inode, ff); + break; + case IOM_CACHED: + fuse_file_cached_io_end(inode, ff); + break; + } +} diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 000000000000..1567f0323858 --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/file.h> +#include <linux/backing-file.h> +#include <linux/splice.h> + +static void fuse_file_accessed(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_atime(inode); +} + +static void fuse_file_modified(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + inode_lock(inode); + ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = in, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + return backing_file_splice_read(backing_file, ppos, pipe, len, flags, + &ctx); +} + +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct inode *inode = file_inode(out); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = out, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + inode_lock(inode); + ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__, + backing_file, vma->vm_start, vma->vm_end); + + return backing_file_mmap(backing_file, vma, &ctx); +} + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags) + goto out; + + file = fget(map->fd); + res = -EBADF; + if (!file) + goto out; + + res = -EOPNOTSUPP; + if (!file->f_op->read_iter || !file->f_op->write_iter) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +/* + * Setup passthrough to a backing file. + * + * Returns an fb object with elevated refcount to be stored in fuse inode. + */ +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_backing *fb = NULL; + struct file *backing_file; + int err; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + err = -ENOENT; + if (!fb) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ + backing_file = backing_file_open(&file->f_path, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { + fuse_backing_put(fb); + goto out; + } + + err = 0; + ff->passthrough = backing_file; + ff->cred = get_cred(fb->cred); +out: + pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__, + backing_id, fb, ff->passthrough, err); + + return err ? ERR_PTR(err) : fb; +} + +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__, + fb, ff->passthrough); + + fput(ff->passthrough); + ff->passthrough = NULL; + put_cred(ff->cred); + ff->cred = NULL; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c66a54d6c7d3..0377b6dc24c8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) if (fuse_is_bad(inode)) return -EIO; - mutex_lock(&ff->readdir.lock); - err = UNCACHED; if (ff->open_flags & FOPEN_CACHE_DIR) err = fuse_readdir_cached(file, ctx); if (err == UNCACHED) err = fuse_readdir_uncached(file, ctx); - mutex_unlock(&ff->readdir.lock); - return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 5f1be1da92ce..322af827a232 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -16,6 +16,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/cleanup.h> #include <linux/uio.h> #include "fuse_i.h" @@ -31,6 +32,9 @@ static DEFINE_MUTEX(virtio_fs_mutex); static LIST_HEAD(virtio_fs_instances); +/* The /sys/fs/virtio_fs/ kset */ +static struct kset *virtio_fs_kset; + enum { VQ_HIPRIO, VQ_REQUEST @@ -55,7 +59,7 @@ struct virtio_fs_vq { /* A virtio-fs device instance */ struct virtio_fs { - struct kref refcount; + struct kobject kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -161,18 +165,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) complete(&fsvq->in_flight_zero); } -static void release_virtio_fs_obj(struct kref *ref) +static ssize_t tag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + return sysfs_emit(buf, fs->tag); +} + +static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); + +static struct attribute *virtio_fs_attrs[] = { + &virtio_fs_tag_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(virtio_fs); + +static void virtio_fs_ktype_release(struct kobject *kobj) { - struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); kfree(vfs->vqs); kfree(vfs); } +static const struct kobj_type virtio_fs_ktype = { + .release = virtio_fs_ktype_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = virtio_fs_groups, +}; + /* Make sure virtiofs_mutex is held */ static void virtio_fs_put(struct virtio_fs *fs) { - kref_put(&fs->refcount, release_virtio_fs_obj); + kobject_put(&fs->kobj); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) @@ -243,25 +269,46 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } /* Add a new instance to the list or return -EEXIST if tag name exists*/ -static int virtio_fs_add_instance(struct virtio_fs *fs) +static int virtio_fs_add_instance(struct virtio_device *vdev, + struct virtio_fs *fs) { struct virtio_fs *fs2; - bool duplicate = false; + int ret; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs2, &virtio_fs_instances, list) { - if (strcmp(fs->tag, fs2->tag) == 0) - duplicate = true; + if (strcmp(fs->tag, fs2->tag) == 0) { + mutex_unlock(&virtio_fs_mutex); + return -EEXIST; + } } - if (!duplicate) - list_add_tail(&fs->list, &virtio_fs_instances); + /* Use the virtio_device's index as a unique identifier, there is no + * need to allocate our own identifiers because the virtio_fs instance + * is only visible to userspace as long as the underlying virtio_device + * exists. + */ + fs->kobj.kset = virtio_fs_kset; + ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); + if (ret < 0) { + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); + if (ret < 0) { + kobject_del(&fs->kobj); + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + list_add_tail(&fs->list, &virtio_fs_instances); mutex_unlock(&virtio_fs_mutex); - if (duplicate) - return -EEXIST; + kobject_uevent(&fs->kobj, KOBJ_ADD); + return 0; } @@ -274,7 +321,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag) list_for_each_entry(fs, &virtio_fs_instances, list) { if (strcmp(fs->tag, tag) == 0) { - kref_get(&fs->refcount); + kobject_get(&fs->kobj); goto found; } } @@ -323,6 +370,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) return -ENOMEM; memcpy(fs->tag, tag_buf, len); fs->tag[len] = '\0'; + + /* While the VIRTIO specification allows any character, newlines are + * awkward on mount(8) command-lines and cause problems in the sysfs + * "tag" attr and uevent TAG= properties. Forbid them. + */ + if (strchr(fs->tag, '\n')) { + dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n"); + return -EINVAL; + } + return 0; } @@ -345,7 +402,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) kfree(req); dec_in_flight_req(fsvq); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); } @@ -627,7 +684,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); /* End requests */ @@ -795,8 +852,11 @@ static void virtio_fs_cleanup_dax(void *data) put_dax(dax_dev); } +DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) + static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) { + struct dax_device *dax_dev __free(cleanup_dax) = NULL; struct virtio_shm_region cache_reg; struct dev_pagemap *pgmap; bool have_cache; @@ -804,6 +864,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) if (!IS_ENABLED(CONFIG_FUSE_DAX)) return 0; + dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(dax_dev)) { + int rc = PTR_ERR(dax_dev); + return rc == -EOPNOTSUPP ? 0 : rc; + } + /* Get cache region */ have_cache = virtio_get_shm_region(vdev, &cache_reg, (u8)VIRTIO_FS_SHMCAP_ID_CACHE); @@ -849,10 +915,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); - if (IS_ERR(fs->dax_dev)) - return PTR_ERR(fs->dax_dev); - + fs->dax_dev = no_free_ptr(dax_dev); return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs->dax_dev); } @@ -865,7 +928,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) fs = kzalloc(sizeof(*fs), GFP_KERNEL); if (!fs) return -ENOMEM; - kref_init(&fs->refcount); + kobject_init(&fs->kobj, &virtio_fs_ktype); vdev->priv = fs; ret = virtio_fs_read_tag(vdev, fs); @@ -887,7 +950,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) */ virtio_device_ready(vdev); - ret = virtio_fs_add_instance(fs); + ret = virtio_fs_add_instance(vdev, fs); if (ret < 0) goto out_vqs; @@ -896,11 +959,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); - kfree(fs->vqs); out: vdev->priv = NULL; - kfree(fs); + kobject_put(&fs->kobj); return ret; } @@ -924,6 +986,8 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + sysfs_remove_link(&fs->kobj, "device"); + kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); @@ -1510,21 +1574,56 @@ static struct file_system_type virtio_fs_type = { .kill_sb = virtio_kill_sb, }; +static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + add_uevent_var(env, "TAG=%s", fs->tag); + return 0; +} + +static const struct kset_uevent_ops virtio_fs_uevent_ops = { + .uevent = virtio_fs_uevent, +}; + +static int __init virtio_fs_sysfs_init(void) +{ + virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops, + fs_kobj); + if (!virtio_fs_kset) + return -ENOMEM; + return 0; +} + +static void virtio_fs_sysfs_exit(void) +{ + kset_unregister(virtio_fs_kset); + virtio_fs_kset = NULL; +} + static int __init virtio_fs_init(void) { int ret; - ret = register_virtio_driver(&virtio_fs_driver); + ret = virtio_fs_sysfs_init(); if (ret < 0) return ret; + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + goto sysfs_exit; + ret = register_filesystem(&virtio_fs_type); - if (ret < 0) { - unregister_virtio_driver(&virtio_fs_driver); - return ret; - } + if (ret < 0) + goto unregister_virtio_driver; return 0; + +unregister_virtio_driver: + unregister_virtio_driver(&virtio_fs_driver); +sysfs_exit: + virtio_fs_sysfs_exit(); + return ret; } module_init(virtio_fs_init); @@ -1532,6 +1631,7 @@ static void __exit virtio_fs_exit(void) { unregister_filesystem(&virtio_fs_type); unregister_virtio_driver(&virtio_fs_driver); + virtio_fs_sysfs_exit(); } module_exit(virtio_fs_exit); diff --git a/fs/inode.c b/fs/inode.c index d290f007b3d1..3a41f83a4ba5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2033,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -static int __file_remove_privs(struct file *file, unsigned int flags) +int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2058,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } +EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2070,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) */ int file_remove_privs(struct file *file) { - return __file_remove_privs(file, 0); + return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); @@ -2163,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags) * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - ret = __file_remove_privs(file, flags); + ret = file_remove_privs_flags(file, flags); if (ret) return ret; diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index ad572f7ee897..43a651ed8264 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -83,8 +83,10 @@ static int fscache_begin_operation(struct netfs_cache_resources *cres, cres->debug_id = cookie->debug_id; cres->inval_counter = cookie->inval_counter; - if (!fscache_begin_cookie_access(cookie, why)) + if (!fscache_begin_cookie_access(cookie, why)) { + cres->cache_priv = NULL; return -ENOBUFS; + } again: spin_lock(&cookie->lock); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index fbdc9ca80f71..de77848ae654 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -73,14 +73,9 @@ const struct rpc_program nfs_program = { .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, - .stats = &nfs_rpcstat, .pipe_dir_name = NFS_PIPE_DIRNAME, }; -struct rpc_stat nfs_rpcstat = { - .program = &nfs_program -}; - static struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; @@ -502,6 +497,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, @@ -513,6 +509,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, + .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, @@ -1182,6 +1179,8 @@ void nfs_clients_init(struct net *net) #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); + memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); + nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d4a42ce0c7e3..6bace5fece04 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -181,7 +181,6 @@ static int nfs_delegation_claim_opens(struct inode *inode, struct nfs_open_context *ctx; struct nfs4_state_owner *sp; struct nfs4_state *state; - unsigned int seq; int err; again: @@ -202,12 +201,9 @@ again: sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) err = nfs_delegation_claim_locks(state, stateid); - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7af5d270de28..bb2f583eb28b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -606,6 +606,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) trace_nfs_direct_commit_complete(dreq); + spin_lock(&dreq->lock); if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; @@ -613,6 +614,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) } else { status = dreq->error; } + spin_unlock(&dreq->lock); nfs_init_cinfo_from_dreq(&cinfo, dreq); @@ -625,7 +627,10 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) spin_unlock(&dreq->lock); nfs_release_request(req); } else if (!nfs_write_match_verf(verf, req)) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); /* * Despite the reboot, the write was successful, * so reset wb_nio. @@ -667,10 +672,17 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) LIST_HEAD(mds_list); nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_commit_begin(cinfo.mds); nfs_scan_commit(dreq->inode, &mds_list, &cinfo); res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); - if (res < 0) /* res == -ENOMEM */ - nfs_direct_write_reschedule(dreq); + if (res < 0) { /* res == -ENOMEM */ + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + } + if (nfs_commit_end(cinfo.mds)) + nfs_direct_write_complete(dreq); } static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index acf4b88889dc..4fa304fa5bc4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -35,6 +35,7 @@ #include "../internal.h" #include "../nfs4session.h" #include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -172,6 +173,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; + trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); /* If DS was already in cache, free ds addrs */ while (!list_empty(&dsaddrs)) { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index ef817a0475ff..3e724cb7ef01 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2016,7 +2016,7 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; mirror_ds = mirror->mirror_ds; - if (!mirror_ds) + if (IS_ERR_OR_NULL(mirror_ds)) continue; ds = mirror->mirror_ds->ds; if (!ds) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 853e8d609bb3..d0a0956f8a13 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -652,6 +652,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->fscache_uniq = NULL; break; case Opt_fscache: + trace_nfs_mount_assign(param->key, param->string); ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = param->string; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 2d1bfee225c3..ddc1ee031955 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -301,11 +301,11 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) struct inode *inode = sreq->rreq->inode; struct nfs_open_context *ctx = sreq->rreq->netfs_priv; struct page *page; + unsigned long idx; int err; pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; pgoff_t last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); - XA_STATE(xas, &sreq->rreq->mapping->i_pages, start); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); @@ -316,19 +316,14 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) pgio.pg_netfs = netfs; /* used in completion */ - xas_lock(&xas); - xas_for_each(&xas, page, last) { + xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) { /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */ - xas_pause(&xas); - xas_unlock(&xas); err = nfs_read_add_folio(&pgio, ctx, page_folio(page)); if (err < 0) { netfs->error = err; goto out; } - xas_lock(&xas); } - xas_unlock(&xas); out: nfs_pageio_complete_read(&pgio); nfs_netfs_put(netfs); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 93ea49a7eb61..c709c296ea9a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2426,12 +2426,16 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + nfs_clients_init(net); + rpc_proc_register(net, &nn->rpcstats); return nfs_fs_proc_net_init(net); } static void nfs_net_exit(struct net *net) { + rpc_proc_unregister(net, "nfs"); nfs_fs_proc_net_exit(net); nfs_clients_exit(net); } @@ -2486,15 +2490,12 @@ static int __init init_nfs_fs(void) if (err) goto out1; - rpc_proc_register(&init_net, &nfs_rpcstat); - err = register_nfs_fs(); if (err) goto out0; return 0; out0: - rpc_proc_unregister(&init_net, "nfs"); nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2524,7 +2525,6 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); unregister_pernet_subsys(&nfs_net_ops); - rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e3722ce6722e..06253695fe53 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -449,8 +449,6 @@ int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); -extern struct rpc_stat nfs_rpcstat; - extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index c8374f74dce1..a68b21603ea9 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -9,6 +9,7 @@ #include <linux/nfs4.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/sunrpc/stats.h> struct bl_dev_msg { int32_t status; @@ -34,6 +35,7 @@ struct nfs_net { struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; + struct rpc_stat rpcstats; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 674c012868b1..b0c8a39c2bbd 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -111,6 +111,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b59876b01a1e..0282d93c8bcc 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -55,11 +55,14 @@ int nfs42_proc_removexattr(struct inode *inode, const char *name); * They would be 7 bytes long in the eventual buffer ("user.x\0"), and * 8 bytes long XDR-encoded. * - * Include the trailing eof word as well. + * Include the trailing eof word as well and make the result a multiple + * of 4 bytes. */ static inline u32 nfs42_listxattr_xdrsize(u32 buflen) { - return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4; + u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4; + + return (size + 3) & ~3; } #endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 6ff41ceb9f1c..7024230f0d1d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -120,7 +120,6 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; - seqcount_spinlock_t so_reclaim_seqcount; struct mutex so_delegreturn_mutex; }; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 11e3a285594c..84573df5cf5a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -924,6 +924,7 @@ static int nfs4_set_client(struct nfs_server *server, else cl_init.max_connect = max_connect; switch (proto) { + case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: cl_init.nconnect = nconnect; @@ -1000,6 +1001,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 815996cb27fc..ea390db94b62 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3069,10 +3069,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); struct inode *dir = d_inode(opendata->dir); unsigned long dir_verifier; - unsigned int seq; int ret; - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); dir_verifier = nfs_save_change_attribute(dir); ret = _nfs4_proc_open(opendata, ctx); @@ -3125,11 +3123,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - if (d_inode(dentry) == state->inode) { + if (d_inode(dentry) == state->inode) nfs_inode_attach_open_context(ctx); - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - nfs4_schedule_stateid_recovery(server, state); - } out: if (!opendata->cancelled) { @@ -8973,10 +8968,12 @@ try_again: return; status = task->tk_status; - if (status == 0) + if (status == 0) { status = nfs4_detect_session_trunking(adata->clp, task->tk_msg.rpc_resp, xprt); - + trace_nfs4_trunked_exchange_id(adata->clp, + xprt->address_strings[RPC_DISPLAY_ADDR], status); + } if (status == 0) rpc_clnt_xprt_switch_add_xprt(clnt, xprt); else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt, @@ -10618,29 +10615,33 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { ssize_t error, error2, error3; + size_t left = size; - error = generic_listxattr(dentry, list, size); + error = generic_listxattr(dentry, list, left); if (error < 0) return error; if (list) { list += error; - size -= error; + left -= error; } - error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left); if (error2 < 0) return error2; if (list) { list += error2; - size -= error2; + left -= error2; } - error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size); + error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; - return error + error2 + error3; + error += error2 + error3; + if (size && error > size) + return -ERANGE; + return error; } static void nfs4_enable_swap(struct inode *inode) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8cfabdbda336..662e86ea3a2d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -513,7 +513,6 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); - seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock); mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -1667,7 +1666,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, * server that doesn't support a grace period. */ spin_lock(&sp->so_lock); - raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1735,7 +1733,6 @@ restart: spin_lock(&sp->so_lock); goto restart; } - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); #ifdef CONFIG_NFS_V4_2 if (found_ssc_copy_state) @@ -1745,7 +1742,6 @@ restart: out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } @@ -1928,9 +1924,12 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct nfs_server *server; struct rb_node *pos; LIST_HEAD(freeme); - int status = 0; int lost_locks = 0; + int status; + status = nfs4_begin_drain_session(clp); + if (status < 0) + return status; restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { @@ -2694,6 +2693,9 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Detect expired delegations... */ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { section = "detect expired delegations"; + status = nfs4_begin_drain_session(clp); + if (status < 0) + goto out_error; nfs_reap_expired_delegations(clp); continue; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index d09bcfd7db89..8da5a9c000f4 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -145,6 +145,7 @@ static int do_nfs4_mount(struct nfs_server *server, const char *export_path) { struct nfs_fs_context *root_ctx; + struct nfs_fs_context *ctx; struct fs_context *root_fc; struct vfsmount *root_mnt; struct dentry *dentry; @@ -157,6 +158,12 @@ static int do_nfs4_mount(struct nfs_server *server, .dirfd = -1, }; + struct fs_parameter param_fsc = { + .key = "fsc", + .type = fs_value_is_string, + .dirfd = -1, + }; + if (IS_ERR(server)) return PTR_ERR(server); @@ -168,9 +175,26 @@ static int do_nfs4_mount(struct nfs_server *server, kfree(root_fc->source); root_fc->source = NULL; + ctx = nfs_fc2context(fc); root_ctx = nfs_fc2context(root_fc); root_ctx->internal = true; root_ctx->server = server; + + if (ctx->fscache_uniq) { + len = strlen(ctx->fscache_uniq); + param_fsc.size = len; + param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL); + if (param_fsc.string == NULL) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_param(root_fc, ¶m_fsc); + kfree(param_fsc.string); + if (ret < 0) { + put_fs_context(root_fc); + return ret; + } + } /* We leave export_path unset as it's not used to find the root. */ len = strlen(hostname) + 5; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d9ac556bebcf..d22c6670f770 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -28,4 +28,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error); + +EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo); #endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index fd7cb15b08b2..10985a4b8259 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -77,6 +77,36 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); +TRACE_EVENT(nfs4_trunked_exchange_id, + TP_PROTO( + const struct nfs_client *clp, + const char *addr, + int error + ), + + TP_ARGS(clp, addr, error), + + TP_STRUCT__entry( + __string(main_addr, clp->cl_hostname) + __string(trunk_addr, addr) + __field(unsigned long, error) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __assign_str(main_addr, clp->cl_hostname); + __assign_str(trunk_addr, addr); + ), + + TP_printk( + "error=%ld (%s) main_addr=%s trunk_addr=%s", + -__entry->error, + show_nfs4_status(__entry->error), + __get_str(main_addr), + __get_str(trunk_addr) + ) +); + TRACE_EVENT(nfs4_sequence_done, TP_PROTO( const struct nfs4_session *session, @@ -1991,6 +2021,34 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status, DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo); DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid); +TRACE_EVENT(fl_getdevinfo, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + char *ds_remotestr + ), + TP_ARGS(server, deviceid, ds_remotestr), + + TP_STRUCT__entry( + __string(mds_addr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + __string(ds_ips, ds_remotestr) + ), + + TP_fast_assign( + __assign_str(mds_addr, server->nfs_client->cl_hostname); + __assign_str(ds_ips, ds_remotestr); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + TP_printk( + "deviceid=%s, mds_addr=%s, ds_ips=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(mds_addr), + __get_str(ds_ips) + ) +); + DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( const struct nfs_pgio_header *hdr diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 7600100ba26f..432612d22437 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -175,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src, size_t len = strlen(dest); if (len && dest[len - 1] != ',') - if (strlcat(dest, ",", destlen) > destlen) + if (strlcat(dest, ",", destlen) >= destlen) return -1; - if (strlcat(dest, src, destlen) > destlen) + if (strlcat(dest, src, destlen) >= destlen) return -1; return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0c0fed1ecd0b..a5cc6199127f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1999,6 +1999,14 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + lseg = ERR_PTR(-EIO); + goto out; + } + lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp)); if (IS_ERR(lseg)) goto out; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index afd23910f3bf..88e061bd711b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -919,6 +919,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { + char servername[48]; + dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); @@ -929,6 +931,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .xprtsec = clp->cl_xprtsec, }; struct nfs4_add_xprt_data xprtdata = { .clp = clp, @@ -938,10 +941,45 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto) + if (da->da_transport != clp->cl_proto && + clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) continue; + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == + XPRT_TRANSPORT_TCP_TLS) { + struct sockaddr *addr = + (struct sockaddr *)&da->da_addr; + struct sockaddr_in *sin = + (struct sockaddr_in *)&da->da_addr; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&da->da_addr; + + /* for NFS with TLS we need to supply a correct + * servername of the trunked transport, not the + * servername of the main transport stored in + * clp->cl_hostname. And set the protocol to + * indicate to use TLS + */ + servername[0] = '\0'; + switch(addr->sa_family) { + case AF_INET: + snprintf(servername, sizeof(servername), + "%pI4", &sin->sin_addr.s_addr); + break; + case AF_INET6: + snprintf(servername, sizeof(servername), + "%pI6", &sin6->sin6_addr); + break; + default: + /* do not consider this address */ + continue; + } + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + xprt_args.servername = servername; + } if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; + /** * Test this address for session trunking and * add as an alias @@ -953,6 +991,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (xprtdata.cred) put_cred(xprtdata.cred); } else { + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == + XPRT_TRANSPORT_TCP_TLS) + da->da_transport = XPRT_TRANSPORT_TCP_TLS; clp = nfs4_set_ds_client(mds_srv, &da->da_addr, da->da_addrlen, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 7dc21a48e3e7..a142287d86f6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -305,6 +305,8 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len); if (IS_ERR(new)) { error = PTR_ERR(new); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); goto out; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 075b31c93f87..dc03f98f7616 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -516,8 +516,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else nfs_show_nfsv4_options(m, nfss, showdefaults); - if (nfss->options & NFS_OPTION_FSCACHE) + if (nfss->options & NFS_OPTION_FSCACHE) { +#ifdef CONFIG_NFS_FSCACHE + if (nfss->fscache_uniq) + seq_printf(m, ",fsc=%s", nfss->fscache_uniq); + else + seq_puts(m, ",fsc"); +#else seq_puts(m, ",fsc"); +#endif + } if (nfss->options & NFS_OPTION_MIGRATION) seq_puts(m, ",migration"); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 84bb85264572..5de85d725fb9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -667,10 +667,6 @@ static int nfs_writepage_locked(struct folio *folio, struct inode *inode = folio_file_mapping(folio)->host; int err; - if (wbc->sync_mode == WB_SYNC_NONE && - NFS_SERVER(inode)->write_congested) - return AOP_WRITEPAGE_ACTIVATE; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); @@ -1650,7 +1646,7 @@ static int wait_on_commit(struct nfs_mds_commit_info *cinfo) !atomic_read(&cinfo->rpcs_out)); } -static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) { atomic_inc(&cinfo->rpcs_out); } diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 7342de296ec3..89caef7513db 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; - desc_kaddr = kmap(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); - for (j = 0; j < n; j++, desc++, group++) { + for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); - if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) { - ret = nilfs_palloc_get_bitmap_block( - inode, group, 1, &bitmap_bh); - if (ret < 0) - goto out_desc; - bitmap_kaddr = kmap(bitmap_bh->b_page); - bitmap = bitmap_kaddr + bh_offset(bitmap_bh); - pos = nilfs_palloc_find_available_slot( - bitmap, group_offset, - entries_per_group, lock); - if (pos >= 0) { - /* found a free entry */ - nilfs_palloc_group_desc_add_entries( - desc, lock, -1); - req->pr_entry_nr = - entries_per_group * group + pos; - kunmap(desc_bh->b_page); - kunmap(bitmap_bh->b_page); - - req->pr_desc_bh = desc_bh; - req->pr_bitmap_bh = bitmap_bh; - return 0; - } - kunmap(bitmap_bh->b_page); - brelse(bitmap_bh); + if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) + continue; + + kunmap_local(desc_kaddr); + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, + &bitmap_bh); + if (unlikely(ret < 0)) { + brelse(desc_bh); + return ret; } - group_offset = 0; + desc_kaddr = kmap_local_page(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); + bitmap = bitmap_kaddr + bh_offset(bitmap_bh); + pos = nilfs_palloc_find_available_slot( + bitmap, group_offset, entries_per_group, lock); + kunmap_local(bitmap_kaddr); + if (pos >= 0) + goto found; + + brelse(bitmap_bh); } - kunmap(desc_bh->b_page); + kunmap_local(desc_kaddr); brelse(desc_bh); } /* no entries left */ return -ENOSPC; - out_desc: - kunmap(desc_bh->b_page); - brelse(desc_bh); - return ret; +found: + /* found a free entry */ + nilfs_palloc_group_desc_add_entries(desc, lock, -1); + req->pr_entry_nr = entries_per_group * group + pos; + kunmap_local(desc_kaddr); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; } /** @@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + kunmap_local(desc_kaddr); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); @@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); - desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); - bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, else nilfs_palloc_group_desc_add_entries(desc, lock, 1); - kunmap(req->pr_bitmap_bh->b_page); - kunmap(req->pr_desc_bh->b_page); + kunmap_local(bitmap_kaddr); + kunmap_local(desc_kaddr); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); @@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); @@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) entry_start = rounddown(group_offset, epb); } while (true); - kunmap(bitmap_bh->b_page); + kunmap_local(bitmap_kaddr); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); @@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) inode->i_ino); } - desc_kaddr = kmap_atomic(desc_bh->b_page); + desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); - kunmap_atomic(desc_kaddr); + kunmap_local(desc_kaddr); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 7a8f166f2c8d..383f0afa2cea 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) */ void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) { - down_write(&bmap->b_sem); memcpy(raw_inode->i_bmap, bmap->b_u.u_data, NILFS_INODE_BMAP_SIZE * sizeof(__le64)); if (bmap->b_inode->i_ino == NILFS_DAT_INO) bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; - - up_write(&bmap->b_sem); } void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 13592e82eaf6..65659fa0372e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -724,7 +724,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, dat = nilfs_bmap_get_dat(btree); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) - goto out; + goto dat_error; ptr = blocknr; } cnt = 1; @@ -743,7 +743,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) - goto out; + goto dat_error; ptr2 = blocknr; } if (ptr2 != ptr + cnt || ++cnt == maxblocks) @@ -781,6 +781,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, out: nilfs_btree_free_path(path); return ret; + + dat_error: + if (ret == -ENOENT) + ret = -EINVAL; /* Notify bmap layer of metadata corruption */ + goto out; } static void nilfs_btree_promote_key(struct nilfs_bmap *btree, diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 39136637f715..69a5cced1e84 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -28,7 +28,7 @@ nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; - do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + tcno = div64_ul(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); return (unsigned long)tcno; } @@ -187,35 +187,90 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, } /** - * nilfs_cpfile_get_checkpoint - get a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @create: create flag - * @cpp: pointer to a checkpoint - * @bhp: pointer to a buffer head - * - * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint - * specified by @cno. A new checkpoint will be created if @cno is the current - * checkpoint number and @create is nonzero. - * - * Return Value: On success, 0 is returned, and the checkpoint and the - * buffer head of the buffer on which the checkpoint is located are stored in - * the place pointed by @cpp and @bhp, respectively. On error, one of the - * following negative error codes is returned. + * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint entry to read + * @root: nilfs root object + * @ifile: ifile's inode to read and attach to @root * - * %-EIO - I/O error. + * This function imports checkpoint information from the checkpoint file and + * stores it to the inode file given by @ifile and the nilfs root object + * given by @root. * - * %-ENOMEM - Insufficient amount of memory available. + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + */ +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile) +{ + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno < 1 || cno > nilfs_mdt_cno(cpfile)) + return -EINVAL; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = -EINVAL; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -EINVAL; + goto put_cp; + } + + ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode); + if (unlikely(ret)) { + /* + * Since this inode is on a checkpoint entry, treat errors + * as metadata corruption. + */ + nilfs_err(cpfile->i_sb, + "ifile inode (checkpoint number=%llu) corrupted", + (unsigned long long)cno); + ret = -EIO; + goto put_cp; + } + + /* Configure the nilfs root object */ + atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count)); + root->ifile = ifile; + +put_cp: + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile + * @cpfile: checkpoint file inode + * @cno: number of checkpoint to set up * - * %-ENOENT - No such checkpoint. + * This function creates a checkpoint with the number specified by @cno on + * cpfile. If the specified checkpoint entry already exists due to a past + * failure, it will be reused without returning an error. + * In either case, the buffer of the block containing the checkpoint entry + * and the cpfile inode are made dirty for inclusion in the write log. * - * %-EINVAL - invalid checkpoint. + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). + * * %-EROFS - Read only filesystem */ -int nilfs_cpfile_get_checkpoint(struct inode *cpfile, - __u64 cno, - int create, - struct nilfs_checkpoint **cpp, - struct buffer_head **bhp) +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; @@ -223,70 +278,128 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, void *kaddr; int ret; - if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || - (cno < nilfs_mdt_cno(cpfile) && create))) - return -EINVAL; + if (WARN_ON_ONCE(cno < 1)) + return -EIO; down_write(&NILFS_MDT(cpfile)->mi_sem); - ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); - if (ret < 0) + if (unlikely(ret < 0)) { + if (ret == -ENOENT) { + nilfs_error(cpfile->i_sb, + "checkpoint creation failed due to metadata corruption."); + ret = -EIO; + } goto out_sem; - ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); - if (ret < 0) + } + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh); + if (unlikely(ret < 0)) goto out_header; - kaddr = kmap(cp_bh->b_page); + + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { - if (!create) { - kunmap(cp_bh->b_page); - brelse(cp_bh); - ret = -ENOENT; - goto out_header; - } /* a newly-created checkpoint */ nilfs_checkpoint_clear_invalid(cp); if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, kaddr, 1); - mark_buffer_dirty(cp_bh); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, 1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); - nilfs_mdt_mark_dirty(cpfile); + } else { + kunmap_local(kaddr); } - if (cpp != NULL) - *cpp = cp; - *bhp = cp_bh; + /* Force the buffer and the inode to become dirty */ + mark_buffer_dirty(cp_bh); + brelse(cp_bh); + nilfs_mdt_mark_dirty(cpfile); - out_header: +out_header: brelse(header_bh); - out_sem: +out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** - * nilfs_cpfile_put_checkpoint - put a checkpoint - * @cpfile: inode of checkpoint file - * @cno: checkpoint number - * @bh: buffer head + * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile + * @cpfile: checkpoint file inode + * @cno: checkpoint number + * @root: nilfs root object + * @blkinc: number of blocks added by this checkpoint + * @ctime: checkpoint creation time + * @minor: minor checkpoint flag + * + * This function completes the checkpoint entry numbered by @cno in the + * cpfile with the data given by the arguments @root, @blkinc, @ctime, and + * @minor. * - * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint - * specified by @cno. @bh must be the buffer head which has been returned by - * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + * Return: 0 on success, or the following negative error code on failure. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). */ -void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, - struct buffer_head *bh) +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor) { - kunmap(bh->b_page); - brelse(bh); + struct buffer_head *cp_bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (WARN_ON_ONCE(cno < 1)) + return -EIO; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + goto error; + goto out_sem; + } + + kaddr = kmap_local_page(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (unlikely(nilfs_checkpoint_invalid(cp))) { + kunmap_local(kaddr); + brelse(cp_bh); + goto error; + } + + cp->cp_snapshot_list.ssl_next = 0; + cp->cp_snapshot_list.ssl_prev = 0; + cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count)); + cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count)); + cp->cp_nblk_inc = cpu_to_le64(blkinc); + cp->cp_create = cpu_to_le64(ctime); + cp->cp_cno = cpu_to_le64(cno); + + if (minor) + nilfs_checkpoint_set_minor(cp); + else + nilfs_checkpoint_clear_minor(cp); + + nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode); + nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode); + + kunmap_local(kaddr); + brelse(cp_bh); +out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; + +error: + nilfs_error(cpfile->i_sb, + "checkpoint finalization failed due to metadata corruption."); + ret = -EIO; + goto out_sem; } /** @@ -347,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; } - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint( cpfile, cno, cp_bh, kaddr); nicps = 0; @@ -369,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cp_bh, kaddr, nicps); if (count == 0) { /* make hole */ - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); ret = nilfs_cpfile_delete_checkpoint_block( @@ -384,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(cp_bh); } if (tnicps > 0) { - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } brelse(header_bh); @@ -447,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { @@ -457,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, n++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); } @@ -491,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (curr == 0) { ret = 0; @@ -512,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, ret = 0; /* No snapshots (started from a hole block) */ goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); while (n < nci) { cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ @@ -528,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); @@ -536,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, WARN_ON(ret == -ENOENT); goto out; } - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); } curr = next; curr_blkoff = next_blkoff; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); *cnop = curr; ret = n; @@ -650,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_cp; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); list = &header->ch_snapshot_list; curr_bh = header_bh; @@ -679,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; if (curr_blkoff != prev_blkoff) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); } curr_blkoff = prev_blkoff; cp = nilfs_cpfile_block_get_checkpoint( @@ -693,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, @@ -705,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(curr_bh->b_page); + kaddr = kmap_local_page(curr_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, curr, curr_bh, kaddr); list->ssl_prev = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(cno); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, 1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); @@ -768,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) @@ -808,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) get_bh(prev_bh); } - kaddr = kmap_atomic(next_bh->b_page); + kaddr = kmap_local_page(next_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, next, next_bh, kaddr); list->ssl_prev = cpu_to_le64(prev); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(prev_bh->b_page); + kaddr = kmap_local_page(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(next); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(cp_bh->b_page); + kaddr = kmap_local_page(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, -1); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); @@ -889,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out: @@ -972,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); out_sem: diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index edabb2dc5756..f5b1d59289eb 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -16,10 +16,12 @@ #include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */ -int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, - struct nilfs_checkpoint **, - struct buffer_head **); -void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, struct inode *ifile); +int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno); +int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno, + struct nilfs_root *root, __u64 blkinc, + time64_t ctime, bool minor); int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9cf6ba58f585..180fc8d36213 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MAX); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_palloc_commit_alloc_entry(dat, req); nilfs_dat_commit_entry(dat, req); @@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(NILFS_CNO_MIN); entry->de_end = cpu_to_le64(NILFS_CNO_MIN); entry->de_blocknr = cpu_to_le64(0); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); @@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, struct nilfs_dat_entry *entry; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_dat_commit_entry(dat, req); } @@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) if (ret < 0) return ret; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) { ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); end = start = le64_to_cpu(entry->de_start); @@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, } entry->de_end = cpu_to_le64(end); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (blocknr == 0) nilfs_dat_commit_free(dat, req); @@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) sector_t blocknr; void *kaddr; - kaddr = kmap_atomic(req->pr_entry_bh->b_page); + kaddr = kmap_local_page(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (start == nilfs_mdt_cno(dat) && blocknr == 0) nilfs_palloc_abort_free_entry(dat, req); @@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { nilfs_crit(dat->i_sb, @@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) __func__, (unsigned long long)vblocknr, (unsigned long long)le64_to_cpu(entry->de_start), (unsigned long long)le64_to_cpu(entry->de_end)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return -EINVAL; } WARN_ON(blocknr == 0); entry->de_blocknr = cpu_to_le64(blocknr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); @@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) } } - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); blocknr = le64_to_cpu(entry->de_blocknr); if (blocknr == 0) { @@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) *blocknrp = blocknr; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); return ret; } @@ -457,10 +457,10 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, 0, &entry_bh); if (ret < 0) return ret; - kaddr = kmap_atomic(entry_bh->b_page); + kaddr = kmap_local_page(entry_bh->b_page); /* last virtual block number in this block */ first = vinfo->vi_vblocknr; - do_div(first, entries_per_block); + first = div64_ul(first, entries_per_block); first *= entries_per_block; last = first + entries_per_block - 1; for (j = i, n = 0; @@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, vinfo->vi_end = le64_to_cpu(entry->de_end); vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(entry_bh); } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 4c85914f2abc..893ab36824cc 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -66,7 +66,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, dat = nilfs_bmap_get_dat(direct); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) - return ret; + goto dat_error; ptr = blocknr; } @@ -79,7 +79,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) - return ret; + goto dat_error; ptr2 = blocknr; } if (ptr2 != ptr + cnt) @@ -87,6 +87,11 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, } *ptrp = ptr; return cnt; + + dat_error: + if (ret == -ENOENT) + ret = -EINVAL; /* Notify bmap layer of metadata corruption */ + return ret; } static __u64 diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index a8a4bc8490b4..612e609158b5 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -15,6 +15,7 @@ #include "mdt.h" #include "alloc.h" #include "ifile.h" +#include "cpfile.h" /** * struct nilfs_ifile_info - on-memory private data of ifile @@ -115,11 +116,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) return ret; } - kaddr = kmap_atomic(req.pr_entry_bh->b_page); + kaddr = kmap_local_page(req.pr_entry_bh->b_page); raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, req.pr_entry_bh, kaddr); raw_inode->i_flags = 0; - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); @@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object + * @cno: number of checkpoint entry to read * @inode_size: size of an inode - * @raw_inode: on-disk ifile inode - * @inodep: buffer to store the inode + * + * Return: 0 on success, or the following negative error code on failure. + * * %-EINVAL - Invalid checkpoint. + * * %-ENOMEM - Insufficient memory available. + * * %-EIO - I/O error (including metadata corruption). */ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep) + __u64 cno, size_t inode_size) { + struct the_nilfs *nilfs; struct inode *ifile; int err; @@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); - err = nilfs_read_inode_common(ifile, raw_inode); + nilfs = sb->s_fs_info; + err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile); if (err) goto failed; unlock_new_inode(ifile); out: - *inodep = ifile; return 0; failed: iget_failed(ifile); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 35c5273f4821..625545cc2a98 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -21,15 +21,14 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { - void *kaddr = kmap(ibh->b_page); + void *kaddr = kmap_local_page(ibh->b_page); return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); } -static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, - struct buffer_head *ibh) +static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode) { - kunmap(ibh->b_page); + kunmap_local(raw_inode); } int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); @@ -39,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, - size_t inode_size, struct nilfs_inode *raw_inode, - struct inode **inodep); + __u64 cno, size_t inode_size); #endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 9c334c722fc1..7340a01d80e1 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -112,7 +112,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, "%s (ino=%lu): a race condition while inserting a data block at offset=%llu", __func__, inode->i_ino, (unsigned long long)blkoff); - err = 0; + err = -EAGAIN; } nilfs_transaction_abort(inode->i_sb); goto out; @@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode, inode->i_mode, huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); } - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); @@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb, return 0; failed_unmap: - nilfs_ifile_unmap_inode(root->ifile, ino, bh); + nilfs_ifile_unmap_inode(raw_inode); brelse(bh); bad_inode: @@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) return s_inode; } +/** + * nilfs_write_inode_common - export common inode information to on-disk inode + * @inode: inode object + * @raw_inode: on-disk inode + * + * This function writes standard information from the on-memory inode @inode + * to @raw_inode on ifile, cpfile or a super root block. Since inode bmap + * data is not exported, nilfs_bmap_write() must be called separately during + * log writing. + */ void nilfs_write_inode_common(struct inode *inode, - struct nilfs_inode *raw_inode, int has_bmap) + struct nilfs_inode *raw_inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { - struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - - /* zero-fill unused portion in the case of super root block */ - raw_inode->i_xattr = 0; - raw_inode->i_pad = 0; - memset((void *)raw_inode + sizeof(*raw_inode), 0, - nilfs->ns_inode_size - sizeof(*raw_inode)); - } - - if (has_bmap) - nilfs_bmap_write(ii->i_bmap, raw_inode); - else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - raw_inode->i_device_code = - cpu_to_le64(huge_encode_dev(inode->i_rdev)); /* * When extending inode, nilfs->ns_inode_size should be checked * for substitutions of appended fields. @@ -813,14 +808,13 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) if (flags & I_DIRTY_DATASYNC) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); - nilfs_write_inode_common(inode, raw_inode, 0); - /* - * XXX: call with has_bmap = 0 is a workaround to avoid - * deadlock of bmap. This delays update of i_bmap to just - * before writing. - */ + nilfs_write_inode_common(inode, raw_inode); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(huge_encode_dev(inode->i_rdev)); - nilfs_ifile_unmap_inode(ifile, ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } #define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index cfb6aca5ec38..f1a01c191cf5 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1111,7 +1111,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; minseg = range[0] + segbytes - 1; - do_div(minseg, segbytes); + minseg = div64_ul(minseg, segbytes); if (range[1] < 4096) goto out; @@ -1120,7 +1120,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) if (maxseg < segbytes) goto out; - do_div(maxseg, segbytes); + maxseg = div64_ul(maxseg, segbytes); maxseg--; ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index e45c01a559c0..4f792a0ad0f0 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); - kunmap_atomic(kaddr); + kunmap_local(kaddr); set_buffer_uptodate(bh); mark_buffer_dirty(bh); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 98cffaf0ac12..2e29b98ba8ba 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); -extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode); struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino); struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5c2eba1987bd..14e470fb8870 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) struct page *spage = sbh->b_page, *dpage = dbh->b_page; struct buffer_head *bh; - kaddr0 = kmap_atomic(spage); - kaddr1 = kmap_atomic(dpage); + kaddr0 = kmap_local_page(spage); + kaddr1 = kmap_local_page(dpage); memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_atomic(kaddr1); - kunmap_atomic(kaddr0); + kunmap_local(kaddr1); + kunmap_local(kaddr0); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index a9b8d77c8c1d..49a70c68bf3c 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -482,9 +482,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, if (unlikely(!bh_org)) return -EIO; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memcpy(kaddr + from, bh_org->b_data, bh_org->b_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh_org); return 0; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6e59dc19a732..dc431b4c34c9 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } raw_sum->ss_datasum = cpu_to_le32(crc); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2bfb08052d39..aa5290cb7467 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -880,76 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) nilfs_mdt_clear_dirty(nilfs->ns_dat); } -static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - int err; - - /* XXX: this interface will be changed */ - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, - &raw_cp, &bh_cp); - if (likely(!err)) { - /* - * The following code is duplicated with cpfile. But, it is - * needed to collect the checkpoint even if it was not newly - * created. - */ - mark_buffer_dirty(bh_cp); - nilfs_mdt_mark_dirty(nilfs->ns_cpfile); - nilfs_cpfile_put_checkpoint( - nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - } else if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint creation failed due to metadata corruption."); - err = -EIO; - } - return err; -} - -static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) -{ - struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct buffer_head *bh_cp; - struct nilfs_checkpoint *raw_cp; - int err; - - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, - &raw_cp, &bh_cp); - if (unlikely(err)) { - if (err == -EINVAL || err == -ENOENT) { - nilfs_error(sci->sc_super, - "checkpoint finalization failed due to metadata corruption."); - err = -EIO; - } - goto failed_ibh; - } - raw_cp->cp_snapshot_list.ssl_next = 0; - raw_cp->cp_snapshot_list.ssl_prev = 0; - raw_cp->cp_inodes_count = - cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); - raw_cp->cp_blocks_count = - cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); - raw_cp->cp_nblk_inc = - cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); - raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); - raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); - - if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) - nilfs_checkpoint_clear_minor(raw_cp); - else - nilfs_checkpoint_set_minor(raw_cp); - - nilfs_write_inode_common(sci->sc_root->ifile, - &raw_cp->cp_ifile_inode, 1); - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); - return 0; - - failed_ibh: - return err; -} - static void nilfs_fill_in_file_bmap(struct inode *ifile, struct nilfs_inode_info *ii) @@ -963,7 +893,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile, raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, ibh); nilfs_bmap_write(ii->i_bmap, raw_inode); - nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + nilfs_ifile_unmap_inode(raw_inode); } } @@ -977,6 +907,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci) } } +/** + * nilfs_write_root_mdt_inode - export root metadata inode information to + * the on-disk inode + * @inode: inode object of the root metadata file + * @raw_inode: on-disk inode + * + * nilfs_write_root_mdt_inode() writes inode information and bmap data of + * @inode to the inode area of the metadata file allocated on the super root + * block created to finalize the log. Since super root blocks are configured + * each time, this function zero-fills the unused area of @raw_inode. + */ +static void nilfs_write_root_mdt_inode(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + nilfs_write_inode_common(inode, raw_inode); + + /* zero-fill unused portion of raw_inode */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + + nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode); +} + static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { @@ -998,12 +955,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs->ns_nongc_ctime : sci->sc_seg_ctime); raw_sr->sr_flags = 0; - nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr + - NILFS_SR_DAT_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr + - NILFS_SR_CPFILE_OFFSET(isz), 1); - nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + - NILFS_SR_SUFILE_OFFSET(isz), 1); + nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr + + NILFS_SR_DAT_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr + + NILFS_SR_CPFILE_OFFSET(isz)); + nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr + + NILFS_SR_SUFILE_OFFSET(isz)); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); set_buffer_uptodate(bh_sr); unlock_buffer(bh_sr); @@ -1230,7 +1188,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) break; nilfs_sc_cstage_inc(sci); /* Creating a checkpoint */ - err = nilfs_segctor_create_checkpoint(sci); + err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile, + nilfs->ns_cno); if (unlikely(err)) break; fallthrough; @@ -2101,7 +2060,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) { - err = nilfs_segctor_fill_in_checkpoint(sci); + err = nilfs_cpfile_finalize_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root, + sci->sc_nblk_inc + sci->sc_nblk_this_inc, + sci->sc_seg_ctime, + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)); if (unlikely(err)) goto failed_to_write; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 0a8119456c21..6748218be7c5 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -48,7 +48,7 @@ nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; - do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } @@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, struct nilfs_sufile_header *header; void *kaddr; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, ncleanadd); le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); } @@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) ret = nilfs_sufile_get_header_block(sufile, &header_bh); if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); last_alloc = le64_to_cpu(header->sh_last_alloc); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nsegments = nilfs_sufile_get_nsegments(sufile); maxsegnum = sui->allocmax; @@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) &su_bh); if (ret < 0) goto out_header; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); @@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) continue; /* found a clean segment */ nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); le64_add_cpu(&header->sh_ncleansegs, -1); le64_add_cpu(&header->sh_ndirtysegs, 1); header->sh_last_alloc = cpu_to_le64(segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); sui->ncleansegs--; mark_buffer_dirty(header_bh); @@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) goto out_header; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } @@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, struct nilfs_segment_usage *su; void *kaddr; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; @@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, void *kaddr; int clean, dirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } clean = nilfs_segment_usage_clean(su); @@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; @@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, void *kaddr; int sudirty; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_clean(su)) { nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean", __func__, (unsigned long long)segnum); - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } if (unlikely(nilfs_segment_usage_error(su))) @@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, (unsigned long long)segnum); nilfs_segment_usage_set_clean(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); @@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) if (ret) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (unlikely(nilfs_segment_usage_error(su))) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(bh); if (nilfs_segment_is_active(nilfs, segnum)) { nilfs_error(sufile->i_sb, @@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = -EIO; } else { nilfs_segment_usage_set_dirty(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, if (ret < 0) goto out_sem; - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); if (modtime) { /* @@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_lastmod = cpu_to_le64(modtime); } su->su_nblocks = cpu_to_le32(nblocks); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); @@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) if (ret < 0) goto out_sem; - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); @@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) spin_lock(&nilfs->ns_last_segment_lock); sustat->ss_prot_seq = nilfs->ns_prot_seq; spin_unlock(&nilfs->ns_last_segment_lock); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); out_sem: @@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, void *kaddr; int suclean; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_error(su)) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); return; } suclean = nilfs_segment_usage_clean(su); nilfs_segment_usage_set_error(su); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (suclean) { nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, /* hole */ continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); su2 = su; @@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); goto out_header; } @@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, nc++; } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (nc > 0) { mark_buffer_dirty(su_bh); ncleaned += nc; @@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) sui->allocmin = 0; } - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); - kunmap_atomic(kaddr); + kunmap_local(kaddr); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(sufile); @@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); for (j = 0; j < n; @@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, si->sui_flags |= BIT(NILFS_SEGMENT_USAGE_ACTIVE); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(su_bh); } ret = nsegs; @@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, goto out_header; for (;;) { - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, sup->sup_segnum, bh, kaddr); @@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); } - kunmap_atomic(kaddr); + kunmap_local(kaddr); sup = (void *)sup + supsz; if (sup >= supend) @@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) continue; } - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { @@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } if (nblocks >= minlen) { - kunmap_atomic(kaddr); + kunmap_local(kaddr); ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, @@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) } ndiscarded += nblocks; - kaddr = kmap_atomic(su_bh->b_page); + kaddr = kmap_local_page(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); } @@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) start = seg_start; nblocks = seg_end - seg_start + 1; } - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_bh(su_bh); } @@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, goto failed; sui = NILFS_SUI(sufile); - kaddr = kmap_atomic(header_bh->b_page); + kaddr = kmap_local_page(header_bh->b_page); header = kaddr + bh_offset(header_bh); sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); - kunmap_atomic(kaddr); + kunmap_local(kaddr); brelse(header_bh); sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index df8674173b22..ac24ed109ce9 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -448,7 +448,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) sb2off = NILFS_SB2_OFFSET_BYTES(newsize); newnsegs = sb2off >> nilfs->ns_blocksize_bits; - do_div(newnsegs, nilfs->ns_blocks_per_segment); + newnsegs = div64_ul(newnsegs, nilfs->ns_blocks_per_segment); ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); up_write(&nilfs->ns_segctor_sem); @@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; - struct nilfs_checkpoint *raw_cp; - struct buffer_head *bh_cp; int err = -ENOMEM; root = nilfs_find_or_create_root( @@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, goto reuse; /* already attached checkpoint */ down_read(&nilfs->ns_segctor_sem); - err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, - &bh_cp); + err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size); up_read(&nilfs->ns_segctor_sem); - if (unlikely(err)) { - if (err == -ENOENT || err == -EINVAL) { - nilfs_err(sb, - "Invalid checkpoint (checkpoint number=%llu)", - (unsigned long long)cno); - err = -EINVAL; - } + if (unlikely(err)) goto failed; - } - - err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, - &raw_cp->cp_ifile_inode, &root->ifile); - if (err) - goto failed_bh; - - atomic64_set(&root->inodes_count, - le64_to_cpu(raw_cp->cp_inodes_count)); - atomic64_set(&root->blocks_count, - le64_to_cpu(raw_cp->cp_blocks_count)); - - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); reuse: *rootp = root; return 0; - failed_bh: - nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); failed: + if (err == -EINVAL) + nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)", + (unsigned long long)cno); nilfs_put_root(root); return err; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 71400496ed36..2ae2c1bbf6d1 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -413,7 +413,7 @@ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) { u64 max_count = U64_MAX; - do_div(max_count, nilfs->ns_blocks_per_segment); + max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment); return min_t(u64, max_count, ULONG_MAX); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64a6ef638495..cb40cafbc062 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1615,7 +1615,7 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); - /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + /* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); spin_unlock_irqrestore(&lockres->l_lock, flags); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8b6d15010703..0da8e7bd3261 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2763,6 +2763,7 @@ const struct inode_operations ocfs2_file_iops = { const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, + .listxattr = ocfs2_listxattr, .permission = ocfs2_permission, .get_inode_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e7314d6fb8c7..8aabaed2c1cb 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1711,12 +1711,12 @@ static int ocfs2_initialize_mem_caches(void) ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT), + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, NULL); ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", sizeof(struct ocfs2_quota_chunk), 0, - (SLAB_RECLAIM_ACCOUNT), + SLAB_RECLAIM_ACCOUNT, NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index 3b6982bf6bcf..e75e173a9186 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -22,7 +22,7 @@ int op_cache_initialize(void) op_cache = kmem_cache_create("orangefs_op_cache", sizeof(struct orangefs_kernel_op_s), 0, - ORANGEFS_CACHE_CREATE_FLAGS, + 0, NULL); if (!op_cache) { diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 926d9c0a428a..e2df7eeadc7a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -93,16 +93,6 @@ enum orangefs_vfs_op_states { OP_VFS_STATE_GIVEN_UP = 16, }; -/* - * orangefs kernel memory related flags - */ - -#if (defined CONFIG_DEBUG_SLAB) -#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE -#else -#define ORANGEFS_CACHE_CREATE_FLAGS 0 -#endif - extern const struct xattr_handler * const orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 5254256a224d..34849b4a3243 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -527,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); if (!ORANGEFS_SB(sb)) { d = ERR_PTR(-ENOMEM); - goto free_sb_and_op; + goto free_op; } ret = orangefs_fill_sb(sb, @@ -644,7 +644,7 @@ int orangefs_inode_cache_initialize(void) "orangefs_inode_cache", sizeof(struct orangefs_inode_s), 0, - ORANGEFS_CACHE_CREATE_FLAGS, + 0, offsetof(struct orangefs_inode_s, link_target), sizeof_field(struct orangefs_inode_s, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8586e2f5d243..0762575a1e70 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -234,11 +234,11 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) { loff_t tmp; - if (WARN_ON_ONCE(pos != pos2)) + if (pos != pos2) return -EIO; - if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0)) + if (pos < 0 || len < 0 || totlen < 0) return -EIO; - if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp))) + if (check_add_overflow(pos, len, &tmp)) return -EIO; return 0; } diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137..d80a1431ef7b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -32,7 +32,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU - select CRASH_CORE + select VMCORE_INFO help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6422e569b080..8e08a9a1b7ed 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,7 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> */ -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/kcore.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3f78ebbb795f..23fbab954c20 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { @@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) @@ -1807,7 +1806,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) @@ -1873,7 +1872,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } diff --git a/fs/super.c b/fs/super.c index ee05ab6b37e7..71d9779c42b1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1515,11 +1515,29 @@ static int fs_bdev_thaw(struct block_device *bdev) return error; } +static void fs_bdev_super_get(void *data) +{ + struct super_block *sb = data; + + spin_lock(&sb_lock); + sb->s_count++; + spin_unlock(&sb_lock); +} + +static void fs_bdev_super_put(void *data) +{ + struct super_block *sb = data; + + put_super(sb); +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, + .get_holder = fs_bdev_super_get, + .put_holder = fs_bdev_super_put, }; EXPORT_SYMBOL_GPL(fs_holder_ops); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 959551ff9a95..60dcfafdc11a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = { static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; -/* - * Start with fault_pending_wqh and fault_wqh so they're more likely - * to be in the same cacheline. - * - * Locking order: - * fd_wqh.lock - * fault_pending_wqh.lock - * fault_wqh.lock - * event_wqh.lock - * - * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, - * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's - * also taken in IRQ context. - */ -struct userfaultfd_ctx { - /* waitqueue head for the pending (i.e. not read) userfaults */ - wait_queue_head_t fault_pending_wqh; - /* waitqueue head for the userfaults */ - wait_queue_head_t fault_wqh; - /* waitqueue head for the pseudo fd to wakeup poll/read */ - wait_queue_head_t fd_wqh; - /* waitqueue head for events */ - wait_queue_head_t event_wqh; - /* a refile sequence protected by fault_pending_wqh lock */ - seqcount_spinlock_t refile_seq; - /* pseudo fd refcounting */ - refcount_t refcount; - /* userfaultfd syscall flags */ - unsigned int flags; - /* features requested from the userspace */ - unsigned int features; - /* released */ - bool released; - /* memory mappings are changing because of non-cooperative event */ - atomic_t mmap_changing; - /* mm with one ore more vmas attached to this userfaultfd_ctx */ - struct mm_struct *mm; -}; - struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; @@ -724,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -776,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ vma_start_write(vma); @@ -822,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -864,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, return -ENOMEM; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1748,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - flags); + ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1800,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; @@ -1857,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; @@ -1909,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing, flags); + ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1964,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, - uffdio_poison.range.len, - &ctx->mmap_changing, 0); + ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, + uffdio_poison.range.len, 0); mmput(ctx->mm); } else { return -ESRCH; @@ -2040,16 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(mm)) { - mmap_read_lock(mm); - - /* Re-check after taking mmap_lock */ - if (likely(!atomic_read(&ctx->mmap_changing))) - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, - uffdio_move.len, uffdio_move.mode); - else - ret = -EINVAL; - - mmap_read_unlock(mm); + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; @@ -2255,6 +2212,7 @@ static int new_userfaultfd(int flags) ctx->flags = flags; ctx->features = 0; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ |