diff options
Diffstat (limited to 'fs')
230 files changed, 8496 insertions, 4878 deletions
diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 1923affcdc62..ee1b6b06a2fd 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -8,9 +8,8 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#include <linux/fscache.h> - #ifdef CONFIG_9P_FSCACHE +#include <linux/fscache.h> extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, const char *dev_name); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 805151114e96..de009a33e0e2 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -41,14 +41,24 @@ void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid) *pfid = NULL; } +static bool v9fs_is_writeable(int mode) +{ + if (mode & (P9_OWRITE|P9_ORDWR)) + return true; + else + return false; +} + /** * v9fs_fid_find_inode - search for an open fid off of the inode list * @inode: return a fid pointing to a specific inode + * @want_writeable: only consider fids which are writeable * @uid: return a fid belonging to the specified user + * @any: ignore uid as a selection criteria * */ - -static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) +struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable, + kuid_t uid, bool any) { struct hlist_head *h; struct p9_fid *fid, *ret = NULL; @@ -58,7 +68,12 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) spin_lock(&inode->i_lock); h = (struct hlist_head *)&inode->i_private; hlist_for_each_entry(fid, h, ilist) { - if (uid_eq(fid->uid, uid)) { + if (any || uid_eq(fid->uid, uid)) { + if (want_writeable && !v9fs_is_writeable(fid->mode)) { + p9_debug(P9_DEBUG_VFS, " mode: %x not writeable?\n", + fid->mode); + continue; + } p9_fid_get(fid); ret = fid; break; @@ -118,7 +133,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) spin_unlock(&dentry->d_lock); } else { if (dentry->d_inode) - ret = v9fs_fid_find_inode(dentry->d_inode, uid); + ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); } return ret; @@ -299,28 +314,3 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) return v9fs_fid_lookup_with_uid(dentry, uid, any); } -struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) -{ - int err; - struct p9_fid *fid, *ofid; - - ofid = v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0); - fid = clone_fid(ofid); - if (IS_ERR(fid)) - goto error_out; - p9_fid_put(ofid); - /* - * writeback fid will only be used to write back the - * dirty pages. We always request for the open fid in read-write - * mode so that a partial page write which result in page - * read can work. - */ - err = p9_client_open(fid, O_RDWR); - if (err < 0) { - p9_fid_put(fid); - fid = ERR_PTR(err); - goto error_out; - } -error_out: - return fid; -} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 8a4e8cd12ca2..0c51889a60b3 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -7,14 +7,16 @@ #ifndef FS_9P_FID_H #define FS_9P_FID_H #include <linux/list.h> +#include "v9fs.h" +struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable, + kuid_t uid, bool any); struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) { return v9fs_fid_lookup(dentry->d_parent); } void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid); -struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid); static inline struct p9_fid *clone_fid(struct p9_fid *fid) { @@ -32,4 +34,31 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) p9_fid_put(fid); return nfid; } +/** + * v9fs_fid_addmodes - add cache flags to fid mode (for client use only) + * @fid: fid to augment + * @s_flags: session info mount flags + * @s_cache: session info cache flags + * @f_flags: unix open flags + * + * make sure mode reflects flags of underlying mounts + * also qid.version == 0 reflects a synthetic or legacy file system + * NOTE: these are set after open so only reflect 9p client not + * underlying file system on server. + */ +static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags, + int s_cache, unsigned int f_flags) +{ + if (fid->qid.type != P9_QTFILE) + return; + + if ((!s_cache) || + ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || + (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { + fid->mode |= P9L_DIRECT; /* no read or write cache */ + } else if ((!(s_cache & CACHE_WRITEBACK)) || + (f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) { + fid->mode |= P9L_NOWRITECACHE; + } +} #endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 61a51b90600d..c7f774fe398f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -38,9 +38,7 @@ enum { /* String options */ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ - Opt_nodevmap, - /* Cache options */ - Opt_cache_loose, Opt_fscache, Opt_mmap, + Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv, /* Access options */ Opt_access, Opt_posixacl, /* Lock timeout option */ @@ -57,10 +55,10 @@ static const match_table_t tokens = { {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_nodevmap, "nodevmap"}, + {Opt_noxattr, "noxattr"}, + {Opt_directio, "directio"}, + {Opt_ignoreqv, "ignoreqv"}, {Opt_cache, "cache=%s"}, - {Opt_cache_loose, "loose"}, - {Opt_fscache, "fscache"}, - {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -68,32 +66,30 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static const char *const v9fs_cache_modes[nr__p9_cache_modes] = { - [CACHE_NONE] = "none", - [CACHE_MMAP] = "mmap", - [CACHE_LOOSE] = "loose", - [CACHE_FSCACHE] = "fscache", -}; - /* Interpret mount options for cache mode */ static int get_cache_mode(char *s) { int version = -EINVAL; if (!strcmp(s, "loose")) { - version = CACHE_LOOSE; + version = CACHE_SC_LOOSE; p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { - version = CACHE_FSCACHE; + version = CACHE_SC_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "mmap")) { - version = CACHE_MMAP; + version = CACHE_SC_MMAP; p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); + } else if (!strcmp(s, "readahead")) { + version = CACHE_SC_READAHEAD; + p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n"); } else if (!strcmp(s, "none")) { - version = CACHE_NONE; + version = CACHE_SC_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); - } else - pr_info("Unknown Cache mode %s\n", s); + } else if (kstrtoint(s, 0, &version) != 0) { + version = -EINVAL; + pr_info("Unknown Cache mode or invalid value %s\n", s); + } return version; } @@ -121,9 +117,9 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) if (v9ses->nodev) seq_puts(m, ",nodevmap"); if (v9ses->cache) - seq_printf(m, ",%s", v9fs_cache_modes[v9ses->cache]); + seq_printf(m, ",cache=%x", v9ses->cache); #ifdef CONFIG_9P_FSCACHE - if (v9ses->cachetag && v9ses->cache == CACHE_FSCACHE) + if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE)) seq_printf(m, ",cachetag=%s", v9ses->cachetag); #endif @@ -143,9 +139,16 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) break; } + if (v9ses->flags & V9FS_IGNORE_QV) + seq_puts(m, ",ignoreqv"); + if (v9ses->flags & V9FS_DIRECT_IO) + seq_puts(m, ",directio"); if (v9ses->flags & V9FS_POSIX_ACL) seq_puts(m, ",posixacl"); + if (v9ses->flags & V9FS_NO_XATTR) + seq_puts(m, ",noxattr"); + return p9_show_client_options(m, v9ses->clnt); } @@ -266,14 +269,14 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_nodevmap: v9ses->nodev = 1; break; - case Opt_cache_loose: - v9ses->cache = CACHE_LOOSE; + case Opt_noxattr: + v9ses->flags |= V9FS_NO_XATTR; break; - case Opt_fscache: - v9ses->cache = CACHE_FSCACHE; + case Opt_directio: + v9ses->flags |= V9FS_DIRECT_IO; break; - case Opt_mmap: - v9ses->cache = CACHE_MMAP; + case Opt_ignoreqv: + v9ses->flags |= V9FS_IGNORE_QV; break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE @@ -468,7 +471,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - if (v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index f3f74d197b5d..06a2514f0d88 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -31,29 +31,54 @@ #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { - V9FS_PROTO_2000U = 0x01, - V9FS_PROTO_2000L = 0x02, - V9FS_ACCESS_SINGLE = 0x04, - V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_CLIENT = 0x10, - V9FS_POSIX_ACL = 0x20 + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20, + V9FS_NO_XATTR = 0x40, + V9FS_IGNORE_QV = 0x80, /* ignore qid.version for cache hints */ + V9FS_DIRECT_IO = 0x100, + V9FS_SYNC = 0x200 }; -/* possible values of ->cache */ /** - * enum p9_cache_modes - user specified cache preferences - * @CACHE_NONE: do not cache data, dentries, or directory contents (default) - * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * enum p9_cache_shortcuts - human readable cache preferences + * @CACHE_SC_NONE: disable all caches + * @CACHE_SC_READAHEAD: only provide caching for readahead + * @CACHE_SC_MMAP: provide caching to enable mmap + * @CACHE_SC_LOOSE: non-coherent caching for files and meta data + * @CACHE_SC_FSCACHE: persistent non-coherent caching for files and meta-data * - * eventually support loose, tight, time, session, default always none */ -enum p9_cache_modes { - CACHE_NONE, - CACHE_MMAP, - CACHE_LOOSE, - CACHE_FSCACHE, - nr__p9_cache_modes +enum p9_cache_shortcuts { + CACHE_SC_NONE = 0b00000000, + CACHE_SC_READAHEAD = 0b00000001, + CACHE_SC_MMAP = 0b00000101, + CACHE_SC_LOOSE = 0b00001111, + CACHE_SC_FSCACHE = 0b10001111, +}; + +/** + * enum p9_cache_bits - possible values of ->cache + * @CACHE_NONE: caches disabled + * @CACHE_FILE: file caching (open to close) + * @CACHE_META: meta-data and directory caching + * @CACHE_WRITEBACK: write-back caching for files + * @CACHE_LOOSE: don't check cache consistency + * @CACHE_FSCACHE: local persistent caches + * + */ + +enum p9_cache_bits { + CACHE_NONE = 0b00000000, + CACHE_FILE = 0b00000001, + CACHE_META = 0b00000010, + CACHE_WRITEBACK = 0b00000100, + CACHE_LOOSE = 0b00001000, + CACHE_FSCACHE = 0b10000000, }; /** @@ -62,7 +87,7 @@ enum p9_cache_modes { * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle - * @cache: cache mode of type &p9_cache_modes + * @cache: cache mode of type &p9_cache_bits * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as @@ -112,7 +137,6 @@ struct v9fs_inode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; - struct p9_fid *writeback_fid; struct mutex v_mutex; }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 75106b9f293d..cdf441f22e07 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -36,10 +36,6 @@ extern const struct file_operations v9fs_dir_operations; extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; -extern const struct file_operations v9fs_cached_file_operations; -extern const struct file_operations v9fs_cached_file_operations_dotl; -extern const struct file_operations v9fs_mmap_file_operations; -extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 425956eb9fde..8a635999a7d6 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -56,8 +56,6 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct inode *inode = file_inode(file); - struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; BUG_ON(!fid); @@ -65,11 +63,8 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the writeback fid * for that */ - if (rreq->origin == NETFS_READ_FOR_WRITE && - (fid->mode & O_ACCMODE) == O_WRONLY) { - fid = v9inode->writeback_fid; - BUG_ON(!fid); - } + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && + !(fid->mode & P9_ORDWR)); p9_fid_get(fid); rreq->netfs_priv = fid; @@ -119,8 +114,6 @@ const struct netfs_request_ops v9fs_req_ops = { static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) { - struct inode *inode = folio_inode(folio); - if (folio_test_private(folio)) return false; #ifdef CONFIG_9P_FSCACHE @@ -129,8 +122,8 @@ static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) return false; folio_wait_fscache(folio); } + fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); #endif - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode))); return true; } @@ -140,6 +133,7 @@ static void v9fs_invalidate_folio(struct folio *folio, size_t offset, folio_wait_fscache(folio); } +#ifdef CONFIG_9P_FSCACHE static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, bool was_async) { @@ -153,17 +147,19 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, i_size_read(&v9inode->netfs.inode), 0); } } +#endif static int v9fs_vfs_write_folio_locked(struct folio *folio) { struct inode *inode = folio_inode(folio); - struct v9fs_inode *v9inode = V9FS_I(inode); - struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode); loff_t start = folio_pos(folio); loff_t i_size = i_size_read(inode); struct iov_iter from; size_t len = folio_size(folio); + struct p9_fid *writeback_fid; int err; + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); + struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); if (start >= i_size) return 0; /* Simultaneous truncation occurred */ @@ -172,25 +168,33 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - /* We should have writeback_fid always set */ - BUG_ON(!v9inode->writeback_fid); + writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); + if (!writeback_fid) { + WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", + inode->i_private); + return -EINVAL; + } folio_wait_fscache(folio); folio_start_writeback(folio); - p9_client_write(v9inode->writeback_fid, start, &from, &err); + p9_client_write(writeback_fid, start, &from, &err); +#ifdef CONFIG_9P_FSCACHE if (err == 0 && - fscache_cookie_enabled(cookie) && - test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { + fscache_cookie_enabled(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { folio_start_fscache(folio); fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); + folio_mapping(folio), start, len, i_size, + v9fs_write_to_cache_done, v9inode, + true); } +#endif folio_end_writeback(folio); + p9_fid_put(writeback_fid); + return err; } @@ -297,7 +301,6 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct folio *folio = page_folio(subpage); struct inode *inode = mapping->host; - struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -317,7 +320,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (last_pos > inode->i_size) { inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); - fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos); +#ifdef CONFIG_9P_FSCACHE + fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, + &last_pos); +#endif } folio_mark_dirty(folio); out: diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 52bf87934650..45b684b7d8d7 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -196,9 +196,9 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) /** - * v9fs_dir_release - called on a close of a file or directory - * @inode: inode of the directory - * @filp: file pointer to a directory + * v9fs_dir_release - close a directory or a file + * @inode: inode of the directory or file + * @filp: file pointer to a directory or file * */ @@ -213,7 +213,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp, fid ? fid->fid : -1); + if (fid) { + if ((S_ISREG(inode->i_mode)) && (filp->f_mode & FMODE_WRITE)) + retval = filemap_fdatawrite(inode->i_mapping); + spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 367a851eaa82..6c31b8c8112d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -28,7 +28,6 @@ #include "fid.h" #include "cache.h" -static const struct vm_operations_struct v9fs_file_vm_ops; static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** @@ -41,13 +40,11 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops; int v9fs_file_open(struct inode *inode, struct file *file) { int err; - struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *writeback_fid; + struct p9_fid *fid; int omode; p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); - v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = v9fs_open_to_dotl_flags(file->f_flags); @@ -60,7 +57,19 @@ int v9fs_file_open(struct inode *inode, struct file *file) if (IS_ERR(fid)) return PTR_ERR(fid); - err = p9_client_open(fid, omode); + if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) { + int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR; + + p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n"); + err = p9_client_open(fid, writeback_omode); + if (err < 0) { + p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n"); + err = p9_client_open(fid, omode); + fid->mode |= P9L_DIRECT; + } + } else { + err = p9_client_open(fid, omode); + } if (err < 0) { p9_fid_put(fid); return err; @@ -72,36 +81,14 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; } - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((file->f_flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(writeback_fid)) { - err = PTR_ERR(writeback_fid); - mutex_unlock(&v9inode->v_mutex); - goto out_error; - } - v9inode->writeback_fid = (void *) writeback_fid; - } - mutex_unlock(&v9inode->v_mutex); #ifdef CONFIG_9P_FSCACHE - if (v9ses->cache == CACHE_FSCACHE) - fscache_use_cookie(v9fs_inode_cookie(v9inode), + if (v9ses->cache & CACHE_FSCACHE) + fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)), file->f_mode & FMODE_WRITE); #endif + v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); return 0; -out_error: - p9_fid_put(file->private_data); - file->private_data = NULL; - return err; } /** @@ -368,8 +355,13 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct p9_fid *fid = iocb->ki_filp->private_data; int ret, err = 0; - p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", - iov_iter_count(to), iocb->ki_pos); + p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", + fid->fid, iov_iter_count(to), iocb->ki_pos); + + if (!(fid->mode & P9L_DIRECT)) { + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return generic_file_read_iter(iocb, to); + } if (iocb->ki_filp->f_flags & O_NONBLOCK) ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); @@ -392,10 +384,18 @@ static ssize_t v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct p9_fid *fid = file->private_data; ssize_t retval; loff_t origin; int err = 0; + p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); + + if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return generic_file_write_iter(iocb, from); + } + retval = generic_write_checks(iocb, from); if (retval <= 0) return retval; @@ -477,45 +477,18 @@ static int v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; + struct inode *inode = file_inode(filp); + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp); - retval = generic_file_mmap(filp, vma); - if (!retval) - vma->vm_ops = &v9fs_file_vm_ops; - - return retval; -} - -static int -v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) -{ - int retval; - struct inode *inode; - struct v9fs_inode *v9inode; - struct p9_fid *fid; - - inode = file_inode(filp); - v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if (!v9inode->writeback_fid && - (vma->vm_flags & VM_SHARED) && - (vma->vm_flags & VM_WRITE)) { - /* - * clone a fid and add it to writeback_fid - * we do it during mmap instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - fid = v9fs_writeback_fid(file_dentry(filp)); - if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - mutex_unlock(&v9inode->v_mutex); - return retval; - } - v9inode->writeback_fid = (void *) fid; + if (!(v9ses->cache & CACHE_WRITEBACK)) { + p9_debug(P9_DEBUG_CACHE, "(no mmap mode)"); + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + invalidate_inode_pages2(filp->f_mapping); + return generic_file_readonly_mmap(filp, vma); } - mutex_unlock(&v9inode->v_mutex); retval = generic_file_mmap(filp, vma); if (!retval) @@ -527,7 +500,6 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct v9fs_inode *v9inode; struct folio *folio = page_folio(vmf->page); struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); @@ -536,8 +508,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", folio, (unsigned long)filp->private_data); - v9inode = V9FS_I(inode); - /* Wait for the page to be written to the cache before we allow it to * be modified. We then assume the entire page will need writing back. */ @@ -550,7 +520,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(filp); - BUG_ON(!v9inode->writeback_fid); if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; if (folio_mapping(folio) != inode->i_mapping) @@ -563,35 +532,6 @@ out_unlock: return VM_FAULT_NOPAGE; } -/** - * v9fs_mmap_file_read_iter - read from a file - * @iocb: The operation parameters - * @to: The buffer to read into - * - */ -static ssize_t -v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - /* TODO: Check if there are dirty pages */ - return v9fs_file_read_iter(iocb, to); -} - -/** - * v9fs_mmap_file_write_iter - write to a file - * @iocb: The operation parameters - * @from: The data to write - * - */ -static ssize_t -v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - /* - * TODO: invalidate mmaps on filp's inode between - * offset and offset+count - */ - return v9fs_file_write_iter(iocb, from); -} - static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { struct inode *inode; @@ -614,13 +554,6 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } - -static const struct vm_operations_struct v9fs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = v9fs_vm_page_mkwrite, -}; - static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, @@ -628,34 +561,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .page_mkwrite = v9fs_vm_page_mkwrite, }; - -const struct file_operations v9fs_cached_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock, - .mmap = v9fs_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync, -}; - -const struct file_operations v9fs_cached_file_operations_dotl = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock_dotl, - .flock = v9fs_file_flock_dotl, - .mmap = v9fs_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync_dotl, -}; - const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, @@ -677,34 +582,7 @@ const struct file_operations v9fs_file_operations_dotl = { .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, - .mmap = generic_file_readonly_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync_dotl, -}; - -const struct file_operations v9fs_mmap_file_operations = { - .llseek = generic_file_llseek, - .read_iter = v9fs_mmap_file_read_iter, - .write_iter = v9fs_mmap_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock, - .mmap = v9fs_mmap_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync, -}; - -const struct file_operations v9fs_mmap_file_operations_dotl = { - .llseek = generic_file_llseek, - .read_iter = v9fs_mmap_file_read_iter, - .write_iter = v9fs_mmap_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock_dotl, - .flock = v9fs_file_flock_dotl, - .mmap = v9fs_mmap_file_mmap, + .mmap = v9fs_file_mmap, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 502ac74e4959..36b466e35887 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -229,7 +229,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; - v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); return &v9inode->netfs.inode; @@ -286,24 +285,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache == CACHE_LOOSE || - v9ses->cache == CACHE_FSCACHE) - inode->i_fop = - &v9fs_cached_file_operations_dotl; - else if (v9ses->cache == CACHE_MMAP) - inode->i_fop = &v9fs_mmap_file_operations_dotl; - else - inode->i_fop = &v9fs_file_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache == CACHE_LOOSE || - v9ses->cache == CACHE_FSCACHE) - inode->i_fop = - &v9fs_cached_file_operations; - else if (v9ses->cache == CACHE_MMAP) - inode->i_fop = &v9fs_mmap_file_operations; - else - inode->i_fop = &v9fs_file_operations; + inode->i_fop = &v9fs_file_operations; } break; @@ -385,20 +370,23 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) */ void v9fs_evict_inode(struct inode *inode) { - struct v9fs_inode *v9inode = V9FS_I(inode); - __le32 version; + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); + __le32 __maybe_unused version; truncate_inode_pages_final(&inode->i_data); + +#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, &version); +#endif + clear_inode(inode); filemap_fdatawrite(&inode->i_data); +#ifdef CONFIG_9P_FSCACHE fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); - /* clunk the fid stashed in writeback_fid */ - p9_fid_put(v9inode->writeback_fid); - v9inode->writeback_fid = NULL; +#endif } static int v9fs_test_inode(struct inode *inode, void *data) @@ -778,7 +766,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; else if (IS_ERR(fid)) inode = ERR_CAST(fid); - else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + else if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -807,11 +795,12 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, { int err; u32 perm; - struct v9fs_inode *v9inode; + struct v9fs_inode __maybe_unused *v9inode; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *inode_fid; + struct p9_fid *fid; struct dentry *res = NULL; struct inode *inode; + int p9_omode; if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); @@ -830,9 +819,14 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); - fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, - v9fs_proto_dotu(v9ses))); + p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses)); + + if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { + p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; + p9_debug(P9_DEBUG_CACHE, + "write-only file with writeback enabled, creating w/ O_RDWR\n"); + } + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode); if (IS_ERR(fid)) { err = PTR_ERR(fid); goto error; @@ -841,33 +835,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); inode = d_inode(dentry); v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto error; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); err = finish_open(file, dentry, generic_file_open); if (err) goto error; file->private_data = fid; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif + + v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); file->f_mode |= FMODE_CREATED; @@ -1029,15 +1008,24 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_wstat *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; + } else if (v9ses->cache & CACHE_WRITEBACK) { + if (S_ISREG(inode->i_mode)) { + int retval = filemap_fdatawrite(inode->i_mapping); + + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during getattr returned %d\n", retval); + } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -1069,7 +1057,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, { int retval, use_dentry = 0; struct inode *inode = d_inode(dentry); - struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_wstat wstat; @@ -1114,8 +1101,12 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, } /* Write all dirty data */ - if (d_is_reg(dentry)) - filemap_write_and_wait(inode->i_mapping); + if (d_is_reg(dentry)) { + retval = filemap_fdatawrite(inode->i_mapping); + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during setattr returned %d\n", retval); + } retval = p9_client_wstat(fid, &wstat); @@ -1126,9 +1117,17 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) { + struct v9fs_inode *v9inode = V9FS_I(inode); + + fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + } +#endif } v9fs_invalidate_inode_attr(inode); @@ -1412,7 +1411,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * We don't want to refresh inode->i_size, * because we may have cached data */ - flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode(st, inode, inode->i_sb, flags); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index a7da49906d99..5361cd2d7996 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -231,12 +231,12 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, int err = 0; kgid_t gid; umode_t mode; + int p9_omode = v9fs_open_to_dotl_flags(flags); const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; - struct v9fs_inode *v9inode; - struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL; + struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; @@ -281,14 +281,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n", err); goto out; } - err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), - mode, gid, &qid); + + if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { + p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; + p9_debug(P9_DEBUG_CACHE, + "write-only file with writeback enabled, creating w/ O_RDWR\n"); + } + err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid); if (err < 0) { - p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n", err); goto out; } @@ -313,36 +318,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); - v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto out; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) goto out; file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE - if (v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & CACHE_FSCACHE) { + struct v9fs_inode *v9inode = V9FS_I(inode); fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); + } #endif + v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags); v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: @@ -414,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, } /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -457,13 +445,22 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct inode *inode = d_inode(dentry); struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; + } else if (v9ses->cache) { + if (S_ISREG(inode->i_mode)) { + int retval = filemap_fdatawrite(inode->i_mapping); + + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during getattr returned %d\n", retval); + } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -539,12 +536,13 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; + struct inode *inode = d_inode(dentry); + struct v9fs_session_info __maybe_unused *v9ses; struct p9_fid *fid = NULL; struct p9_iattr_dotl p9attr = { .uid = INVALID_UID, .gid = INVALID_GID, }; - struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -552,6 +550,8 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if (retval) return retval; + v9ses = v9fs_dentry2v9ses(dentry); + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); if (iattr->ia_valid & ATTR_MODE) p9attr.mode = iattr->ia_mode; @@ -582,8 +582,12 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(fid); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) - filemap_write_and_wait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) { + retval = filemap_fdatawrite(inode->i_mapping); + if (retval < 0) + p9_debug(P9_DEBUG_ERROR, + "Flushing file prior to setattr failed: %d\n", retval); + } retval = p9_client_setattr(fid, &p9attr); if (retval < 0) { @@ -592,9 +596,17 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, return retval; } - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != + i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) + fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)), + iattr->ia_size); +#endif + } v9fs_invalidate_inode_attr(inode); setattr_copy(&nop_mnt_idmap, inode, iattr); @@ -721,7 +733,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -798,7 +810,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Get the latest stat info from server. */ struct p9_fid *fid; @@ -875,7 +887,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -960,7 +972,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) * We don't want to refresh inode->i_size, * because we may have cached data */ - flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode_dotl(st, inode, flags); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 10449994a972..73db55c050bf 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -63,7 +63,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_magic = V9FS_MAGIC; if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; - sb->s_xattr = v9fs_xattr_handlers; + if (!(v9ses->flags & V9FS_NO_XATTR)) + sb->s_xattr = v9fs_xattr_handlers; } else { sb->s_op = &v9fs_super_ops; sb->s_time_max = U32_MAX; @@ -83,9 +84,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } - sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; - if (!v9ses->cache) - sb->s_flags |= SB_SYNCHRONOUS; + sb->s_flags |= SB_ACTIVE; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) @@ -136,7 +135,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, if (retval) goto release_sb; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -277,7 +276,7 @@ static int v9fs_drop_inode(struct inode *inode) struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -290,49 +289,30 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - struct p9_wstat wstat; struct v9fs_inode *v9inode; + /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - v9inode = V9FS_I(inode); - if (!v9inode->writeback_fid) - return 0; - v9fs_blank_wstat(&wstat); - ret = p9_client_wstat(v9inode->writeback_fid, &wstat); - if (ret < 0) { - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; - } + v9inode = V9FS_I(inode); fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); + return 0; } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - int ret; struct v9fs_inode *v9inode; - /* - * send an fsync request to server irrespective of - * wbc->sync_mode. - */ + v9inode = V9FS_I(inode); - p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", - __func__, inode, v9inode->writeback_fid); - if (!v9inode->writeback_fid) - return 0; - - ret = p9_client_fsync(v9inode->writeback_fid, 0); - if (ret < 0) { - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; - } + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); + return 0; } diff --git a/fs/Kconfig b/fs/Kconfig index e99830c65033..cc07a0cd3172 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -250,16 +250,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -# -# Select this config option from the architecture Kconfig, if it is preferred -# to enable the feature of HugeTLB Vmemmap Optimization (HVO). -# -config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP - bool - config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON diff --git a/fs/Makefile b/fs/Makefile index 05f89b5c962f..834f1c3dba46 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,7 +6,6 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_SYSCTL) += sysctls.o obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ @@ -50,7 +49,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o -obj-$(CONFIG_SYSCTL) += drop_caches.o +obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += iomap/ @@ -124,7 +123,7 @@ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ -obj-$(CONFIG_HOSTFS) += hostfs/ +obj-y += hostfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_TRACING) += tracefs/ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 82690d1dd49a..4dd97afa536c 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -275,6 +275,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) loff_t i_size; int nr_pages, i; int ret; + loff_t remote_size = 0; _enter(""); @@ -289,6 +290,8 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) expand: i_size = i_size_read(&dvnode->netfs.inode); + if (i_size < remote_size) + i_size = remote_size; if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -319,16 +322,16 @@ expand: struct folio *folio; folio = filemap_get_folio(mapping, i); - if (!folio) { + if (IS_ERR(folio)) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto error; + } folio_attach_private(folio, (void *)1); folio_unlock(folio); } @@ -364,6 +367,7 @@ expand: * buffer. */ up_write(&dvnode->validate_lock); + remote_size = req->file_size; goto expand; } @@ -524,7 +528,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, FGP_ACCESSED, 0); - if (!folio) { + if (IS_ERR(folio)) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 0ab7752d1b75..f0eddccbdd95 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); else if (folio && !folio_test_private(folio)) folio_attach_private(folio, (void *)1); diff --git a/fs/afs/file.c b/fs/afs/file.c index 68d6d5dc608d..719b31374879 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma) static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - struct afs_file *af = vmf->vma->vm_file->private_data; - switch (afs_validate(vnode, af->key)) { - case 0: + if (afs_pagecache_valid(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); - case -ENOMEM: - return VM_FAULT_OOM; - case -EINTR: - case -ERESTARTSYS: - return VM_FAULT_RETRY; - case -ESTALE: - default: - return VM_FAULT_SIGBUS; - } + return 0; } static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0167e96e5198..866bab860a88 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -230,6 +230,7 @@ static void afs_apply_status(struct afs_operation *op, set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); } change_size = true; + data_changed = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -449,7 +450,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) 0 : FSCACHE_ADV_SINGLE_CHUNK, &key, sizeof(key), &aux, sizeof(aux), - vnode->status.size)); + i_size_read(&vnode->netfs.inode))); #endif } @@ -668,6 +669,24 @@ bool afs_check_validity(struct afs_vnode *vnode) } /* + * Returns true if the pagecache is still valid. Does not sleep. + */ +bool afs_pagecache_valid(struct afs_vnode *vnode) +{ + if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); + return true; + } + + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && + afs_check_validity(vnode)) + return true; + + return false; +} + +/* * validate a vnode/inode * - there are several things we need to check * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, @@ -684,14 +703,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - goto valid; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) + if (afs_pagecache_valid(vnode)) goto valid; down_write(&vnode->validate_lock); @@ -765,6 +777,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; + + /* Lie about the size of directories. We maintain a locally + * edited copy and may make different allocation decisions on + * it, but we need to give userspace the server's size. + */ + if (S_ISDIR(inode->i_mode)) + stat->size = vnode->netfs.remote_i_size; } while (need_seqretry(&vnode->cb_lock, seq)); done_seqretry(&vnode->cb_lock, seq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 68ae91d21b57..9d3d64921106 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); +bool afs_pagecache_valid(struct afs_vnode *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); diff --git a/fs/afs/write.c b/fs/afs/write.c index 571f3b9a417e..c822d6006033 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping, _debug("kill %lx (to %lx)", index, last); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } @@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, _debug("redirty %llx @%llx", len, start); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8a884e795f6a..1033fbdfdbec 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2058,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm) has_dumped = 1; - offset += sizeof(elf); /* Elf header */ + offset += sizeof(elf); /* ELF header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2174,7 +2174,6 @@ static void __exit exit_elf_binfmt(void) core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); -MODULE_LICENSE("GPL"); #ifdef CONFIG_BINFMT_ELF_KUNIT_TEST #include "binfmt_elf_test.c" diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a05eafcacfb2..05a1471d5283 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1540,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); - offset = sizeof(*elf); /* Elf header */ + offset = sizeof(*elf); /* ELF header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ diff --git a/fs/buffer.c b/fs/buffer.c index b3eb905f87d6..a7fc561758b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -842,7 +842,7 @@ int remove_inode_buffers(struct inode *inode) } /* - * Create the appropriate buffers when given a page for data area and + * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. @@ -850,8 +850,8 @@ int remove_inode_buffers(struct inode *inode) * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */ -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry) +struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, + bool retry) { struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; @@ -861,12 +861,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - /* The page lock pins the memcg */ - memcg = page_memcg(page); + /* The folio lock pins the memcg */ + memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); head = NULL; - offset = PAGE_SIZE; + offset = folio_size(folio); while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp); if (!bh) @@ -878,8 +878,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bh->b_size = size; - /* Link the buffer to its page */ - set_bh_page(bh, page, offset); + /* Link the buffer to its folio */ + folio_set_bh(bh, folio, offset); } out: set_active_memcg(old_memcg); @@ -898,6 +898,13 @@ no_grow: goto out; } +EXPORT_SYMBOL_GPL(folio_alloc_buffers); + +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + bool retry) +{ + return folio_alloc_buffers(page_folio(page), size, retry); +} EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void @@ -1484,6 +1491,21 @@ void set_bh_page(struct buffer_head *bh, } EXPORT_SYMBOL(set_bh_page); +void folio_set_bh(struct buffer_head *bh, struct folio *folio, + unsigned long offset) +{ + bh->b_folio = folio; + BUG_ON(offset >= folio_size(folio)); + if (folio_test_highmem(folio)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = folio_address(folio) + offset; +} +EXPORT_SYMBOL(folio_set_bh); + /* * Called when truncating a buffer on a page completely. */ @@ -1571,18 +1593,17 @@ out: } EXPORT_SYMBOL(block_invalidate_folio); - /* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via private_lock. try_to_free_buffers - * is already excluded via the page lock. + * is already excluded via the folio lock. */ -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) +void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, + unsigned long b_state) { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -1591,19 +1612,26 @@ void create_empty_buffers(struct page *page, } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); - if (PageUptodate(page) || PageDirty(page)) { + spin_lock(&folio->mapping->private_lock); + if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { - if (PageDirty(page)) + if (folio_test_dirty(folio)) set_buffer_dirty(bh); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); + folio_attach_private(folio, head); + spin_unlock(&folio->mapping->private_lock); +} +EXPORT_SYMBOL(folio_create_empty_buffers); + +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + folio_create_empty_buffers(page_folio(page), blocksize, b_state); } EXPORT_SYMBOL(create_empty_buffers); @@ -1694,14 +1722,17 @@ static inline int block_size_bits(unsigned int blocksize) return ilog2(blocksize); } -static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +static struct buffer_head *folio_create_buffers(struct folio *folio, + struct inode *inode, + unsigned int b_state) { - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), - b_state); - return page_buffers(page); + if (!folio_buffers(folio)) + folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), + b_state); + return folio_buffers(folio); } /* @@ -1745,8 +1776,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = create_page_buffers(page, inode, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + head = folio_create_buffers(page_folio(page), inode, + (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio @@ -2009,7 +2040,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); @@ -2295,7 +2326,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 58f8aec964e4..18de8a876b02 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -22,18 +22,9 @@ static struct ctl_table cachefiles_sysctls[] = { {} }; -static struct ctl_table cachefiles_sysctls_root[] = { - { - .procname = "cachefiles", - .mode = 0555, - .child = cachefiles_sysctls, - }, - {} -}; - int __init cachefiles_register_error_injection(void) { - cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root); + cachefiles_sysctl = register_sysctl("cachefiles", cachefiles_sysctls); if (!cachefiles_sysctl) return -ENOMEM; return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d5335f445233..6bb251a4d613 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -808,6 +808,7 @@ static int ceph_writepages_start(struct address_space *mapping, bool should_loop, range_whole = false; bool done = false; bool caching = ceph_is_cache_enabled(inode); + xa_mark_t tag; if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) @@ -834,6 +835,11 @@ static int ceph_writepages_start(struct address_space *mapping, start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { + tag = PAGECACHE_TAG_TOWRITE; + } else { + tag = PAGECACHE_TAG_DIRTY; + } retry: /* find oldest snap context with dirty data */ snapc = get_oldest_context(inode, &ceph_wbc, NULL); @@ -872,6 +878,9 @@ retry: dout(" non-head snapc, range whole\n"); } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(last_snapc); last_snapc = snapc; @@ -888,7 +897,7 @@ retry: get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, - end, PAGECACHE_TAG_DIRTY, &fbatch); + end, tag, &fbatch); dout("pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7cc20772eac9..789be30d6ee2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -431,7 +431,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, * * Called with i_ceph_lock held. */ -static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index bec3c4549c07..3904333fa6c3 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -248,14 +248,20 @@ static int metrics_caps_show(struct seq_file *s, void *p) return 0; } -static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +static int caps_show_cb(struct inode *inode, int mds, void *p) { + struct ceph_inode_info *ci = ceph_inode(inode); struct seq_file *s = p; - - seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), - cap->session->s_mds, - ceph_cap_string(cap->issued), - ceph_cap_string(cap->implemented)); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap) + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + spin_unlock(&ci->i_ceph_lock); return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0ced8b570e42..cb67ac821f0e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1050,6 +1050,9 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + if (dentry->d_flags & DCACHE_DISCONNECTED) + return -EINVAL; + err = ceph_wait_on_conflict_unlink(dentry); if (err) return err; @@ -1057,8 +1060,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("link in dir %p old_dentry %p dentry %p\n", dir, - old_dentry, dentry); + dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", + dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); if (IS_ERR(req)) { d_drop(dentry); @@ -1067,6 +1070,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + /* + * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we + * will just pass the ino# to MDSs. + */ + if (old_dentry->d_flags & DCACHE_DISCONNECTED) + req->r_ino2 = ceph_vino(d_inode(old_dentry)); req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 27a245d959c0..29cf00220b09 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1632,8 +1632,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, * Caller must hold session s_mutex. */ int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) + int (*cb)(struct inode *, int mds, void *), + void *arg) { struct list_head *p; struct ceph_cap *cap; @@ -1645,6 +1645,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { + int mds; + cap = list_entry(p, struct ceph_cap, session_caps); inode = igrab(&cap->ci->netfs.inode); if (!inode) { @@ -1652,6 +1654,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, continue; } session->s_cap_iterator = cap; + mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { @@ -1663,7 +1666,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, old_cap = NULL; } - ret = cb(inode, cap, arg); + ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); @@ -1696,20 +1699,25 @@ out: return ret; } -static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); bool invalidate = false; - int iputs; + struct ceph_cap *cap; + int iputs = 0; - dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); - iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + cap = __get_cap_for_mds(ci, mds); + if (cap) { + dout(" removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->netfs.inode); + + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + } spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); + if (cap) + wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); while (iputs--) @@ -1780,8 +1788,7 @@ enum { * * caller must hold s_mutex. */ -static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned long ev = (unsigned long)arg; @@ -1792,12 +1799,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { - if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) { - /* mds did not re-issue stale cap */ - spin_lock(&ci->i_ceph_lock); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + /* mds did not re-issue stale cap */ + if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) cap->issued = cap->implemented = CEPH_CAP_PIN; - spin_unlock(&ci->i_ceph_lock); - } + spin_unlock(&ci->i_ceph_lock); } else if (ev == FORCE_RO) { } wake_up_all(&ci->i_cap_wq); @@ -1959,16 +1968,22 @@ out: * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. */ -static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +static int trim_caps_cb(struct inode *inode, int mds, void *arg) { int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; + struct ceph_cap *cap; if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + return 0; + } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); wanted = __ceph_caps_file_wanted(ci); @@ -2555,6 +2570,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; + struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; @@ -2572,7 +2588,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, } /* If r_old_dentry is set, then assume that its parent is locked */ - ret = set_request_path_attr(NULL, req->r_old_dentry, + if (req->r_old_dentry && + !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) + old_dentry = req->r_old_dentry; + ret = set_request_path_attr(NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); @@ -3911,26 +3930,22 @@ out_unlock: /* * Encode information about a cap for a reconnect with the MDS. */ -static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci = cap->ci; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; + struct ceph_cap *cap; char *path; - int pathlen = 0, err; + int pathlen = 0, err = 0; u64 pathbase; u64 snap_follows; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", - inode, ceph_vinop(inode), cap, cap->cap_id, - ceph_cap_string(cap->issued)); - dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ @@ -3947,6 +3962,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + goto out_err; + } + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0598faa50e2e..724307ff89cd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -355,8 +355,8 @@ struct ceph_snapid_map { struct rb_node node; struct list_head lru; atomic_t ref; - u64 snap; dev_t dev; + u64 snap; unsigned long last_used; }; @@ -541,8 +541,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, - struct ceph_cap *, void *), + int (*cb)(struct inode *, int mds, void *), void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6ecca2c6d137..d24bf0db5234 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1192,6 +1192,8 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci); +extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 1fe1b62abebd..806183959c47 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -535,6 +535,8 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, return NULL; } +#define MAX_XATTR_VAL_PRINT_LEN 256 + static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, @@ -597,7 +599,7 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->should_free_name = update_xattr; ci->i_xattrs.count++; - dout("__set_xattr count=%d\n", ci->i_xattrs.count); + dout("%s count=%d\n", __func__, ci->i_xattrs.count); } else { kfree(*newxattr); *newxattr = NULL; @@ -625,11 +627,13 @@ static int __set_xattr(struct ceph_inode_info *ci, if (new) { rb_link_node(&xattr->node, parent, p); rb_insert_color(&xattr->node, &ci->i_xattrs.index); - dout("__set_xattr_val p=%p\n", p); + dout("%s p=%p\n", __func__, p); } - dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); + dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__, + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, + min(val_len, MAX_XATTR_VAL_PRINT_LEN), val, + val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : ""); return 0; } @@ -655,13 +659,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, else if (c > 0) p = &(*p)->rb_right; else { - dout("__get_xattr %s: found %.*s\n", name, - xattr->val_len, xattr->val); + int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN); + + dout("%s %s: found %.*s%s\n", __func__, name, len, + xattr->val, xattr->val_len > len ? "..." : ""); return xattr; } } - dout("__get_xattr %s: not found\n", name); + dout("%s %s: not found\n", __func__, name); return NULL; } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 415176b2cf32..74cd6fafb33e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -162,6 +162,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 41 -#define CIFS_VERSION "2.42" +#define SMB3_PRODUCT_BUILD 43 +#define CIFS_VERSION "2.43" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1cbb90587995..7bfef741f758 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1916,18 +1916,22 @@ void cifs_put_smb_ses(struct cifs_ses *ses) /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_GOOD) ses->ses_status = SES_EXITING; - cifs_free_ipc(ses); - if (ses->ses_status == SES_EXITING && server->ops->logoff) { + spin_unlock(&ses->ses_lock); + cifs_free_ipc(ses); xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n", __func__, rc); _free_xid(xid); + } else { + spin_unlock(&ses->ses_lock); + cifs_free_ipc(ses); } spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b33d2e7b0f98..c5fcefdfd797 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4882,6 +4882,8 @@ void cifs_oplock_break(struct work_struct *work) struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; bool purge_cache = false; + struct cifs_deferred_close *dclose; + bool is_deferred = false; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -4918,6 +4920,20 @@ void cifs_oplock_break(struct work_struct *work) oplock_break_ack: /* + * When oplock break is received and there are no active + * file handles but cached, then schedule deferred close immediately. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + spin_unlock(&CIFS_I(inode)->deferred_lock); + + if (!CIFS_CACHE_HANDLE(cinode) && is_deferred && + cfile->deferred_close_scheduled && delayed_work_pending(&cfile->deferred)) { + cifs_close_deferred_file(cinode); + } + + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do * not bother sending an oplock release if session to server still is diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7f085ed2d866..cd914be905b2 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -749,7 +749,9 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&cifs_inode->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&cifs_inode->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) @@ -762,7 +764,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) spin_unlock(&cifs_inode->open_file_lock); list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { - _cifsFileInfo_put(tmp_list->cfile, true, false); + _cifsFileInfo_put(tmp_list->cfile, false, false); list_del(&tmp_list->list); kfree(tmp_list); } @@ -780,7 +782,9 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) @@ -815,7 +819,9 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) if (strstr(full_path, path)) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 366f0c3b799b..0521aa1da644 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -821,7 +821,6 @@ create_posix_buf(umode_t mode) static int add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); @@ -830,11 +829,6 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix)); *num_iovec = num + 1; return 0; } @@ -2069,7 +2063,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) static void parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf) { - struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc; + struct create_disk_id_rsp *pdisk_id = (struct create_disk_id_rsp *)cc; cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n", pdisk_id->DiskFileId, pdisk_id->VolumeId); @@ -2172,10 +2166,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server, } static int -add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, +add_lease_context(struct TCP_Server_Info *server, + struct smb2_create_req *req, + struct kvec *iov, unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); @@ -2183,12 +2178,6 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, - server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -2267,18 +2256,12 @@ static int add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_durable_v2_buf(oparms); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_v2); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2)); *num_iovec = num + 1; return 0; } @@ -2287,7 +2270,6 @@ static int add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; /* indicate that we don't need to relock the file */ @@ -2297,12 +2279,6 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, - sizeof(struct create_durable_handle_reconnect_v2)); *num_iovec = num + 1; return 0; } @@ -2311,7 +2287,6 @@ static int add_durable_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms, bool use_persistent) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; if (use_persistent) { @@ -2331,11 +2306,6 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; } @@ -2369,18 +2339,12 @@ create_twarp_buf(__u64 timewarp) static int add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_twarp_buf(timewarp); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct crt_twarp_ctxt); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_twarp_ctxt)); *num_iovec = num + 1; return 0; } @@ -2503,7 +2467,6 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) static int add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set_owner) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; unsigned int len = 0; @@ -2511,11 +2474,6 @@ add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = len; - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, len); *num_iovec = num + 1; return 0; } @@ -2546,18 +2504,12 @@ create_query_id_buf(void) static int add_query_id_context(struct kvec *iov, unsigned int *num_iovec) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_query_id_buf(); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct crt_query_id_ctxt); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt)); *num_iovec = num + 1; return 0; } @@ -2720,6 +2672,9 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rc = add_posix_context(iov, &n_iov, mode); if (rc) goto err_free_req; + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[1].iov_len); pc_buf = iov[n_iov-1].iov_base; } @@ -2857,21 +2812,13 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, (oparms->create_options & CREATE_NOT_FILE)) req->RequestedOplockLevel = *oplock; /* no srv lease support */ else { - rc = add_lease_context(server, iov, &n_iov, + rc = add_lease_context(server, req, iov, &n_iov, oparms->fid->lease_key, oplock); if (rc) return rc; } if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { - /* need to set Next field of lease context if we request it */ - if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(server->vals->create_lease_size); - } - rc = add_durable_context(iov, &n_iov, oparms, tcon->use_persistent); if (rc) @@ -2879,13 +2826,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } if (tcon->posix_extensions) { - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(iov[n_iov-1].iov_len); - } - rc = add_posix_context(iov, &n_iov, oparms->mode); if (rc) return rc; @@ -2893,13 +2833,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (tcon->snapshot_time) { cifs_dbg(FYI, "adding snapshot context\n"); - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(iov[n_iov-1].iov_len); - } - rc = add_twarp_context(iov, &n_iov, tcon->snapshot_time); if (rc) return rc; @@ -2923,12 +2856,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, set_owner = false; if (set_owner | set_mode) { - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len); - } - cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode); rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner); if (rc) @@ -2936,12 +2863,30 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } } + add_query_id_context(iov, &n_iov); + if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len); + /* + * We have create contexts behind iov[1] (the file + * name), point at them from the main create request + */ + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[1].iov_len); + req->CreateContextsLength = 0; + + for (unsigned int i = 2; i < (n_iov-1); i++) { + struct kvec *v = &iov[i]; + size_t len = v->iov_len; + struct create_context *cctx = + (struct create_context *)v->iov_base; + + cctx->Next = cpu_to_le32(len); + le32_add_cpu(&req->CreateContextsLength, len); + } + le32_add_cpu(&req->CreateContextsLength, + iov[n_iov-1].iov_len); } - add_query_id_context(iov, &n_iov); rqst->rq_nvec = n_iov; return 0; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2114e8a0c63a..220994d0a0f7 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -132,17 +132,6 @@ struct share_redirect_error_context_rsp { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -struct create_durable { - struct create_context ccontext; - __u8 Name[8]; - union { - __u8 Reserved[16]; - struct { - __u64 PersistentFileId; - __u64 VolatileFileId; - } Fid; - } Data; -} __packed; /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ @@ -170,15 +159,6 @@ struct durable_reconnect_context_v2 { __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */ } __packed; -/* See MS-SMB2 2.2.14.2.9 */ -struct create_on_disk_id { - struct create_context ccontext; - __u8 Name[8]; - __le64 DiskFileId; - __le64 VolumeId; - __u32 Reserved[4]; -} __packed; - /* See MS-SMB2 2.2.14.2.12 */ struct durable_reconnect_context_v2_rsp { __le32 Timeout; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index b39580ad4ce5..3c3148588491 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -361,7 +361,7 @@ static int __init init_coda_psdev(void) __func__, CODA_PSDEV_MAJOR); return -EIO; } - coda_psdev_class = class_create(THIS_MODULE, "coda"); + coda_psdev_class = class_create("coda"); if (IS_ERR(coda_psdev_class)) { err = PTR_ERR(coda_psdev_class); goto out_chrdev; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index fda3b702b1c5..a247c14aaab7 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -39,19 +39,10 @@ static struct ctl_table coda_table[] = { {} }; -static struct ctl_table fs_table[] = { - { - .procname = "coda", - .mode = 0555, - .child = coda_table - }, - {} -}; - void coda_sysctl_init(void) { if ( !fs_table_header ) - fs_table_header = register_sysctl_table(fs_table); + fs_table_header = register_sysctl("coda", coda_table); } void coda_sysctl_clean(void) diff --git a/fs/coredump.c b/fs/coredump.c index 5df1e6e1eb2b..ece7badf701b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) pos = file->f_pos; bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); + iov_iter_set_copy_mc(&iter); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4f757a71f99b..980483455cc0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,7 +43,7 @@ * LOCKING: * There are three level of locking required by epoll : * - * 1) epmutex (mutex) + * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * @@ -57,14 +57,8 @@ * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). - * Then we also need a global mutex to serialize eventpoll_release_file() - * and ep_free(). - * This mutex is acquired by ep_free() during the epoll file - * cleanup path and it is also acquired by eventpoll_release_file() - * if a file has been pushed inside an epoll set and it is then - * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). - * It is also acquired when inserting an epoll fd onto another epoll - * fd. We do this so that we walk the epoll tree and ensure that this + * The epnested_mutex is acquired when inserting an epoll fd onto another + * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and @@ -80,9 +74,9 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->lock") to have it working, + * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. - * Events that require holding "epmutex" are very rare, while for + * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ @@ -153,6 +147,13 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; + /* + * Protected by file->f_lock, true for to-be-released epitem already + * removed from the "struct file" items list; together with + * eventpoll->refcount orchestrates "struct eventpoll" disposal + */ + bool dying; + /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -217,6 +218,12 @@ struct eventpoll { u64 gen; struct hlist_head refs; + /* + * usage count, used together with epitem->dying to + * orchestrate the disposal of this struct + */ + refcount_t refcount; + #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; @@ -240,10 +247,8 @@ struct ep_pqueue { /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; -/* - * This mutex is used to serialize ep_free() and eventpoll_release_file(). - */ -static DEFINE_MUTEX(epmutex); +/* Used for cycles detection */ +static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; @@ -258,7 +263,7 @@ static struct kmem_cache *pwq_cache __read_mostly; /* * List of files with newly added links, where we may need to limit the number - * of emanating paths. Protected by the epmutex. + * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; @@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) /* * This function unregisters poll callbacks from the associated file - * descriptor. Must be called with "mtx" held (or "epmutex" if called from - * ep_free). + * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { @@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +static void ep_get(struct eventpoll *ep) +{ + refcount_inc(&ep->refcount); +} + +/* + * Returns true if the event poll can be disposed + */ +static bool ep_refcount_dec_and_test(struct eventpoll *ep) +{ + if (!refcount_dec_and_test(&ep->refcount)) + return false; + + WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); + return true; +} + +static void ep_free(struct eventpoll *ep) +{ + mutex_destroy(&ep->mtx); + free_uid(ep->user); + wakeup_source_unregister(ep->ws); + kfree(ep); +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. + * If the dying flag is set, do the removal only if force is true. + * This prevents ep_clear_and_put() from dropping all the ep references + * while running concurrently with eventpoll_release_file(). + * Returns true if the eventpoll can be disposed. */ -static int ep_remove(struct eventpoll *ep, struct epitem *epi) +static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; @@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); + if (epi->dying && !force) { + spin_unlock(&file->f_lock); + return false; + } + to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { @@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); + return ep_refcount_dec_and_test(ep); +} - return 0; +/* + * ep_remove variant for callers owing an additional reference to the ep + */ +static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) +{ + WARN_ON_ONCE(__ep_remove(ep, epi, false)); } -static void ep_free(struct eventpoll *ep) +static void ep_clear_and_put(struct eventpoll *ep) { - struct rb_node *rbp; + struct rb_node *rbp, *next; struct epitem *epi; + bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() while we're freeing the "struct eventpoll". - * We do not need to hold "ep->mtx" here because the epoll file - * is on the way to be removed and no one has references to it - * anymore. The only hit might come from eventpoll_release_file() but - * holding "epmutex" is sufficient here. - */ - mutex_lock(&epmutex); + mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. @@ -767,26 +805,25 @@ static void ep_free(struct eventpoll *ep) } /* - * Walks through the whole tree by freeing each "struct epitem". At this - * point we are sure no poll callbacks will be lingering around, and also by - * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->lock". - * We do not need to lock ep->mtx, either, we only do it to prevent - * a lockdep warning. + * Walks through the whole tree and try to free each "struct epitem". + * Note that ep_remove_safe() will not remove the epitem in case of a + * racing eventpoll_release_file(); the latter will do the removal. + * At this point we are sure no poll callbacks will be lingering around. + * Since we still own a reference to the eventpoll struct, the loop can't + * dispose it. */ - mutex_lock(&ep->mtx); - while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { + next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); - ep_remove(ep, epi); + ep_remove_safe(ep, epi); cond_resched(); } + + dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - mutex_unlock(&epmutex); - mutex_destroy(&ep->mtx); - free_uid(ep->user); - wakeup_source_unregister(ep->ws); - kfree(ep); + if (dispose) + ep_free(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) @@ -794,7 +831,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) struct eventpoll *ep = file->private_data; if (ep) - ep_free(ep); + ep_clear_and_put(ep); return 0; } @@ -906,33 +943,34 @@ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; - struct hlist_node *next; + bool dispose; /* - * We don't want to get "file->f_lock" because it is not - * necessary. It is not necessary because we're in the "struct file" - * cleanup path, and this means that no one is using this file anymore. - * So, for example, epoll_ctl() cannot hit here since if we reach this - * point, the file counter already went to zero and fget() would fail. - * The only hit might come from ep_free() but by holding the mutex - * will correctly serialize the operation. We do need to acquire - * "ep->mtx" after "epmutex" because ep_remove() requires it when called - * from anywhere but ep_free(). - * - * Besides, ep_remove() acquires the lock, so we can't hold it here. + * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from + * touching the epitems list before eventpoll_release_file() can access + * the ep->mtx. */ - mutex_lock(&epmutex); - if (unlikely(!file->f_ep)) { - mutex_unlock(&epmutex); - return; - } - hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { +again: + spin_lock(&file->f_lock); + if (file->f_ep && file->f_ep->first) { + epi = hlist_entry(file->f_ep->first, struct epitem, fllink); + epi->dying = true; + spin_unlock(&file->f_lock); + + /* + * ep access is safe as we still own a reference to the ep + * struct + */ ep = epi->ep; - mutex_lock_nested(&ep->mtx, 0); - ep_remove(ep, epi); + mutex_lock(&ep->mtx); + dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); + + if (dispose) + ep_free(ep); + goto again; } - mutex_unlock(&epmutex); + spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) @@ -955,6 +993,7 @@ static int ep_alloc(struct eventpoll **pep) ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; + refcount_set(&ep->refcount, 1); *pep = ep; @@ -1223,10 +1262,10 @@ out_unlock: */ list_del_init(&wait->entry); /* - * ->whead != NULL protects us from the race with ep_free() - * or ep_remove(), ep_remove_wait_queue() takes whead->lock - * held by the caller. Once we nullify it, nothing protects - * ep/epi or even wait. + * ->whead != NULL protects us from the race with + * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() + * takes whead->lock held by the caller. Once we nullify it, + * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } @@ -1298,7 +1337,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify - * and delete can't add additional paths. Protected by the epmutex. + * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; @@ -1496,16 +1535,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (tep) mutex_unlock(&tep->mtx); + /* + * ep_remove_safe() calls in the later error paths can't lead to + * ep_free() as the ep file itself still holds an ep reference. + */ + ep_get(ep); + /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return error; } } @@ -1529,7 +1574,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. */ if (unlikely(!epq.epi)) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -ENOMEM; } @@ -2025,7 +2070,7 @@ static int do_epoll_create(int flags) out_free_fd: put_unused_fd(fd); out_free_ep: - ep_free(ep); + ep_clear_and_put(ep); return error; } @@ -2135,7 +2180,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the - * 'epmutex' on add is to prevent complex toplogies such as loops and + * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ @@ -2146,7 +2191,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); - error = epoll_mutex_lock(&epmutex, 0, nonblock); + error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; @@ -2180,10 +2225,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -EEXIST; break; case EPOLL_CTL_DEL: - if (epi) - error = ep_remove(ep, epi); - else + if (epi) { + /* + * The eventpoll itself is still alive: the refcount + * can't go to zero here. + */ + ep_remove_safe(ep, epi); + error = 0; + } else { error = -ENOENT; + } break; case EPOLL_CTL_MOD: if (epi) { @@ -2201,7 +2252,7 @@ error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; - mutex_unlock(&epmutex); + mutex_unlock(&epnested_mutex); } fdput(tf); diff --git a/fs/exec.c b/fs/exec.c index 7c44d0c65b1b..a466e797c8e2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> +#include <linux/user_events.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1034,7 +1035,7 @@ static int exec_mmap(struct mm_struct *mm) mmput(old_mm); return 0; } - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); return 0; } @@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm, current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); + user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e79c767cc5e0..35703dce23a3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5795,7 +5795,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) * mapped - no physical clusters have been allocated, and the * file has no extents */ - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || + ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b9fb1177fff6..859bc4e2c9b0 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -566,9 +566,9 @@ retry: * started */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); - if (!folio) { - ret = -ENOMEM; - goto out; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); @@ -633,6 +633,7 @@ out: folio_unlock(folio); folio_put(folio); } +out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) @@ -693,8 +694,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); - if (!folio) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out; } @@ -854,8 +855,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { @@ -947,8 +948,8 @@ retry_journal: */ folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); - if (!folio) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out_journal; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8dbd352e3986..0d5ba922e411 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1162,8 +1162,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, retry_grab: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -2906,8 +2906,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); @@ -2982,6 +2982,9 @@ static int ext4_da_write_end(struct file *file, ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3618,8 +3621,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; @@ -5201,7 +5204,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 4681fff6665f..4022bc713421 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -282,6 +282,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (mmp_block < le32_to_cpu(es->s_first_data_block) || mmp_block >= ext4_blocks_count(es)) { ext4_warning(sb, "Invalid MMP block in superblock"); + retval = -EINVAL; goto failed; } @@ -307,6 +308,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq == EXT4_MMP_SEQ_FSCK) { dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + retval = -EBUSY; goto failed; } @@ -320,6 +322,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + retval = -ETIMEDOUT; goto failed; } @@ -330,6 +333,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -349,6 +353,7 @@ skip: */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount"); + retval = -ETIMEDOUT; goto failed; } @@ -359,6 +364,7 @@ skip: if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -378,6 +384,7 @@ skip: EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); + retval = -ENOMEM; goto failed; } @@ -385,5 +392,5 @@ skip: failed: brelse(bh); - return 1; + return retval; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e509c22a21ed..b5af2fc03b2f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -140,18 +140,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d03bf0ecf505..d39f386e9baf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1259,7 +1259,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; - int i, err; + int err; /* * Unregister sysfs before destroying jbd2 journal. @@ -1311,7 +1311,7 @@ static void ext4_put_super(struct super_block *sb) ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif @@ -5196,10 +5196,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; - int ret = -ENOMEM; - unsigned int i; int needs_recovery; - int err = 0; + int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; @@ -5212,8 +5210,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - /* -EINVAL is default */ - ret = -EINVAL; err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; @@ -5239,7 +5235,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (ext4_inode_info_init(sb, es)) + err = ext4_inode_info_init(sb, es); + if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); @@ -5255,10 +5252,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); - if (ext4_encoding_init(sb, es)) + err = ext4_encoding_init(sb, es); + if (err) goto failed_mount; - if (ext4_check_journal_data_mode(sb)) + err = ext4_check_journal_data_mode(sb); + if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5267,18 +5266,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; - if (ext4_check_feature_compatibility(sb, es, silent)) + err = ext4_check_feature_compatibility(sb, es, silent); + if (err) goto failed_mount; - if (ext4_block_group_meta_init(sb, silent)) + err = ext4_block_group_meta_init(sb, silent); + if (err) goto failed_mount; ext4_hash_info_init(sb); - if (ext4_handle_clustersize(sb)) + err = ext4_handle_clustersize(sb); + if (err) goto failed_mount; - if (ext4_check_geometry(sb, es)) + err = ext4_check_geometry(sb, es); + if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); @@ -5289,8 +5292,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount3; - /* Register extent status tree shrinker */ - if (ext4_es_register_shrinker(sbi)) + err = ext4_es_register_shrinker(sbi); + if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); @@ -5329,10 +5332,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { + err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); + if (err) goto failed_mount3a; + } + err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -5384,6 +5390,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); + err = -EINVAL; goto failed_mount_wq; } @@ -5392,6 +5399,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); + err = -EINVAL; goto failed_mount_wq; } } @@ -5426,7 +5434,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } @@ -5438,28 +5446,28 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); - ret = PTR_ERR(root); + err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); + err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } - ret = ext4_setup_super(sb, es, sb_rdonly(sb)); - if (ret == -EROFS) { + err = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (err == -EROFS) { sb->s_flags |= SB_RDONLY; - ret = 0; - } else if (ret) + } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); @@ -5503,7 +5511,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - if (ext4_percpu_param_init(sbi)) + err = ext4_percpu_param_init(sbi); + if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) @@ -5511,7 +5520,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount6; } @@ -5628,7 +5637,7 @@ failed_mount: #endif #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); @@ -5637,7 +5646,7 @@ failed_mount: ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; - return err ? err : ret; + return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) @@ -6565,12 +6574,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; sb->s_flags &= ~SB_RDONLY; - if (ext4_has_feature_mmp(sb)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; + if (ext4_has_feature_mmp(sb)) { + err = ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block)); + if (err) goto restore_opts; - } + } #ifdef CONFIG_QUOTA enable_quota = 1; #endif diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 3b01247066dd..2f37e1ea3955 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -366,14 +366,16 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); - if (!folio || !folio_test_uptodate(folio)) { + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); - if (folio) + if (!IS_ERR(folio)) folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } return folio_file_page(folio, index); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 179a5c5e28fd..91e89e68177e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -623,7 +623,7 @@ static int __init cuse_init(void) /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ cuse_channel_fops.unlocked_ioctl = NULL; - cuse_class = class_create(THIS_MODULE, "cuse"); + cuse_class = class_create("cuse"); if (IS_ERR(cuse_class)) return PTR_ERR(cuse_class); diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile index 587bcd6e50a3..16be592e8085 100644 --- a/fs/hostfs/Makefile +++ b/fs/hostfs/Makefile @@ -3,9 +3,11 @@ # Licensed under the GPL # -hostfs-objs := hostfs_kern.o hostfs_user.o +hostfs-objs := hostfs_kern.o -obj-y := +hostfs-builtin-$(CONFIG_HOSTFS) += hostfs_user.o hostfs_user_exp.o + +obj-y := $(hostfs-builtin-y) $(hostfs-builtin-m) obj-$(CONFIG_HOSTFS) += hostfs.o include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/fs/hostfs/hostfs_user_exp.c b/fs/hostfs/hostfs_user_exp.c new file mode 100644 index 000000000000..250c91c55c46 --- /dev/null +++ b/fs/hostfs/hostfs_user_exp.c @@ -0,0 +1,28 @@ +#include <linux/module.h> +#include "hostfs.h" + +EXPORT_SYMBOL_GPL(stat_file); +EXPORT_SYMBOL_GPL(access_file); +EXPORT_SYMBOL_GPL(open_file); +EXPORT_SYMBOL_GPL(open_dir); +EXPORT_SYMBOL_GPL(seek_dir); +EXPORT_SYMBOL_GPL(read_dir); +EXPORT_SYMBOL_GPL(read_file); +EXPORT_SYMBOL_GPL(write_file); +EXPORT_SYMBOL_GPL(lseek_file); +EXPORT_SYMBOL_GPL(fsync_file); +EXPORT_SYMBOL_GPL(replace_file); +EXPORT_SYMBOL_GPL(close_file); +EXPORT_SYMBOL_GPL(close_dir); +EXPORT_SYMBOL_GPL(file_create); +EXPORT_SYMBOL_GPL(set_attr); +EXPORT_SYMBOL_GPL(make_symlink); +EXPORT_SYMBOL_GPL(unlink_file); +EXPORT_SYMBOL_GPL(do_mkdir); +EXPORT_SYMBOL_GPL(hostfs_do_rmdir); +EXPORT_SYMBOL_GPL(do_mknod); +EXPORT_SYMBOL_GPL(link_file); +EXPORT_SYMBOL_GPL(hostfs_do_readlink); +EXPORT_SYMBOL_GPL(rename_file); +EXPORT_SYMBOL_GPL(rename2_file); +EXPORT_SYMBOL_GPL(do_statfs); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9062da6da567..ecfdfb2529a3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, struct folio *folio; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); diff --git a/fs/inode.c b/fs/inode.c index 3ec5a8f7b644..577799b7855f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; + mm_account_reclaimed_pages(reap); } iput(inode); spin_lock(lru_lock); diff --git a/fs/internal.h b/fs/internal.h index ab36ed8fa41c..bd3b2810a36b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc); */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); -extern int vfs_path_lookup(struct dentry *, struct vfsmount *, - const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10a203515583..063133ec77f4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; - struct folio *folio; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (folio) - return folio; - - if (iter->flags & IOMAP_NOWAIT) - return ERR_PTR(-EAGAIN); - return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(iomap_get_folio); @@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); - if (!folio) { + if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f771001574d0..019cc87d0fb3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -130,6 +130,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0) ret += dio->done_before; + trace_iomap_dio_complete(iocb, dio->error, ret); kfree(dio); return ret; @@ -493,6 +494,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct blk_plug plug; struct iomap_dio *dio; + trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); + if (!iomi.len) return NULL; @@ -541,7 +544,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { + if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; /* @@ -650,8 +653,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, */ dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!wait_for_completion) + if (!wait_for_completion) { + trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len); return ERR_PTR(-EIOCBQUEUED); + } for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c index da217246b1a9..728d5443daf5 100644 --- a/fs/iomap/trace.c +++ b/fs/iomap/trace.c @@ -3,6 +3,7 @@ * Copyright (c) 2019 Christoph Hellwig */ #include <linux/iomap.h> +#include <linux/uio.h> /* * We include this last to have the helpers above available for the trace diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index f6ea9540d082..c16fd55f5595 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -83,6 +83,7 @@ DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); +DEFINE_RANGE_EVENT(iomap_dio_rw_queued); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ @@ -107,6 +108,11 @@ DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } +#define IOMAP_DIO_STRINGS \ + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), @@ -183,6 +189,78 @@ TRACE_EVENT(iomap_iter, (void *)__entry->caller) ); +TRACE_EVENT(iomap_dio_rw_begin, + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int dio_flags, size_t done_before), + TP_ARGS(iocb, iter, dio_flags, done_before), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(size_t, count) + __field(size_t, done_before) + __field(int, ki_flags) + __field(unsigned int, dio_flags) + __field(bool, aio) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->done_before = done_before; + __entry->ki_flags = iocb->ki_flags; + __entry->dio_flags = dio_flags; + __entry->aio = !is_sync_kiocb(iocb); + ), + TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __entry->count, + __entry->done_before, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS), + __entry->aio) +); + +TRACE_EVENT(iomap_dio_complete, + TP_PROTO(struct kiocb *iocb, int error, ssize_t ret), + TP_ARGS(iocb, error, ret), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(int, ki_flags) + __field(bool, aio) + __field(int, error) + __field(ssize_t, ret) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->ki_flags = iocb->ki_flags; + __entry->aio = !is_sync_kiocb(iocb); + __entry->error = error; + __entry->ret = ret; + ), + TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __entry->aio, + __entry->error, + __entry->ret) +); + #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 90de0e498371..45b6919903e6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() @@ -196,9 +196,9 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } @@ -224,9 +224,9 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); @@ -294,10 +294,10 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) struct kernfs_node *parent; unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } @@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; /* Update timestamps on the parent */ + down_write(&root->kernfs_iattr_rwsem); + ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* @@ -940,6 +943,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); + init_rwsem(&root->kernfs_iattr_rwsem); + init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* @@ -1462,11 +1467,14 @@ static void __kernfs_remove(struct kernfs_node *kn) pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } @@ -1723,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ - spin_lock_irq(&kernfs_rename_lock); + write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; @@ -1734,7 +1742,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->name = new_name; } - spin_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e4a50e4ff0d2..40c4661f15b7 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -922,8 +922,8 @@ repeat: root = kernfs_root(kn); /* kick fsnotify */ - down_write(&root->kernfs_rwsem); + down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; @@ -960,7 +960,7 @@ repeat: iput(inode); } - up_write(&root->kernfs_rwsem); + up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 30494dcb0df3..b22b74d1a115 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int ret; struct kernfs_root *root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return ret; } @@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EINVAL; root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, setattr_copy(&nop_mnt_idmap, inode, iattr); out: - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return error; } @@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, inode, stat); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return 0; } @@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap, kn = inode->i_private; root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return ret; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 236c3a6113f1..a9b854cdfdb5 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -47,6 +47,8 @@ struct kernfs_root { wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; + struct rw_semaphore kernfs_iattr_rwsem; + struct rw_semaphore kernfs_supers_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e08e8d999807..d49606accb07 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -351,9 +351,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); @@ -380,9 +380,9 @@ void kernfs_kill_sb(struct super_block *sb) struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_del(&info->node); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 0d8242789dc8..0979577fd23e 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -415,7 +415,7 @@ int server_queue_ctrl_reset_work(void) return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET); } -static ssize_t stats_show(struct class *class, struct class_attribute *attr, +static ssize_t stats_show(const struct class *class, const struct class_attribute *attr, char *buf) { /* @@ -434,8 +434,8 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, server_conf.ipc_last_active / HZ); } -static ssize_t kill_server_store(struct class *class, - struct class_attribute *attr, const char *buf, +static ssize_t kill_server_store(const struct class *class, + const struct class_attribute *attr, const char *buf, size_t len) { if (!sysfs_streq(buf, "hard")) @@ -455,7 +455,7 @@ static const char * const debug_type_strings[] = {"smb", "auth", "vfs", "oplock", "ipc", "conn", "rdma"}; -static ssize_t debug_show(struct class *class, struct class_attribute *attr, +static ssize_t debug_show(const struct class *class, const struct class_attribute *attr, char *buf) { ssize_t sz = 0; @@ -473,7 +473,7 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, return sz; } -static ssize_t debug_store(struct class *class, struct class_attribute *attr, +static ssize_t debug_store(const struct class *class, const struct class_attribute *attr, const char *buf, size_t len) { int i; @@ -513,7 +513,6 @@ ATTRIBUTE_GROUPS(ksmbd_control_class); static struct class ksmbd_control_class = { .name = "ksmbd-control", - .owner = THIS_MODULE, .class_groups = ksmbd_control_class_groups, }; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 67b7e766a06b..bbc9e92935fb 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -756,19 +756,6 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt, pneg_ctxt->Ciphers[0] = cipher_type; } -static void build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt, - __le16 comp_algo) -{ - pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES; - pneg_ctxt->DataLength = - cpu_to_le16(sizeof(struct smb2_compression_capabilities_context) - - sizeof(struct smb2_neg_context)); - pneg_ctxt->Reserved = cpu_to_le32(0); - pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1); - pneg_ctxt->Flags = cpu_to_le32(0); - pneg_ctxt->CompressionAlgorithms[0] = comp_algo; -} - static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt, __le16 sign_algo) { @@ -808,7 +795,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, struct smb2_negotiate_rsp *rsp, void *smb2_buf_len) { - char *pneg_ctxt = (char *)rsp + + char * const pneg_ctxt = (char *)rsp + le32_to_cpu(rsp->NegotiateContextOffset); int neg_ctxt_cnt = 1; int ctxt_size; @@ -817,61 +804,46 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, conn->preauth_info->Preauth_HashId); - rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING); ctxt_size = sizeof(struct smb2_preauth_neg_context); - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8); if (conn->cipher_type) { + /* Round to 8 byte boundary */ ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_ENCRYPTION_CAPABILITIES context\n"); - build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt, + build_encrypt_ctxt((struct smb2_encryption_neg_context *) + (pneg_ctxt + ctxt_size), conn->cipher_type); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2; - /* Round to 8 byte boundary */ - pneg_ctxt += - round_up(sizeof(struct smb2_encryption_neg_context) + 2, - 8); } - if (conn->compress_algorithm) { - ctxt_size = round_up(ctxt_size, 8); - ksmbd_debug(SMB, - "assemble SMB2_COMPRESSION_CAPABILITIES context\n"); - /* Temporarily set to SMB3_COMPRESS_NONE */ - build_compression_ctxt((struct smb2_compression_capabilities_context *)pneg_ctxt, - conn->compress_algorithm); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); - ctxt_size += sizeof(struct smb2_compression_capabilities_context) + 2; - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_compression_capabilities_context) + 2, - 8); - } + /* compression context not yet supported */ + WARN_ON(conn->compress_algorithm != SMB3_COMPRESS_NONE); if (conn->posix_ext_supported) { ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n"); - build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + build_posix_ctxt((struct smb2_posix_neg_context *) + (pneg_ctxt + ctxt_size)); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_posix_neg_context); - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8); } if (conn->signing_negotiated) { ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_SIGNING_CAPABILITIES context\n"); - build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt, + build_sign_cap_ctxt((struct smb2_signing_capabilities *) + (pneg_ctxt + ctxt_size), conn->signing_algorithm); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_signing_capabilities) + 2; } + rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); inc_rfc1001_len(smb2_buf_len, ctxt_size); } @@ -2436,7 +2408,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path(work, name, 0, path, 0); + rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2727,8 +2699,10 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1); + rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1); if (!rc) { + file_present = true; + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { /* * If file exists with under flags, return access @@ -2737,7 +2711,6 @@ int smb2_open(struct ksmbd_work *work) if (req->CreateDisposition == FILE_OVERWRITE_IF_LE || req->CreateDisposition == FILE_OPEN_IF_LE) { rc = -EACCES; - path_put(&path); goto err_out; } @@ -2745,26 +2718,23 @@ int smb2_open(struct ksmbd_work *work) ksmbd_debug(SMB, "User does not have write permission\n"); rc = -EACCES; - path_put(&path); goto err_out; } } else if (d_is_symlink(path.dentry)) { rc = -EACCES; - path_put(&path); goto err_out; } - } - if (rc) { + file_present = true; + idmap = mnt_idmap(path.mnt); + } else { if (rc != -ENOENT) goto err_out; ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n", name, rc); rc = 0; - } else { - file_present = true; - idmap = mnt_idmap(path.mnt); } + if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { if (s_type == DATA_STREAM) { @@ -2892,8 +2862,9 @@ int smb2_open(struct ksmbd_work *work) if ((daccess & FILE_DELETE_LE) || (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { - rc = ksmbd_vfs_may_delete(idmap, - path.dentry); + rc = inode_permission(idmap, + d_inode(path.dentry->d_parent), + MAY_EXEC | MAY_WRITE); if (rc) goto err_out; } @@ -3264,10 +3235,13 @@ int smb2_open(struct ksmbd_work *work) } err_out: - if (file_present || created) - path_put(&path); + if (file_present || created) { + inode_unlock(d_inode(path.dentry->d_parent)); + dput(path.dentry); + } ksmbd_revert_fsids(work); err_out1: + if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -5418,44 +5392,19 @@ int smb2_echo(struct ksmbd_work *work) static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - struct mnt_idmap *idmap, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { struct ksmbd_share_config *share = fp->tcon->share_conf; - char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL; - char *pathname = NULL; - struct path path; - bool file_present = true; - int rc; + char *new_name = NULL; + int rc, flags = 0; ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n"); - pathname = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - abs_oldname = file_path(fp->filp, pathname, PATH_MAX); - if (IS_ERR(abs_oldname)) { - rc = -EINVAL; - goto out; - } - old_name = strrchr(abs_oldname, '/'); - if (old_name && old_name[1] != '\0') { - old_name++; - } else { - ksmbd_debug(SMB, "can't get last component in path %s\n", - abs_oldname); - rc = -ENOENT; - goto out; - } - new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); - if (IS_ERR(new_name)) { - rc = PTR_ERR(new_name); - goto out; - } + if (IS_ERR(new_name)) + return PTR_ERR(new_name); if (strchr(new_name, ':')) { int s_type; @@ -5481,7 +5430,7 @@ static int smb2_rename(struct ksmbd_work *work, if (rc) goto out; - rc = ksmbd_vfs_setxattr(idmap, + rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5496,47 +5445,18 @@ static int smb2_rename(struct ksmbd_work *work, } ksmbd_debug(SMB, "new name %s\n", new_name); - rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1); - if (rc) { - if (rc != -ENOENT) - goto out; - file_present = false; - } else { - path_put(&path); - } - if (ksmbd_share_veto_filename(share, new_name)) { rc = -ENOENT; ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name); goto out; } - if (file_info->ReplaceIfExists) { - if (file_present) { - rc = ksmbd_vfs_remove_file(work, new_name); - if (rc) { - if (rc != -ENOTEMPTY) - rc = -EINVAL; - ksmbd_debug(SMB, "cannot delete %s, rc %d\n", - new_name, rc); - goto out; - } - } - } else { - if (file_present && - strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) { - rc = -EEXIST; - ksmbd_debug(SMB, - "cannot rename already existing file\n"); - goto out; - } - } + if (!file_info->ReplaceIfExists) + flags = RENAME_NOREPLACE; - rc = ksmbd_vfs_fp_rename(work, fp, new_name); + rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); out: - kfree(pathname); - if (!IS_ERR(new_name)) - kfree(new_name); + kfree(new_name); return rc; } @@ -5576,18 +5496,17 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0); + rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, + &path, 0); if (rc) { if (rc != -ENOENT) goto out; file_present = false; - } else { - path_put(&path); } if (file_info->ReplaceIfExists) { if (file_present) { - rc = ksmbd_vfs_remove_file(work, link_name); + rc = ksmbd_vfs_remove_file(work, &path); if (rc) { rc = -EINVAL; ksmbd_debug(SMB, "cannot delete %s\n", @@ -5607,6 +5526,10 @@ static int smb2_create_link(struct ksmbd_work *work, if (rc) rc = -EINVAL; out: + if (file_present) { + inode_unlock(d_inode(path.dentry->d_parent)); + path_put(&path); + } if (!IS_ERR(link_name)) kfree(link_name); kfree(pathname); @@ -5784,12 +5707,6 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, struct smb2_file_rename_info *rename_info, unsigned int buf_len) { - struct mnt_idmap *idmap; - struct ksmbd_file *parent_fp; - struct dentry *parent; - struct dentry *dentry = fp->filp->f_path.dentry; - int ret; - if (!(fp->daccess & FILE_DELETE_LE)) { pr_err("no right to delete : 0x%x\n", fp->daccess); return -EACCES; @@ -5799,32 +5716,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(rename_info->FileNameLength)) return -EINVAL; - idmap = file_mnt_idmap(fp->filp); - if (ksmbd_stream_fd(fp)) - goto next; - - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; - } - - parent_fp = ksmbd_lookup_fd_inode(d_inode(parent)); - inode_unlock(d_inode(parent)); - dput(parent); + if (!le32_to_cpu(rename_info->FileNameLength)) + return -EINVAL; - if (parent_fp) { - if (parent_fp->daccess & FILE_DELETE_LE) { - pr_err("parent dir is opened with delete access\n"); - ksmbd_fd_put(work, parent_fp); - return -ESHARE; - } - ksmbd_fd_put(work, parent_fp); - } -next: - return smb2_rename(work, fp, idmap, rename_info, - work->conn->local_nls); + return smb2_rename(work, fp, rename_info, work->conn->local_nls); } static int set_file_disposition_info(struct ksmbd_file *fp, diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 9420dd2813fb..67dc552f2ef7 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -70,18 +70,6 @@ struct create_durable_req_v2 { __u8 CreateGuid[16]; } __packed; -struct create_durable_reconn_req { - struct create_context ccontext; - __u8 Name[8]; - union { - __u8 Reserved[16]; - struct { - __u64 PersistentFileId; - __u64 VolatileFileId; - } Fid; - } Data; -} __packed; - struct create_durable_reconn_v2_req { struct create_context ccontext; __u8 Name[8]; @@ -109,12 +97,6 @@ struct create_app_inst_id_vers { __le64 AppInstanceVersionLow; } __packed; -struct create_mxac_req { - struct create_context ccontext; - __u8 Name[8]; - __le64 Timestamp; -} __packed; - struct create_alloc_size_req { struct create_context ccontext; __u8 Name[8]; @@ -137,21 +119,6 @@ struct create_durable_v2_rsp { __le32 Flags; } __packed; -struct create_mxac_rsp { - struct create_context ccontext; - __u8 Name[8]; - __le32 QueryStatus; - __le32 MaximalAccess; -} __packed; - -struct create_disk_id_rsp { - struct create_context ccontext; - __u8 Name[8]; - __le64 DiskFileId; - __le64 VolumeId; - __u8 Reserved[16]; -} __packed; - /* equivalent of the contents of SMB3.1.1 POSIX open context response */ struct create_posix_rsp { struct create_context ccontext; diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 5ea9229dad2c..778c152708e4 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -18,8 +18,7 @@ #include <linux/vmalloc.h> #include <linux/sched/xacct.h> #include <linux/crc32c.h> - -#include "../internal.h" /* for vfs_path_lookup */ +#include <linux/namei.h> #include "glob.h" #include "oplock.h" @@ -37,19 +36,6 @@ #include "mgmt/user_session.h" #include "mgmt/user_config.h" -static char *extract_last_component(char *path) -{ - char *p = strrchr(path, '/'); - - if (p && p[1] != '\0') { - *p = '\0'; - p++; - } else { - p = NULL; - } - return p; -} - static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, struct inode *parent_inode, struct inode *inode) @@ -63,65 +49,77 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, /** * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable - * - * the parent dentry got by dget_parent or @parent could be - * unstable, we try to lock a parent inode and lookup the - * child dentry again. - * - * the reference count of @parent isn't incremented. */ -int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, - struct dentry *child) +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) { - struct dentry *dentry; - int ret = 0; - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one(idmap, child->d_name.name, parent, - child->d_name.len); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out_err; - } - - if (dentry != child) { - ret = -ESTALE; - dput(dentry); - goto out_err; + if (child->d_parent != parent) { + inode_unlock(d_inode(parent)); + return -ENOENT; } - dput(dentry); return 0; -out_err: - inode_unlock(d_inode(parent)); - return ret; } -int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, - struct dentry *dentry) +static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, + char *pathname, unsigned int flags, + struct path *path) { - struct dentry *parent; - int ret; + struct qstr last; + struct filename *filename; + struct path *root_share_path = &share_conf->vfs_path; + int err, type; + struct path parent_path; + struct dentry *d; + + if (pathname[0] == '\0') { + pathname = share_conf->path; + root_share_path = NULL; + } else { + flags |= LOOKUP_BENEATH; + } + + filename = getname_kernel(pathname); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + err = vfs_path_parent_lookup(filename, flags, + &parent_path, &last, &type, + root_share_path); + putname(filename); + if (err) + return err; - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; + if (unlikely(type != LAST_NORM)) { + path_put(&parent_path); + return -ENOENT; } - ret = inode_permission(idmap, d_inode(parent), - MAY_EXEC | MAY_WRITE); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + if (IS_ERR(d)) + goto err_out; - inode_unlock(d_inode(parent)); - dput(parent); - return ret; + if (d_is_negative(d)) { + dput(d); + goto err_out; + } + + path->dentry = d; + path->mnt = share_conf->vfs_path.mnt; + path_put(&parent_path); + + return 0; + +err_out: + inode_unlock(parent_path.dentry->d_inode); + path_put(&parent_path); + return -ENOENT; } int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { - struct dentry *parent; int ret = 0; *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); @@ -138,18 +136,9 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC)) *daccess |= FILE_EXECUTE_LE; - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; - } - - if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; - inode_unlock(d_inode(parent)); - dput(parent); return ret; } @@ -582,54 +571,32 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) +int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) { struct mnt_idmap *idmap; - struct path path; - struct dentry *parent; + struct dentry *parent = path->dentry->d_parent; int err; if (ksmbd_override_fsids(work)) return -ENOMEM; - err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false); - if (err) { - ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); - ksmbd_revert_fsids(work); - return err; - } - - idmap = mnt_idmap(path.mnt); - parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry); - if (err) { - dput(parent); - path_put(&path); - ksmbd_revert_fsids(work); - return err; - } - - if (!d_inode(path.dentry)->i_nlink) { + if (!d_inode(path->dentry)->i_nlink) { err = -ENOENT; goto out_err; } - if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(idmap, d_inode(parent), path.dentry); + idmap = mnt_idmap(path->mnt); + if (S_ISDIR(d_inode(path->dentry)->i_mode)) { + err = vfs_rmdir(idmap, d_inode(parent), path->dentry); if (err && err != -ENOTEMPTY) - ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, - err); + ksmbd_debug(VFS, "rmdir failed, err %d\n", err); } else { - err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL); + err = vfs_unlink(idmap, d_inode(parent), path->dentry, NULL); if (err) - ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, - err); + ksmbd_debug(VFS, "unlink failed, err %d\n", err); } out_err: - inode_unlock(d_inode(parent)); - dput(parent); - path_put(&path); ksmbd_revert_fsids(work); return err; } @@ -688,149 +655,114 @@ out1: return err; } -static int ksmbd_validate_entry_in_use(struct dentry *src_dent) +int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, + char *newname, int flags) { - struct dentry *dst_dent; - - spin_lock(&src_dent->d_lock); - list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) { - struct ksmbd_file *child_fp; + struct dentry *old_parent, *new_dentry, *trap; + struct dentry *old_child = old_path->dentry; + struct path new_path; + struct qstr new_last; + struct renamedata rd; + struct filename *to; + struct ksmbd_share_config *share_conf = work->tcon->share_conf; + struct ksmbd_file *parent_fp; + int new_type; + int err, lookup_flags = LOOKUP_NO_SYMLINKS; - if (d_really_is_negative(dst_dent)) - continue; + if (ksmbd_override_fsids(work)) + return -ENOMEM; - child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent)); - if (child_fp) { - spin_unlock(&src_dent->d_lock); - ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n"); - return -EACCES; - } + to = getname_kernel(newname); + if (IS_ERR(to)) { + err = PTR_ERR(to); + goto revert_fsids; } - spin_unlock(&src_dent->d_lock); - return 0; -} +retry: + err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, + &new_path, &new_last, &new_type, + &share_conf->vfs_path); + if (err) + goto out1; -static int __ksmbd_vfs_rename(struct ksmbd_work *work, - struct mnt_idmap *src_idmap, - struct dentry *src_dent_parent, - struct dentry *src_dent, - struct mnt_idmap *dst_idmap, - struct dentry *dst_dent_parent, - struct dentry *trap_dent, - char *dst_name) -{ - struct dentry *dst_dent; - int err; + if (old_path->mnt != new_path.mnt) { + err = -EXDEV; + goto out2; + } - if (!work->tcon->posix_extensions) { - err = ksmbd_validate_entry_in_use(src_dent); - if (err) - return err; + trap = lock_rename_child(old_child, new_path.dentry); + + old_parent = dget(old_child->d_parent); + if (d_unhashed(old_child)) { + err = -EINVAL; + goto out3; } - if (d_really_is_negative(src_dent_parent)) - return -ENOENT; - if (d_really_is_negative(dst_dent_parent)) - return -ENOENT; - if (d_really_is_negative(src_dent)) - return -ENOENT; - if (src_dent == trap_dent) - return -EINVAL; + parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent)); + if (parent_fp) { + if (parent_fp->daccess & FILE_DELETE_LE) { + pr_err("parent dir is opened with delete access\n"); + err = -ESHARE; + ksmbd_fd_put(work, parent_fp); + goto out3; + } + ksmbd_fd_put(work, parent_fp); + } - if (ksmbd_override_fsids(work)) - return -ENOMEM; + new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, + lookup_flags | LOOKUP_RENAME_TARGET); + if (IS_ERR(new_dentry)) { + err = PTR_ERR(new_dentry); + goto out3; + } - dst_dent = lookup_one(dst_idmap, dst_name, - dst_dent_parent, strlen(dst_name)); - err = PTR_ERR(dst_dent); - if (IS_ERR(dst_dent)) { - pr_err("lookup failed %s [%d]\n", dst_name, err); - goto out; + if (d_is_symlink(new_dentry)) { + err = -EACCES; + goto out4; } - err = -ENOTEMPTY; - if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { - struct renamedata rd = { - .old_mnt_idmap = src_idmap, - .old_dir = d_inode(src_dent_parent), - .old_dentry = src_dent, - .new_mnt_idmap = dst_idmap, - .new_dir = d_inode(dst_dent_parent), - .new_dentry = dst_dent, - }; - err = vfs_rename(&rd); + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) { + err = -EEXIST; + goto out4; } - if (err) - pr_err("vfs_rename failed err %d\n", err); - if (dst_dent) - dput(dst_dent); -out: - ksmbd_revert_fsids(work); - return err; -} -int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - char *newname) -{ - struct mnt_idmap *idmap; - struct path dst_path; - struct dentry *src_dent_parent, *dst_dent_parent; - struct dentry *src_dent, *trap_dent, *src_child; - char *dst_name; - int err; + if (old_child == trap) { + err = -EINVAL; + goto out4; + } - dst_name = extract_last_component(newname); - if (!dst_name) { - dst_name = newname; - newname = ""; + if (new_dentry == trap) { + err = -ENOTEMPTY; + goto out4; } - src_dent_parent = dget_parent(fp->filp->f_path.dentry); - src_dent = fp->filp->f_path.dentry; + rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.old_dir = d_inode(old_parent), + rd.old_dentry = old_child, + rd.new_mnt_idmap = mnt_idmap(new_path.mnt), + rd.new_dir = new_path.dentry->d_inode, + rd.new_dentry = new_dentry, + rd.flags = flags, + err = vfs_rename(&rd); + if (err) + ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); - err = ksmbd_vfs_kern_path(work, newname, - LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, - &dst_path, false); - if (err) { - ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); - goto out; +out4: + dput(new_dentry); +out3: + dput(old_parent); + unlock_rename(old_parent, new_path.dentry); +out2: + path_put(&new_path); + + if (retry_estale(err, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; } - dst_dent_parent = dst_path.dentry; - - trap_dent = lock_rename(src_dent_parent, dst_dent_parent); - dget(src_dent); - dget(dst_dent_parent); - idmap = file_mnt_idmap(fp->filp); - src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent, - src_dent->d_name.len); - if (IS_ERR(src_child)) { - err = PTR_ERR(src_child); - goto out_lock; - } - - if (src_child != src_dent) { - err = -ESTALE; - dput(src_child); - goto out_lock; - } - dput(src_child); - - err = __ksmbd_vfs_rename(work, - idmap, - src_dent_parent, - src_dent, - mnt_idmap(dst_path.mnt), - dst_dent_parent, - trap_dent, - dst_name); -out_lock: - dput(src_dent); - dput(dst_dent_parent); - unlock_rename(src_dent_parent, dst_dent_parent); - path_put(&dst_path); -out: - dput(src_dent_parent); +out1: + putname(to); +revert_fsids: + ksmbd_revert_fsids(work); return err; } @@ -1081,14 +1013,16 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, return vfs_removexattr(idmap, dentry, attr_name); } -int ksmbd_vfs_unlink(struct mnt_idmap *idmap, - struct dentry *dir, struct dentry *dentry) +int ksmbd_vfs_unlink(struct file *filp) { int err = 0; + struct dentry *dir, *dentry = filp->f_path.dentry; + struct mnt_idmap *idmap = file_mnt_idmap(filp); - err = ksmbd_vfs_lock_parent(idmap, dir, dentry); + dir = dget_parent(dentry); + err = ksmbd_vfs_lock_parent(dir, dentry); if (err) - return err; + goto out; dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) @@ -1100,6 +1034,8 @@ int ksmbd_vfs_unlink(struct mnt_idmap *idmap, inode_unlock(d_inode(dir)); if (err) ksmbd_debug(VFS, "failed to delete, err %d\n", err); +out: + dput(dir); return err; } @@ -1202,7 +1138,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, } /** - * ksmbd_vfs_kern_path() - lookup a file and get path info + * ksmbd_vfs_kern_path_locked() - lookup a file and get path info * @name: file path that is relative to share * @flags: lookup flags * @path: if lookup succeed, return path info @@ -1210,24 +1146,20 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, bool caseless) +int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, + bool caseless) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; + struct path parent_path; - flags |= LOOKUP_BENEATH; - err = vfs_path_lookup(share_conf->vfs_path.dentry, - share_conf->vfs_path.mnt, - name, - flags, - path); + err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path); if (!err) - return 0; + return err; if (caseless) { char *filepath; - struct path parent; size_t path_len, remain_len; filepath = kstrdup(name, GFP_KERNEL); @@ -1237,10 +1169,10 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, path_len = strlen(filepath); remain_len = path_len; - parent = share_conf->vfs_path; - path_get(&parent); + parent_path = share_conf->vfs_path; + path_get(&parent_path); - while (d_can_lookup(parent.dentry)) { + while (d_can_lookup(parent_path.dentry)) { char *filename = filepath + path_len - remain_len; char *next = strchrnul(filename, '/'); size_t filename_len = next - filename; @@ -1249,12 +1181,11 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, if (filename_len == 0) break; - err = ksmbd_vfs_lookup_in_dir(&parent, filename, + err = ksmbd_vfs_lookup_in_dir(&parent_path, filename, filename_len, work->conn->um); - path_put(&parent); if (err) - goto out; + goto out2; next[0] = '\0'; @@ -1262,23 +1193,31 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, share_conf->vfs_path.mnt, filepath, flags, - &parent); + path); if (err) - goto out; - else if (is_last) { - *path = parent; - goto out; - } + goto out2; + else if (is_last) + goto out1; + path_put(&parent_path); + parent_path = *path; next[0] = '/'; remain_len -= filename_len + 1; } - path_put(&parent); err = -EINVAL; -out: +out2: + path_put(&parent_path); +out1: kfree(filepath); } + + if (!err) { + err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry); + if (err) + dput(path->dentry); + path_put(&parent_path); + } return err; } diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 9d676ab0cd25..a4ae89f3230d 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -71,9 +71,7 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, - struct dentry *child); -int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry); +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child); int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); @@ -84,12 +82,12 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf, size_t count, loff_t *pos, bool sync, ssize_t *written); int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); -int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); +int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path); int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, const char *newname); int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat); -int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - char *newname); +int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, + char *newname, int flags); int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size); struct srv_copychunk; @@ -116,9 +114,9 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name); -int ksmbd_vfs_kern_path(struct ksmbd_work *work, - char *name, unsigned int flags, struct path *path, - bool caseless); +int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, + bool caseless); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -131,8 +129,7 @@ struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, unsigned int in_count, unsigned int *out_count); -int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, - struct dentry *dentry); +int ksmbd_vfs_unlink(struct file *filp); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, struct mnt_idmap *idmap, diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 054a7d2e0f48..2d0138e72d78 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -244,7 +244,6 @@ void ksmbd_release_inode_hash(void) static void __ksmbd_inode_close(struct ksmbd_file *fp) { - struct dentry *dir, *dentry; struct ksmbd_inode *ci = fp->f_ci; int err; struct file *filp; @@ -263,11 +262,9 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (atomic_dec_and_test(&ci->m_count)) { write_lock(&ci->m_lock); if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) { - dentry = filp->f_path.dentry; - dir = dentry->d_parent; ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); write_unlock(&ci->m_lock); - ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry); + ksmbd_vfs_unlink(filp); write_lock(&ci->m_lock); } write_unlock(&ci->m_lock); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 6d5e83ed4476..ac9f9d84510e 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -3,10 +3,12 @@ # Makefile for the linux lock manager stuff # +ccflags-y += -I$(src) # needed for trace events + obj-$(CONFIG_LOCKD) += lockd.o -lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o xdr.o +lockd-objs-y += clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ + svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-objs-$(CONFIG_PROC_FS) += procfs.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 82b19a30e0f0..e3972aa3045a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,9 +14,12 @@ #include <linux/nfs_fs.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> +#include "trace.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT /* @@ -29,18 +32,6 @@ static int reclaimer(void *ptr); * client perspective. */ -/* - * This is the representation of a blocked client lock. - */ -struct nlm_wait { - struct list_head b_list; /* linked list */ - wait_queue_head_t b_wait; /* where to wait on */ - struct nlm_host * b_host; - struct file_lock * b_lock; /* local file lock */ - unsigned short b_reclaim; /* got to reclaim lock */ - __be32 b_status; /* grant callback status */ -}; - static LIST_HEAD(nlm_blocked); static DEFINE_SPINLOCK(nlm_blocked_lock); @@ -94,41 +85,42 @@ void nlmclnt_done(struct nlm_host *host) } EXPORT_SYMBOL_GPL(nlmclnt_done); +void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct file_lock *fl) +{ + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = nlm_lck_blocked; +} + /* * Queue up a lock for blocking so that the GRANTED request can see it */ -struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) +void nlmclnt_queue_block(struct nlm_wait *block) { - struct nlm_wait *block; - - block = kmalloc(sizeof(*block), GFP_KERNEL); - if (block != NULL) { - block->b_host = host; - block->b_lock = fl; - init_waitqueue_head(&block->b_wait); - block->b_status = nlm_lck_blocked; - - spin_lock(&nlm_blocked_lock); - list_add(&block->b_list, &nlm_blocked); - spin_unlock(&nlm_blocked_lock); - } - return block; + spin_lock(&nlm_blocked_lock); + list_add(&block->b_list, &nlm_blocked); + spin_unlock(&nlm_blocked_lock); } -void nlmclnt_finish_block(struct nlm_wait *block) +/* + * Dequeue the block and return its final status + */ +__be32 nlmclnt_dequeue_block(struct nlm_wait *block) { - if (block == NULL) - return; + __be32 status; + spin_lock(&nlm_blocked_lock); list_del(&block->b_list); + status = block->b_status; spin_unlock(&nlm_blocked_lock); - kfree(block); + return status; } /* * Block on a lock */ -int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) +int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout) { long ret; @@ -154,7 +146,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* Reset the lock status after a server reboot so we resend */ if (block->b_status == nlm_lck_denied_grace_period) block->b_status = nlm_lck_blocked; - req->a_res.status = block->b_status; return 0; } @@ -198,6 +189,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) res = nlm_granted; } spin_unlock(&nlm_blocked_lock); + trace_nlmclnt_grant(lock, addr, svc_addr_len(addr), res); return res; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 16b4de868cd2..fba6c7fa7474 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -20,6 +20,8 @@ #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> +#include "trace.h" + #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) #define NLMCLNT_POLL_TIMEOUT (30*HZ) @@ -451,6 +453,9 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) status = nlm_stat_to_errno(req->a_res.status); } out: + trace_nlmclnt_test(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; } @@ -516,9 +521,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) const struct cred *cred = nfs_file_cred(fl->fl_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - struct nlm_wait *block = NULL; + struct nlm_wait block; unsigned char fl_flags = fl->fl_flags; unsigned char fl_type; + __be32 b_status; int status = -ENOLCK; if (nsm_monitor(host) < 0) @@ -531,31 +537,41 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) goto out; - block = nlmclnt_prepare_block(host, fl); + nlmclnt_prepare_block(&block, host, fl); again: /* * Initialise resp->status to a valid non-zero value, * since 0 == nlm_lck_granted */ resp->status = nlm_lck_blocked; - for(;;) { + + /* + * A GRANTED callback can come at any time -- even before the reply + * to the LOCK request arrives, so we queue the wait before + * requesting the lock. + */ + nlmclnt_queue_block(&block); + for (;;) { /* Reboot protection */ fl->fl_u.nfs_fl.state = host->h_state; status = nlmclnt_call(cred, req, NLMPROC_LOCK); if (status < 0) break; /* Did a reclaimer thread notify us of a server reboot? */ - if (resp->status == nlm_lck_denied_grace_period) + if (resp->status == nlm_lck_denied_grace_period) continue; if (resp->status != nlm_lck_blocked) break; /* Wait on an NLM blocking lock */ - status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; - if (resp->status != nlm_lck_blocked) + if (block.b_status != nlm_lck_blocked) break; } + b_status = nlmclnt_dequeue_block(&block); + if (resp->status == nlm_lck_blocked) + resp->status = b_status; /* if we were interrupted while blocking, then cancel the lock request * and exit @@ -564,7 +580,7 @@ again: if (!req->a_args.block) goto out_unlock; if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) - goto out_unblock; + goto out; } if (resp->status == nlm_granted) { @@ -593,16 +609,19 @@ again: status = -ENOLCK; else status = nlm_stat_to_errno(resp->status); -out_unblock: - nlmclnt_finish_block(block); out: + trace_nlmclnt_lock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; out_unlock: /* Fatal error: ensure that we remove the lock altogether */ + trace_nlmclnt_lock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); dprintk("lockd: lock attempt ended in fatal error.\n" " Attempting to unlock.\n"); - nlmclnt_finish_block(block); fl_type = fl->fl_type; fl->fl_type = F_UNLCK; down_read(&host->h_rwsem); @@ -696,6 +715,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: + trace_nlmclnt_unlock(&req->a_args.lock, + (const struct sockaddr *)&req->a_host->h_addr, + req->a_host->h_addrlen, req->a_res.status); nlmclnt_release_call(req); return status; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index cdc8e12cdac4..127a728fcbc8 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -629,6 +629,7 @@ nlm_shutdown_hosts_net(struct net *net) rpc_shutdown_client(host->h_rpcclnt); host->h_rpcclnt = NULL; } + nlmsvc_free_host_resources(host); } /* Then, perform a garbage collection pass */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9a47303b2cba..bb94949bc223 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -510,24 +510,6 @@ static struct ctl_table nlm_sysctls[] = { { } }; -static struct ctl_table nlm_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nlm_sysctls, - }, - { } -}; - -static struct ctl_table nlm_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nlm_sysctl_dir, - }, - { } -}; - #endif /* CONFIG_SYSCTL */ /* @@ -644,7 +626,7 @@ static int __init init_nlm(void) #ifdef CONFIG_SYSCTL err = -ENOMEM; - nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 4e30f3c50970..c43ccdf28ed9 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -954,19 +954,32 @@ void nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) { struct nlm_block *block; + struct file_lock *fl; + int error; dprintk("grant_reply: looking for cookie %x, s=%d \n", *(unsigned int *)(cookie->data), status); if (!(block = nlmsvc_find_block(cookie))) return; - if (status == nlm_lck_denied_grace_period) { + switch (status) { + case nlm_lck_denied_grace_period: /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); - } else { + break; + case nlm_lck_denied: + /* Client doesn't want it, just unlock it */ + nlmsvc_unlink_block(block); + fl = &block->b_call->a_args.lock.fl; + fl->fl_type = F_UNLCK; + error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL); + if (error) + pr_warn("lockd: unable to unlock lock rejected by client!\n"); + break; + default: /* - * Lock is now held by client, or has been rejected. - * In both cases, the block should be removed. + * Either it was accepted or the status makes no sense + * just unlink it either way. */ nlmsvc_unlink_block(block); } diff --git a/fs/lockd/trace.c b/fs/lockd/trace.c new file mode 100644 index 000000000000..d9a6ff6e673c --- /dev/null +++ b/fs/lockd/trace.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h new file mode 100644 index 000000000000..7461b13b6e74 --- /dev/null +++ b/fs/lockd/trace.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockd + +#if !defined(_TRACE_LOCKD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCKD_H + +#include <linux/tracepoint.h> +#include <linux/crc32.h> +#include <linux/nfs.h> +#include <linux/lockd/lockd.h> + +#ifdef CONFIG_LOCKD_V4 +#define NLM_STATUS_LIST \ + nlm_status_code(LCK_GRANTED) \ + nlm_status_code(LCK_DENIED) \ + nlm_status_code(LCK_DENIED_NOLOCKS) \ + nlm_status_code(LCK_BLOCKED) \ + nlm_status_code(LCK_DENIED_GRACE_PERIOD) \ + nlm_status_code(DEADLCK) \ + nlm_status_code(ROFS) \ + nlm_status_code(STALE_FH) \ + nlm_status_code(FBIG) \ + nlm_status_code_end(FAILED) +#else +#define NLM_STATUS_LIST \ + nlm_status_code(LCK_GRANTED) \ + nlm_status_code(LCK_DENIED) \ + nlm_status_code(LCK_DENIED_NOLOCKS) \ + nlm_status_code(LCK_BLOCKED) \ + nlm_status_code_end(LCK_DENIED_GRACE_PERIOD) +#endif + +#undef nlm_status_code +#undef nlm_status_code_end +#define nlm_status_code(x) TRACE_DEFINE_ENUM(NLM_##x); +#define nlm_status_code_end(x) TRACE_DEFINE_ENUM(NLM_##x); + +NLM_STATUS_LIST + +#undef nlm_status_code +#undef nlm_status_code_end +#define nlm_status_code(x) { NLM_##x, #x }, +#define nlm_status_code_end(x) { NLM_##x, #x } + +#define show_nlm_status(x) __print_symbolic(x, NLM_STATUS_LIST) + +DECLARE_EVENT_CLASS(nlmclnt_lock_event, + TP_PROTO( + const struct nlm_lock *lock, + const struct sockaddr *addr, + unsigned int addrlen, + __be32 status + ), + + TP_ARGS(lock, addr, addrlen, status), + + TP_STRUCT__entry( + __field(u32, oh) + __field(u32, svid) + __field(u32, fh) + __field(unsigned long, status) + __field(u64, start) + __field(u64, len) + __sockaddr(addr, addrlen) + ), + + TP_fast_assign( + __entry->oh = ~crc32_le(0xffffffff, lock->oh.data, lock->oh.len); + __entry->svid = lock->svid; + __entry->fh = nfs_fhandle_hash(&lock->fh); + __entry->start = lock->lock_start; + __entry->len = lock->lock_len; + __entry->status = be32_to_cpu(status); + __assign_sockaddr(addr, addr, addrlen); + ), + + TP_printk( + "addr=%pISpc oh=0x%08x svid=0x%08x fh=0x%08x start=%llu len=%llu status=%s", + __get_sockaddr(addr), __entry->oh, __entry->svid, + __entry->fh, __entry->start, __entry->len, + show_nlm_status(__entry->status) + ) +); + +#define DEFINE_NLMCLNT_EVENT(name) \ + DEFINE_EVENT(nlmclnt_lock_event, name, \ + TP_PROTO( \ + const struct nlm_lock *lock, \ + const struct sockaddr *addr, \ + unsigned int addrlen, \ + __be32 status \ + ), \ + TP_ARGS(lock, addr, addrlen, status)) + +DEFINE_NLMCLNT_EVENT(nlmclnt_test); +DEFINE_NLMCLNT_EVENT(nlmclnt_lock); +DEFINE_NLMCLNT_EVENT(nlmclnt_unlock); +DEFINE_NLMCLNT_EVENT(nlmclnt_grant); + +#endif /* _TRACE_LOCKD_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/mpage.c b/fs/mpage.c index 22b9de5ddd68..242e213ee064 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,23 +43,49 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio) +static void mpage_read_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); + + bio_for_each_folio_all(fi, bio) { + if (err) + folio_set_error(fi.folio); + else + folio_mark_uptodate(fi.folio); + folio_unlock(fi.folio); + } + + bio_put(bio); +} - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - page_endio(page, bio_op(bio), - blk_status_to_errno(bio->bi_status)); +static void mpage_write_end_io(struct bio *bio) +{ + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); + + bio_for_each_folio_all(fi, bio) { + if (err) { + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, err); + } + folio_end_writeback(fi.folio); } bio_put(bio); } -static struct bio *mpage_bio_submit(struct bio *bio) +static struct bio *mpage_bio_submit_read(struct bio *bio) +{ + bio->bi_end_io = mpage_read_end_io; + guard_bio_eod(bio); + submit_bio(bio); + return NULL; +} + +static struct bio *mpage_bio_submit_write(struct bio *bio) { - bio->bi_end_io = mpage_end_io; + bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; @@ -265,7 +291,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This folio will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { @@ -278,7 +304,7 @@ alloc_new: length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } @@ -286,7 +312,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -294,7 +320,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else @@ -356,7 +382,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) args.bio = do_mpage_readpage(&args); } if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); @@ -373,7 +399,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) args.bio = do_mpage_readpage(&args); if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); @@ -577,7 +603,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { @@ -596,7 +622,7 @@ alloc_new: wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); goto alloc_new; } @@ -606,7 +632,7 @@ alloc_new: folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -618,7 +644,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable @@ -652,7 +678,7 @@ mpage_writepages(struct address_space *mapping, blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) - mpage_bio_submit(mpd.bio); + mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return ret; } diff --git a/fs/namei.c b/fs/namei.c index f04f7be58933..e4fe0879ae55 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -254,6 +254,7 @@ getname_kernel(const char * filename) return result; } +EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { @@ -271,6 +272,7 @@ void putname(struct filename *name) } else __putname(name); } +EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking @@ -1581,8 +1583,9 @@ static struct dentry *lookup_dcache(const struct qstr *name, * when directory is guaranteed to have no in-lookup children * at all. */ -static struct dentry *__lookup_hash(const struct qstr *name, - struct dentry *base, unsigned int flags) +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, + unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; @@ -1606,6 +1609,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, } return dentry; } +EXPORT_SYMBOL(lookup_one_qstr_excl); static struct dentry *lookup_fast(struct nameidata *nd) { @@ -2532,16 +2536,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags, } /* Note: this does not consume "name" */ -static int filename_parentat(int dfd, struct filename *name, - unsigned int flags, struct path *parent, - struct qstr *last, int *type) +static int __filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type, + const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); - set_nameidata(&nd, dfd, name, NULL); + set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); @@ -2556,6 +2561,13 @@ static int filename_parentat(int dfd, struct filename *name, return retval; } +static int filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + return __filename_parentat(dfd, name, flags, parent, last, type, NULL); +} + /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(struct filename *name, struct path *path) { @@ -2571,7 +2583,7 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = __lookup_hash(&last, path->dentry, 0); + d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); @@ -2600,6 +2612,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path) EXPORT_SYMBOL(kern_path); /** + * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair + * @filename: filename structure + * @flags: lookup flags + * @parent: pointer to struct path to fill + * @last: last component + * @type: type of the last component + * @root: pointer to struct path of the base directory + */ +int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, + struct path *parent, struct qstr *last, int *type, + const struct path *root) +{ + return __filename_parentat(AT_FDCWD, filename, flags, parent, last, + type, root); +} +EXPORT_SYMBOL(vfs_path_parent_lookup); + +/** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory @@ -2980,20 +3010,10 @@ static inline int may_create(struct mnt_idmap *idmap, return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } -/* - * p1 and p2 should be directories on the same fs. - */ -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p; - if (p1 == p2) { - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - return NULL; - } - - mutex_lock(&p1->d_sb->s_vfs_rename_mutex); - p = d_ancestor(p2, p1); if (p) { inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); @@ -3012,8 +3032,64 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } + +/* + * p1 and p2 should be directories on the same fs. + */ +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +{ + if (p1 == p2) { + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + return NULL; + } + + mutex_lock(&p1->d_sb->s_vfs_rename_mutex); + return lock_two_directories(p1, p2); +} EXPORT_SYMBOL(lock_rename); +/* + * c1 and p2 should be on the same fs. + */ +struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +{ + if (READ_ONCE(c1->d_parent) == p2) { + /* + * hopefully won't need to touch ->s_vfs_rename_mutex at all. + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + /* + * now that p2 is locked, nobody can move in or out of it, + * so the test below is safe. + */ + if (likely(c1->d_parent == p2)) + return NULL; + + /* + * c1 got moved out of p2 while we'd been taking locks; + * unlock and fall back to slow case. + */ + inode_unlock(p2->d_inode); + } + + mutex_lock(&c1->d_sb->s_vfs_rename_mutex); + /* + * nobody can move out of any directories on this fs. + */ + if (likely(c1->d_parent != p2)) + return lock_two_directories(c1->d_parent, p2); + + /* + * c1 got moved into p2 while we were taking locks; + * we need p2 locked and ->s_vfs_rename_mutex unlocked, + * for consistency with lock_rename(). + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); + return NULL; +} +EXPORT_SYMBOL(lock_rename_child); + void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); @@ -3806,7 +3882,8 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); + dentry = lookup_one_qstr_excl(&last, path->dentry, + reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -4166,7 +4243,7 @@ retry: goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; @@ -4299,7 +4376,7 @@ retry: goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -4863,7 +4940,8 @@ retry: retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); - old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); + old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, + lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; @@ -4871,7 +4949,8 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; - new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); + new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, + lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index e3d754a9e1b0..3404707ddbe7 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -349,8 +349,8 @@ int netfs_write_begin(struct netfs_inode *ctx, retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index c1c7ed2fd860..b6fc169be1b1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -170,6 +170,7 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + select NETFS_SUPPORT help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6fbcbb8d6587..8257de6dba45 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static void nfs_readdir_free_folio(struct folio *); +static void nfs_readdir_clear_array(struct folio *); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = { }; const struct address_space_operations nfs_dir_aops = { - .free_folio = nfs_readdir_free_folio, + .free_folio = nfs_readdir_clear_array, }; #define NFS_INIT_DTSIZE PAGE_SIZE @@ -146,18 +146,18 @@ struct nfs_cache_array { u64 change_attr; u64 last_cookie; unsigned int size; - unsigned char page_full : 1, - page_is_eof : 1, + unsigned char folio_full : 1, + folio_is_eof : 1, cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; struct nfs_readdir_descriptor { struct file *file; - struct page *page; + struct folio *folio; struct dir_context *ctx; - pgoff_t page_index; - pgoff_t page_index_max; + pgoff_t folio_index; + pgoff_t folio_index_max; u64 dir_cookie; u64 last_cookie; loff_t current_index; @@ -198,17 +198,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) nfs_set_dtsize(desc, desc->dtsize << 1); } -static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, - u64 change_attr) +static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, + u64 change_attr) { struct nfs_cache_array *array; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; - array->page_full = 0; - array->page_is_eof = 0; + array->folio_full = 0; + array->folio_is_eof = 0; array->cookies_are_ordered = 1; kunmap_local(array); } @@ -216,44 +216,39 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, /* * we are freeing strings created by nfs_add_to_readdir_array() */ -static void nfs_readdir_clear_array(struct page *page) +static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; unsigned int i; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; kunmap_local(array); } -static void nfs_readdir_free_folio(struct folio *folio) +static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, + u64 change_attr) { - nfs_readdir_clear_array(&folio->page); + nfs_readdir_clear_array(folio); + nfs_readdir_folio_init_array(folio, last_cookie, change_attr); } -static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie, - u64 change_attr) +static struct folio * +nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) { - nfs_readdir_clear_array(page); - nfs_readdir_page_init_array(page, last_cookie, change_attr); + struct folio *folio = folio_alloc(gfp_flags, 0); + if (folio) + nfs_readdir_folio_init_array(folio, last_cookie, 0); + return folio; } -static struct page * -nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +static void nfs_readdir_folio_array_free(struct folio *folio) { - struct page *page = alloc_page(gfp_flags); - if (page) - nfs_readdir_page_init_array(page, last_cookie, 0); - return page; -} - -static void nfs_readdir_page_array_free(struct page *page) -{ - if (page) { - nfs_readdir_clear_array(page); - put_page(page); + if (folio) { + nfs_readdir_clear_array(folio); + folio_put(folio); } } @@ -264,13 +259,13 @@ static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { - array->page_is_eof = 1; - array->page_full = 1; + array->folio_is_eof = 1; + array->folio_full = 1; } static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) { - return array->page_full; + return array->folio_full; } /* @@ -302,18 +297,18 @@ static size_t nfs_readdir_array_maxentries(void) */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { - if (array->page_full) + if (array->folio_full) return -ENOSPC; if (array->size == nfs_readdir_array_maxentries()) { - array->page_full = 1; + array->folio_full = 1; return -ENOSPC; } return 0; } -static int nfs_readdir_page_array_append(struct page *page, - const struct nfs_entry *entry, - u64 *cookie) +static int nfs_readdir_folio_array_append(struct folio *folio, + const struct nfs_entry *entry, + u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; @@ -322,7 +317,7 @@ static int nfs_readdir_page_array_append(struct page *page, name = nfs_readdir_copy_name(entry->name, entry->len); - array = kmap_atomic(page); + array = kmap_atomic(folio_page(folio, 0)); if (!name) goto out; ret = nfs_readdir_array_can_expand(array); @@ -361,17 +356,17 @@ out: * 127 readdir entries for a typical 64-bit system, that works out to a * cache of ~ 33 million entries per directory. */ -static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) +static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) { if (cookie == 0) return 0; return hash_64(cookie, 18); } -static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, - u64 change_attr) +static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, + u64 change_attr) { - struct nfs_cache_array *array = kmap_local_page(page); + struct nfs_cache_array *array = kmap_local_folio(folio, 0); int ret = true; if (array->change_attr != change_attr) @@ -382,81 +377,83 @@ static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, return ret; } -static void nfs_readdir_page_unlock_and_put(struct page *page) +static void nfs_readdir_folio_unlock_and_put(struct folio *folio) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } -static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie, - u64 change_attr) +static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, + u64 change_attr) { - if (PageUptodate(page)) { - if (nfs_readdir_page_validate(page, cookie, change_attr)) + if (folio_test_uptodate(folio)) { + if (nfs_readdir_folio_validate(folio, cookie, change_attr)) return; - nfs_readdir_clear_array(page); + nfs_readdir_clear_array(folio); } - nfs_readdir_page_init_array(page, cookie, change_attr); - SetPageUptodate(page); + nfs_readdir_folio_init_array(folio, cookie, change_attr); + folio_mark_uptodate(folio); } -static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, - u64 cookie, u64 change_attr) +static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, + u64 cookie, u64 change_attr) { - pgoff_t index = nfs_readdir_page_cookie_hash(cookie); - struct page *page; + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; - page = grab_cache_page(mapping, index); - if (!page) + folio = filemap_grab_folio(mapping, index); + if (!folio) return NULL; - nfs_readdir_page_init_and_validate(page, cookie, change_attr); - return page; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + return folio; } -static u64 nfs_readdir_page_last_cookie(struct page *page) +static u64 nfs_readdir_folio_last_cookie(struct folio *folio) { struct nfs_cache_array *array; u64 ret; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); ret = array->last_cookie; kunmap_local(array); return ret; } -static bool nfs_readdir_page_needs_filling(struct page *page) +static bool nfs_readdir_folio_needs_filling(struct folio *folio) { struct nfs_cache_array *array; bool ret; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); ret = !nfs_readdir_array_is_full(array); kunmap_local(array); return ret; } -static void nfs_readdir_page_set_eof(struct page *page) +static void nfs_readdir_folio_set_eof(struct folio *folio) { struct nfs_cache_array *array; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); nfs_readdir_array_set_eof(array); kunmap_local(array); } -static struct page *nfs_readdir_page_get_next(struct address_space *mapping, - u64 cookie, u64 change_attr) +static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, + u64 cookie, u64 change_attr) { - pgoff_t index = nfs_readdir_page_cookie_hash(cookie); - struct page *page; + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; - page = grab_cache_page_nowait(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping)); + if (!folio) return NULL; - nfs_readdir_page_init_and_validate(page, cookie, change_attr); - if (nfs_readdir_page_last_cookie(page) != cookie) - nfs_readdir_page_reinit_array(page, cookie, change_attr); - return page; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + if (nfs_readdir_folio_last_cookie(folio) != cookie) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; } static inline @@ -481,11 +478,11 @@ bool nfs_readdir_use_cookie(const struct file *filp) static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { - if (array->page_full) { + if (array->folio_full) { desc->last_cookie = array->last_cookie; desc->current_index += array->size; desc->cache_entry_index = 0; - desc->page_index++; + desc->folio_index++; } else desc->last_cookie = nfs_readdir_array_index_cookie(array); } @@ -494,7 +491,7 @@ static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) { desc->current_index = 0; desc->last_cookie = 0; - desc->page_index = 0; + desc->folio_index = 0; } static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, @@ -506,7 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->page_is_eof) + if (array->folio_is_eof) goto out_eof; nfs_readdir_seek_next_array(array, desc); return -EAGAIN; @@ -554,7 +551,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, } } check_eof: - if (array->page_is_eof) { + if (array->folio_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; @@ -568,7 +565,7 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) struct nfs_cache_array *array; int status; - array = kmap_local_page(desc->page); + array = kmap_local_folio(desc->folio, 0); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -819,16 +816,17 @@ static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, } /* Perform conversion from xdr to cache array */ -static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, - struct nfs_entry *entry, - struct page **xdr_pages, unsigned int buflen, - struct page **arrays, size_t narrays, - u64 change_attr) +static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, unsigned int buflen, + struct folio **arrays, size_t narrays, + u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; + struct folio *new, *folio = *arrays; struct xdr_stream stream; + struct page *scratch; struct xdr_buf buf; - struct page *scratch, *new, *page = *arrays; u64 cookie; int status; @@ -844,36 +842,36 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, if (status != 0) break; - status = nfs_readdir_page_array_append(page, entry, &cookie); + status = nfs_readdir_folio_array_append(folio, entry, &cookie); if (status != -ENOSPC) continue; - if (page->mapping != mapping) { + if (folio->mapping != mapping) { if (!--narrays) break; - new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL); + new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; - *arrays = page = new; + *arrays = folio = new; } else { - new = nfs_readdir_page_get_next(mapping, cookie, - change_attr); + new = nfs_readdir_folio_get_next(mapping, cookie, + change_attr); if (!new) break; - if (page != *arrays) - nfs_readdir_page_unlock_and_put(page); - page = new; + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); + folio = new; } - desc->page_index_max++; - status = nfs_readdir_page_array_append(page, entry, &cookie); + desc->folio_index_max++; + status = nfs_readdir_folio_array_append(folio, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: if (!entry->eof) break; - nfs_readdir_page_set_eof(page); + nfs_readdir_folio_set_eof(folio); fallthrough; case -EAGAIN: status = 0; @@ -886,8 +884,8 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, ; } - if (page != *arrays) - nfs_readdir_page_unlock_and_put(page); + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); put_page(scratch); return status; @@ -927,11 +925,11 @@ out_freepages: static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, - struct page **arrays, size_t narrays) + struct folio **arrays, size_t narrays) { u64 change_attr; struct page **pages; - struct page *page = *arrays; + struct folio *folio = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); @@ -942,7 +940,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; - entry->cookie = nfs_readdir_page_last_cookie(page); + entry->cookie = nfs_readdir_folio_last_cookie(folio); entry->fh = nfs_alloc_fhandle(); entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); entry->server = NFS_SERVER(inode); @@ -962,10 +960,10 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, pglen = status; if (pglen != 0) - status = nfs_readdir_page_filler(desc, entry, pages, pglen, - arrays, narrays, change_attr); + status = nfs_readdir_folio_filler(desc, entry, pages, pglen, + arrays, narrays, change_attr); else - nfs_readdir_page_set_eof(page); + nfs_readdir_folio_set_eof(folio); desc->buffer_fills++; free_pages: @@ -977,33 +975,33 @@ out: return status; } -static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) +static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { - put_page(desc->page); - desc->page = NULL; + folio_put(desc->folio); + desc->folio = NULL; } static void -nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) +nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - unlock_page(desc->page); - nfs_readdir_page_put(desc); + folio_unlock(desc->folio); + nfs_readdir_folio_put(desc); } -static struct page * -nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) +static struct folio * +nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); u64 cookie = desc->last_cookie; - struct page *page; + struct folio *folio; - page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); - if (!page) + folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); + if (!folio) return NULL; - if (desc->clear_cache && !nfs_readdir_page_needs_filling(page)) - nfs_readdir_page_reinit_array(page, cookie, change_attr); - return page; + if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; } /* @@ -1017,21 +1015,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; - desc->page = nfs_readdir_page_get_cached(desc); - if (!desc->page) + desc->folio = nfs_readdir_folio_get_cached(desc); + if (!desc->folio) return -ENOMEM; - if (nfs_readdir_page_needs_filling(desc->page)) { + if (nfs_readdir_folio_needs_filling(desc->folio)) { /* Grow the dtsize if we had to go back for more pages */ - if (desc->page_index == desc->page_index_max) + if (desc->folio_index == desc->folio_index_max) nfs_grow_dtsize(desc); - desc->page_index_max = desc->page_index; + desc->folio_index_max = desc->folio_index; trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, desc->last_cookie, - desc->page->index, desc->dtsize); + desc->folio->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, - &desc->page, 1); + &desc->folio, 1); if (res < 0) { - nfs_readdir_page_unlock_and_put_cached(desc); + nfs_readdir_folio_unlock_and_put_cached(desc); trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); @@ -1059,7 +1057,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) res = nfs_readdir_search_array(desc); if (res == 0) return 0; - nfs_readdir_page_unlock_and_put_cached(desc); + nfs_readdir_folio_unlock_and_put_cached(desc); return res; } @@ -1087,7 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, unsigned int i; bool first_emit = !desc->dir_cookie; - array = kmap_local_page(desc->page); + array = kmap_local_folio(desc->folio, 0); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -1114,7 +1112,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, break; } } - if (array->page_is_eof) + if (array->folio_is_eof) desc->eof = !desc->eob; kunmap_local(array); @@ -1136,7 +1134,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page **arrays; + struct folio **arrays; size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status = -ENOMEM; @@ -1147,14 +1145,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); if (!arrays) goto out; - arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL); if (!arrays[0]) goto out; - desc->page_index = 0; + desc->folio_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; - desc->page_index_max = 0; + desc->folio_index_max = 0; trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, -1, desc->dtsize); @@ -1166,10 +1164,10 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { - desc->page = arrays[i]; + desc->folio = arrays[i]; nfs_do_filldir(desc, verf); } - desc->page = NULL; + desc->folio = NULL; /* * Grow the dtsize if we have to go back for more pages, @@ -1179,16 +1177,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) if (!desc->eob) nfs_grow_dtsize(desc); else if (desc->buffer_fills == 1 && - i < (desc->page_index_max >> 1)) + i < (desc->folio_index_max >> 1)) nfs_shrink_dtsize(desc); } out_free: for (i = 0; i < sz && arrays[i]; i++) - nfs_readdir_page_array_free(arrays[i]); + nfs_readdir_folio_array_free(arrays[i]); out: if (!nfs_readdir_use_cookie(desc->file)) nfs_readdir_rewind_search(desc); - desc->page_index_max = -1; + desc->folio_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; @@ -1240,11 +1238,11 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out; desc->file = file; desc->ctx = ctx; - desc->page_index_max = -1; + desc->folio_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; - desc->page_index = dir_ctx->page_index; + desc->folio_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; @@ -1291,8 +1289,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; nfs_do_filldir(desc, nfsi->cookieverf); - nfs_readdir_page_unlock_and_put_cached(desc); - if (desc->page_index == desc->page_index_max) + nfs_readdir_folio_unlock_and_put_cached(desc); + if (desc->folio_index == desc->folio_index_max) desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); @@ -1300,7 +1298,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dir_cookie = desc->dir_cookie; dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; - dir_ctx->page_index = desc->page_index; + dir_ctx->page_index = desc->folio_index; dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; dir_ctx->dtsize = desc->dtsize; diff --git a/fs/nfs/export.c b/fs/nfs/export.c index d6a6d1ebb8fd..be686b8e0c54 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -149,7 +149,10 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, - .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| - EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + .flags = EXPORT_OP_NOWCC | + EXPORT_OP_NOSUBTREECHK | + EXPORT_OP_CLOSE_BEFORE_UNLINK | + EXPORT_OP_REMOTE_FS | + EXPORT_OP_NOATOMIC_ATTR | + EXPORT_OP_FLUSH_ON_CLOSE, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2474cbc30712..f0edf5a36237 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -328,8 +328,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, start: folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); *pagep = &folio->page; ret = nfs_flush_incompatible(file, folio); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ea5f2976dfab..8c35d88a84b1 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/iversion.h> +#include <linux/xarray.h> +#include <linux/fscache.h> +#include <linux/netfs.h> #include "internal.h" #include "iostat.h" @@ -163,13 +166,14 @@ void nfs_fscache_init_inode(struct inode *inode) struct nfs_server *nfss = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); - nfsi->fscache = NULL; + netfs_inode(inode)->cache = NULL; if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; nfs_fscache_update_auxdata(&auxdata, inode); - nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + netfs_inode(inode)->cache = fscache_acquire_cookie( + nfss->fscache, 0, nfsi->fh.data, /* index_key */ nfsi->fh.size, @@ -183,11 +187,8 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfs_i_fscache(inode); - - fscache_relinquish_cookie(cookie, false); - nfsi->fscache = NULL; + fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false); + netfs_inode(inode)->cache = NULL; } /* @@ -212,7 +213,7 @@ void nfs_fscache_clear_inode(struct inode *inode) void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); bool open_for_write = inode_is_open_for_write(inode); if (!fscache_cookie_valid(cookie)) @@ -230,115 +231,160 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file); void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); loff_t i_size = i_size_read(inode); nfs_fscache_update_auxdata(&auxdata, inode); fscache_unuse_cookie(cookie, &auxdata, &i_size); } -/* - * Fallback page reading interface. - */ -static int fscache_fallback_read_page(struct inode *inode, struct page *page) +int nfs_netfs_read_folio(struct file *file, struct folio *folio) { - struct netfs_cache_resources cres; - struct fscache_cookie *cookie = nfs_i_fscache(inode); - struct iov_iter iter; - struct bio_vec bvec; - int ret; - - memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); - - ret = fscache_begin_read_operation(&cres, cookie); - if (ret < 0) - return ret; - - ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, - NULL, NULL); - fscache_end_operation(&cres); - return ret; + if (!netfs_inode(folio_inode(folio))->cache) + return -ENOBUFS; + + return netfs_read_folio(file, folio); } -/* - * Fallback page writing interface. - */ -static int fscache_fallback_write_page(struct inode *inode, struct page *page, - bool no_space_allocated_yet) +int nfs_netfs_readahead(struct readahead_control *ractl) { - struct netfs_cache_resources cres; - struct fscache_cookie *cookie = nfs_i_fscache(inode); - struct iov_iter iter; - struct bio_vec bvec; - loff_t start = page_offset(page); - size_t len = PAGE_SIZE; - int ret; - - memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); - - ret = fscache_begin_write_operation(&cres, cookie); - if (ret < 0) - return ret; - - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), - no_space_allocated_yet); - if (ret == 0) - ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); - fscache_end_operation(&cres); - return ret; + struct inode *inode = ractl->mapping->host; + + if (!netfs_inode(inode)->cache) + return -ENOBUFS; + + netfs_readahead(ractl); + return 0; } -/* - * Retrieve a page from fscache - */ -int __nfs_fscache_read_page(struct inode *inode, struct page *page) +static atomic_t nfs_netfs_debug_id; +static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { - int ret; + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); + rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); - trace_nfs_fscache_read_page(inode, page); - if (PageChecked(page)) { - ClearPageChecked(page); - ret = 1; - goto out; - } + return 0; +} - ret = fscache_fallback_read_page(inode, page); - if (ret < 0) { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - SetPageChecked(page); - goto out; - } +static void nfs_netfs_free_request(struct netfs_io_request *rreq) +{ + put_nfs_open_context(rreq->netfs_priv); +} - /* Read completed synchronously */ - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); - SetPageUptodate(page); - ret = 0; -out: - trace_nfs_fscache_read_page_exit(inode, page, ret); - return ret; +static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) +{ + return fscache_begin_read_operation(&rreq->cache_resources, + netfs_i_cookie(netfs_inode(rreq->inode))); } -/* - * Store a newly fetched page in fscache. We can be certain there's no page - * stored in the cache as yet otherwise we would've read it from there. - */ -void __nfs_fscache_write_page(struct inode *inode, struct page *page) +static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) { - int ret; + struct nfs_netfs_io_data *netfs; + + netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT); + if (!netfs) + return NULL; + netfs->sreq = sreq; + refcount_set(&netfs->refcount, 1); + return netfs; +} - trace_nfs_fscache_write_page(inode, page); +static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) +{ + size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; - ret = fscache_fallback_write_page(inode, page, true); + sreq->len = min(sreq->len, rsize); + return true; +} - if (ret != 0) { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); - } else { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); +static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) +{ + struct nfs_netfs_io_data *netfs; + struct nfs_pageio_descriptor pgio; + struct inode *inode = sreq->rreq->inode; + struct nfs_open_context *ctx = sreq->rreq->netfs_priv; + struct page *page; + int err; + pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + pgoff_t last = ((sreq->start + sreq->len - + sreq->transferred - 1) >> PAGE_SHIFT); + XA_STATE(xas, &sreq->rreq->mapping->i_pages, start); + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + netfs = nfs_netfs_alloc(sreq); + if (!netfs) + return netfs_subreq_terminated(sreq, -ENOMEM, false); + + pgio.pg_netfs = netfs; /* used in completion */ + + xas_lock(&xas); + xas_for_each(&xas, page, last) { + /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */ + xas_pause(&xas); + xas_unlock(&xas); + err = nfs_read_add_folio(&pgio, ctx, page_folio(page)); + if (err < 0) { + netfs->error = err; + goto out; + } + xas_lock(&xas); } - trace_nfs_fscache_write_page_exit(inode, page, ret); + xas_unlock(&xas); +out: + nfs_pageio_complete_read(&pgio); + nfs_netfs_put(netfs); +} + +void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) +{ + struct nfs_netfs_io_data *netfs = hdr->netfs; + + if (!netfs) + return; + + nfs_netfs_get(netfs); +} + +int nfs_netfs_folio_unlock(struct folio *folio) +{ + struct inode *inode = folio_file_mapping(folio)->host; + + /* + * If fscache is enabled, netfs will unlock pages. + */ + if (netfs_inode(inode)->cache) + return 0; + + return 1; } + +void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) +{ + struct nfs_netfs_io_data *netfs = hdr->netfs; + struct netfs_io_subrequest *sreq; + + if (!netfs) + return; + + sreq = netfs->sreq; + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); + + if (hdr->error) + netfs->error = hdr->error; + else + atomic64_add(hdr->res.count, &netfs->transferred); + + nfs_netfs_put(netfs); + hdr->netfs = NULL; +} + +const struct netfs_request_ops nfs_netfs_ops = { + .init_request = nfs_netfs_init_request, + .free_request = nfs_netfs_free_request, + .begin_cache_operation = nfs_netfs_begin_cache_operation, + .issue_read = nfs_netfs_issue_read, + .clamp_length = nfs_netfs_clamp_length +}; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 2a37af880978..e1706e736c64 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -34,6 +34,58 @@ struct nfs_fscache_inode_auxdata { u64 change_attr; }; +struct nfs_netfs_io_data { + /* + * NFS may split a netfs_io_subrequest into multiple RPCs, each + * with their own read completion. In netfs, we can only call + * netfs_subreq_terminated() once for each subrequest. Use the + * refcount here to double as a marker of the last RPC completion, + * and only call netfs via netfs_subreq_terminated() once. + */ + refcount_t refcount; + struct netfs_io_subrequest *sreq; + + /* + * Final disposition of the netfs_io_subrequest, sent in + * netfs_subreq_terminated() + */ + atomic64_t transferred; + int error; +}; + +static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) +{ + refcount_inc(&netfs->refcount); +} + +static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) +{ + ssize_t final_len; + + /* Only the last RPC completion should call netfs_subreq_terminated() */ + if (!refcount_dec_and_test(&netfs->refcount)) + return; + + /* + * The NFS pageio interface may read a complete page, even when netfs + * only asked for a partial page. Specifically, this may be seen when + * one thread is truncating a file while another one is reading the last + * page of the file. + * Correct the final length here to be no larger than the netfs subrequest + * length, and thus avoid netfs's "Subreq overread" warning message. + */ + final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); + netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + kfree(netfs); +} +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) +{ + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); +} +extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); +extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); +extern int nfs_netfs_folio_unlock(struct folio *folio); + /* * fscache.c */ @@ -44,9 +96,8 @@ extern void nfs_fscache_init_inode(struct inode *); extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); extern void nfs_fscache_release_file(struct inode *, struct file *); - -extern int __nfs_fscache_read_page(struct inode *, struct page *); -extern void __nfs_fscache_write_page(struct inode *, struct page *); +extern int nfs_netfs_readahead(struct readahead_control *ractl); +extern int nfs_netfs_read_folio(struct file *file, struct folio *folio); static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) { @@ -54,34 +105,11 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); - fscache_note_page_release(nfs_i_fscache(folio->mapping->host)); - nfs_inc_fscache_stats(folio->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); } + fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host))); return true; } -/* - * Retrieve a page from an inode data storage object. - */ -static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) -{ - if (nfs_i_fscache(inode)) - return __nfs_fscache_read_page(inode, page); - return -ENOBUFS; -} - -/* - * Store a page newly fetched from the server in an inode data storage object - * in the cache. - */ -static inline void nfs_fscache_write_page(struct inode *inode, - struct page *page) -{ - if (nfs_i_fscache(inode)) - __nfs_fscache_write_page(inode, page); -} - static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, struct inode *inode) { @@ -101,13 +129,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * static inline void nfs_fscache_invalidate(struct inode *inode, int flags) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs); - if (nfsi->fscache) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_invalidate(nfsi->fscache, &auxdata, - i_size_read(inode), flags); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags); } /* @@ -120,7 +145,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server) return "no "; } +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) +{ + hdr->netfs = desc->pg_netfs; +} +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + desc->pg_netfs = hdr->netfs; +} +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) +{ + desc->pg_netfs = NULL; +} #else /* CONFIG_NFS_FSCACHE */ +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {} +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {} +static inline int nfs_netfs_folio_unlock(struct folio *folio) +{ + return 1; +} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode(struct inode *inode) {} @@ -128,22 +174,29 @@ static inline void nfs_fscache_clear_inode(struct inode *inode) {} static inline void nfs_fscache_open_file(struct inode *inode, struct file *filp) {} static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {} - -static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) +static inline int nfs_netfs_readahead(struct readahead_control *ractl) { - return true; /* may release folio */ + return -ENOBUFS; } -static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) +static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio) { return -ENOBUFS; } -static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {} + +static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) +{ + return true; /* may release folio */ +} static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { return "no "; } - +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) {} +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {} #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 97a76706fd54..a910b9a638c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfsi->cache_validity |= flags; - if (inode->i_mapping->nrpages == 0) - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA | - NFS_INO_DATA_INVAL_DEFER); - else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; + if (inode->i_mapping->nrpages == 0) { + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(nfsi); + } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + nfs_ooo_clear(nfsi); + } trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); @@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) trace_nfs_size_truncate(inode, offset); i_size_write(inode, offset); /* Optimisation */ - if (offset == 0) - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA | - NFS_INO_DATA_INVAL_DEFER); + if (offset == 0) { + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(NFS_I(inode)); + } NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); @@ -1107,7 +1109,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && - (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfs_ooo_test(nfsi)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); @@ -1351,8 +1353,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= - ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(nfsi); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1814,6 +1816,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, return 0; } +static void nfs_ooo_merge(struct nfs_inode *nfsi, + u64 start, u64 end) +{ + int i, cnt; + + if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) + /* No point merging anything */ + return; + + if (!nfsi->ooo) { + nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC); + if (!nfsi->ooo) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + return; + } + nfsi->ooo->cnt = 0; + } + + /* add this range, merging if possible */ + cnt = nfsi->ooo->cnt; + for (i = 0; i < cnt; i++) { + if (end == nfsi->ooo->gap[i].start) + end = nfsi->ooo->gap[i].end; + else if (start == nfsi->ooo->gap[i].end) + start = nfsi->ooo->gap[i].start; + else + continue; + /* Remove 'i' from table and loop to insert the new range */ + cnt -= 1; + nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt]; + i = -1; + } + if (start != end) { + if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + kfree(nfsi->ooo); + nfsi->ooo = NULL; + return; + } + nfsi->ooo->gap[cnt].start = start; + nfsi->ooo->gap[cnt].end = end; + cnt += 1; + } + nfsi->ooo->cnt = cnt; +} + +static void nfs_ooo_record(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + /* This reply was out-of-order, so record in the + * pre/post change id, possibly cancelling + * gaps created when iversion was jumpped forward. + */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) + nfs_ooo_merge(nfsi, + fattr->change_attr, + fattr->pre_change_attr); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { @@ -1824,8 +1886,12 @@ static int nfs_refresh_inode_locked(struct inode *inode, if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); - else if (attr_cmp == 0) - ret = nfs_check_inode_attributes(inode, fattr); + else { + nfs_ooo_record(NFS_I(inode), fattr); + + if (attr_cmp == 0) + ret = nfs_check_inode_attributes(inode, fattr); + } trace_nfs_refresh_inode_exit(inode, ret); return ret; @@ -1916,6 +1982,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa if (attr_cmp < 0) return 0; if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { + /* Record the pre/post change info before clearing PRECHANGE */ + nfs_ooo_record(NFS_I(inode), fattr); fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME @@ -2070,6 +2138,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 && + nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) { + /* There is one remaining gap that hasn't been + * merged into iversion - do that now. + */ + inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start); + kfree(nfsi->ooo); + nfsi->ooo = NULL; + } if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ if (!(have_writers || have_delegation)) { @@ -2091,8 +2168,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - } else if (!have_delegation) - nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + } else if (!have_delegation) { + nfs_ooo_record(nfsi, fattr); + nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode), + fattr->change_attr); + } inode_set_iversion_raw(inode, fattr->change_attr); } } else { @@ -2246,18 +2326,22 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; + nfsi->ooo = NULL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_2 nfsi->xattr_cache = NULL; #endif + nfs_netfs_inode_init(nfsi); + return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); void nfs_free_inode(struct inode *inode) { + kfree(NFS_I(inode)->ooo); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } EXPORT_SYMBOL_GPL(nfs_free_inode); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a65fe2a63ab..3cc027d3bd58 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -452,6 +452,10 @@ extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); +#ifdef CONFIG_NFS_FSCACHE +extern const struct netfs_request_ops nfs_netfs_ops; +#endif + /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); @@ -481,9 +485,14 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ +extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio); +extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); @@ -846,27 +855,12 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) } #ifdef CONFIG_CRC32 -/** - * nfs_fhandle_hash - calculate the crc32 hash for the filehandle - * @fh - pointer to filehandle - * - * returns a crc32 hash for the filehandle that is compatible with - * the one displayed by "wireshark". - */ -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); -} static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } #else -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return 0; -} static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) { return 0; diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 2ddaab1ac653..5aa776b5a3e7 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -17,9 +17,6 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; -#ifdef CONFIG_NFS_FSCACHE - unsigned long long fscache[__NFSIOS_FSCACHEMAX]; -#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -49,20 +46,6 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -#ifdef CONFIG_NFS_FSCACHE -static inline void nfs_add_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat, - long addend) -{ - this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); -} -static inline void nfs_inc_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat) -{ - this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]); -} -#endif - static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 349cc4f60a28..18d8f6529f61 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -21,9 +21,8 @@ static void nfs3_prepare_get_acl(struct posix_acl **p) { struct posix_acl *sentinel = uncached_acl_sentinel(current); - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) { - /* Not the first reader or sentinel already in place. */ - } + /* If the ACL isn't being read yet, set our sentinel. */ + cmpxchg(p, ACL_NOT_CACHED, sentinel); } static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index d80ee88ca996..a6df815a140c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1122,7 +1122,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) uint32_t segments; struct read_plus_segment *segs; int status, i; - char scratch_buf[16]; __be32 *p; status = decode_op_hdr(xdr, OP_READ_PLUS); @@ -1143,7 +1142,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (!segs) return -ENOMEM; - xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf)); status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); @@ -1348,6 +1346,8 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; + xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5607b1e2b821..18f25ff4bff7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5439,6 +5439,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + if (hdr->res.scratch) + kfree(hdr->res.scratch); if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5452,17 +5454,22 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } #if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS -static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, struct rpc_message *msg) { /* Note: We don't use READ_PLUS with pNFS yet */ - if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) + if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; + hdr->res.scratch = kmalloc(32, GFP_KERNEL); + return hdr->res.scratch != NULL; + } + return false; } #else -static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, struct rpc_message *msg) { + return false; } #endif /* CONFIG_NFS_V4_2 */ @@ -5472,8 +5479,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs42_read_plus_support(hdr, msg); + if (!nfs42_read_plus_support(hdr, msg)) + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2a0ca5c7f082..bbe49315d99e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -67,6 +67,8 @@ #define OPENOWNER_POOL_SIZE 8 +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp); + const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, @@ -330,6 +332,8 @@ do_confirm: status = nfs4_proc_create_session(clp, cred); if (status != 0) goto out; + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)) + nfs4_state_start_reclaim_reboot(clp); nfs41_finish_session_reset(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -1205,10 +1209,6 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; - struct rpc_clnt *cl = clp->cl_rpcclient; - - while (cl != cl->cl_parent) - cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index c394e4447100..e776200e9a11 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -37,27 +37,10 @@ static struct ctl_table nfs4_cb_sysctls[] = { { } }; -static struct ctl_table nfs4_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs4_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs4_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs4_cb_sysctl_dir, - }, - { } -}; - int nfs4_register_sysctl(void) { - nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + nfs4_callback_sysctl_table = register_sysctl("fs/nfs", + nfs4_cb_sysctls); if (nfs4_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a778713343df..4e90ca531176 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ - { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ @@ -1243,96 +1242,6 @@ TRACE_EVENT(nfs_readpage_short, ) ); -DECLARE_EVENT_CLASS(nfs_fscache_page_event, - TP_PROTO( - const struct inode *inode, - struct page *page - ), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field(dev_t, dev) - __field(u32, fhandle) - __field(u64, fileid) - __field(loff_t, offset) - ), - - TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); - const struct nfs_fh *fh = &nfsi->fh; - - __entry->offset = page_index(page) << PAGE_SHIFT; - __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; - __entry->fhandle = nfs_fhandle_hash(fh); - ), - - TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long long)__entry->fileid, - __entry->fhandle, - (long long)__entry->offset - ) -); -DECLARE_EVENT_CLASS(nfs_fscache_page_event_done, - TP_PROTO( - const struct inode *inode, - struct page *page, - int error - ), - - TP_ARGS(inode, page, error), - - TP_STRUCT__entry( - __field(int, error) - __field(dev_t, dev) - __field(u32, fhandle) - __field(u64, fileid) - __field(loff_t, offset) - ), - - TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); - const struct nfs_fh *fh = &nfsi->fh; - - __entry->offset = page_index(page) << PAGE_SHIFT; - __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; - __entry->fhandle = nfs_fhandle_hash(fh); - __entry->error = error; - ), - - TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld error=%d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long long)__entry->fileid, - __entry->fhandle, - (long long)__entry->offset, __entry->error - ) -); -#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \ - DEFINE_EVENT(nfs_fscache_page_event, name, \ - TP_PROTO( \ - const struct inode *inode, \ - struct page *page \ - ), \ - TP_ARGS(inode, page)) -#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \ - DEFINE_EVENT(nfs_fscache_page_event_done, name, \ - TP_PROTO( \ - const struct inode *inode, \ - struct page *page, \ - int error \ - ), \ - TP_ARGS(inode, page, error)) -DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page); -DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit); -DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page); -DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit); TRACE_EVENT(nfs_pgio_error, TP_PROTO( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 64fa8de199de..6efb5068c116 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,6 +26,7 @@ #include "internal.h" #include "pnfs.h" #include "nfstrace.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -105,6 +106,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->good_bytes = mirror->pg_count; hdr->io_completion = desc->pg_io_completion; hdr->dreq = desc->pg_dreq; + nfs_netfs_set_pgio_header(hdr, desc); hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) @@ -941,6 +943,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_io_completion = NULL; desc->pg_dreq = NULL; + nfs_netfs_reset_pageio_descriptor(desc); desc->pg_bsize = bsize; desc->pg_mirror_count = 1; @@ -1477,6 +1480,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, desc->pg_io_completion = hdr->io_completion; desc->pg_dreq = hdr->dreq; + nfs_netfs_set_pageio_descriptor(desc, hdr); list_splice_init(&hdr->pages, &pages); while (!list_empty(&pages)) { struct nfs_page *req = nfs_list_entry(pages.next); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e90988591df4..f71eeee67e20 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -31,7 +31,7 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; @@ -74,7 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) +void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *pgm; unsigned long npages; @@ -110,28 +110,17 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_readpage_release(struct nfs_page *req, int error) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); struct folio *folio = nfs_page_to_folio(req); - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) folio_set_error(folio); - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (folio_test_uptodate(folio)) - nfs_fscache_write_page(inode, &folio->page); - folio_unlock(folio); - } + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); + nfs_release_request(req); } -struct nfs_readdesc { - struct nfs_pageio_descriptor pgio; - struct nfs_open_context *ctx; -}; - static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) @@ -153,7 +142,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in readpage_async_filler */ + * request are zeroed in nfs_read_add_folio + */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -181,6 +171,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); nfs_readpage_release(req, error); } + nfs_netfs_read_completion(hdr); + out: hdr->release(hdr); } @@ -191,6 +183,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, struct rpc_task_setup *task_setup_data, int how) { rpc_ops->read_setup(hdr, msg); + nfs_netfs_initiate_read(hdr); trace_nfs_initiate_read(hdr); } @@ -206,7 +199,7 @@ nfs_async_read_error(struct list_head *head, int error) } } -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .error_cleanup = nfs_async_read_error, .completion = nfs_read_completion, }; @@ -281,7 +274,9 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } -static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) +int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_server *server = NFS_SERVER(inode); @@ -297,29 +292,21 @@ static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize); - if (!IS_SYNC(inode)) { - error = nfs_fscache_read_page(inode, &folio->page); - if (error == 0) - goto out_unlock; + new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len); + if (IS_ERR(new)) { + error = PTR_ERR(new); + goto out; } - new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len); - if (IS_ERR(new)) - goto out_error; - if (len < fsize) folio_zero_segment(folio, len, fsize); - if (!nfs_pageio_add_request(&desc->pgio, new)) { + if (!nfs_pageio_add_request(pgio, new)) { nfs_list_remove_request(new); - error = desc->pgio.pg_error; + error = pgio->pg_error; nfs_readpage_release(new, error); goto out; } return 0; -out_error: - error = PTR_ERR(new); -out_unlock: - folio_unlock(folio); out: return error; } @@ -332,8 +319,9 @@ out: */ int nfs_read_folio(struct file *file, struct folio *folio) { - struct nfs_readdesc desc; struct inode *inode = file_inode(file); + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; int ret; trace_nfs_aop_readpage(inode, folio); @@ -357,38 +345,43 @@ int nfs_read_folio(struct file *file, struct folio *folio) if (NFS_STALE(inode)) goto out_unlock; - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + ret = nfs_netfs_read_folio(file, folio); + if (!ret) + goto out; + + ctx = get_nfs_open_context(nfs_file_open_context(file)); - xchg(&desc.ctx->error, 0); - nfs_pageio_init_read(&desc.pgio, inode, false, + xchg(&ctx->error, 0); + nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - ret = readpage_async_filler(&desc, folio); + ret = nfs_read_add_folio(&pgio, ctx, folio); if (ret) - goto out; + goto out_put; - nfs_pageio_complete_read(&desc.pgio); - ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; + nfs_pageio_complete_read(&pgio); + ret = pgio.pg_error < 0 ? pgio.pg_error : 0; if (!ret) { ret = folio_wait_locked_killable(folio); if (!folio_test_uptodate(folio) && !ret) - ret = xchg(&desc.ctx->error, 0); + ret = xchg(&ctx->error, 0); } +out_put: + put_nfs_open_context(ctx); out: - put_nfs_open_context(desc.ctx); trace_nfs_aop_readpage_done(inode, folio, ret); return ret; out_unlock: folio_unlock(folio); - trace_nfs_aop_readpage_done(inode, folio, ret); - return ret; + goto out; } void nfs_readahead(struct readahead_control *ractl) { + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; unsigned int nr_pages = readahead_count(ractl); struct file *file = ractl->file; - struct nfs_readdesc desc; struct inode *inode = ractl->mapping->host; struct folio *folio; int ret; @@ -401,26 +394,30 @@ void nfs_readahead(struct readahead_control *ractl) if (NFS_STALE(inode)) goto out; + ret = nfs_netfs_readahead(ractl); + if (!ret) + goto out; + if (file == NULL) { ret = -EBADF; - desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (desc.ctx == NULL) + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + ctx = get_nfs_open_context(nfs_file_open_context(file)); - nfs_pageio_init_read(&desc.pgio, inode, false, + nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); while ((folio = readahead_folio(ractl)) != NULL) { - ret = readpage_async_filler(&desc, folio); + ret = nfs_read_add_folio(&pgio, ctx, folio); if (ret) break; } - nfs_pageio_complete_read(&desc.pgio); + nfs_pageio_complete_read(&pgio); - put_nfs_open_context(desc.ctx); + put_nfs_open_context(ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 397c096d874e..30e53e93049e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -692,10 +692,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; -#ifdef CONFIG_NFS_FSCACHE - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - totals.fscache[i] += stats->fscache[i]; -#endif preempt_enable(); } @@ -706,13 +702,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_puts(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); -#ifdef CONFIG_NFS_FSCACHE - if (nfss->options & NFS_OPTION_FSCACHE) { - seq_puts(m, "\n\tfsc:\t"); - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - seq_printf(m, "%Lu ", totals.fscache[i]); - } -#endif seq_putc(m, '\n'); rpc_clnt_show_stats(m, nfss->client); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 7aea195ddb35..f39e2089bc4c 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,27 +32,9 @@ static struct ctl_table nfs_cb_sysctls[] = { { } }; -static struct ctl_table nfs_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs_cb_sysctl_dir, - }, - { } -}; - int nfs_register_sysctl(void) { - nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root); + nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls); if (nfs_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c index 7c1509e968c8..832246b22c51 100644 --- a/fs/nfs_common/nfs_ssc.c +++ b/fs/nfs_common/nfs_ssc.c @@ -12,7 +12,6 @@ #include <linux/nfs_ssc.h> #include "../nfs/nfs4_fs.h" -MODULE_LICENSE("GPL"); struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl; EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 668c7527b17e..ae85257b4238 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -123,11 +123,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) /* OK, we seem to have a valid key */ key.h.flags = 0; - key.h.expiry_time = get_expiry(&mesg); - if (key.h.expiry_time == 0) + err = get_expiry(&mesg, &key.h.expiry_time); + if (err) goto out; - key.ek_client = dom; + key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); @@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } return 0; - } #ifdef CONFIG_NFSD_V4 @@ -546,6 +545,29 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) +{ + unsigned int i, mode, listsize; + int err; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > NFSEXP_XPRTSEC_NUM) + return -EINVAL; + + exp->ex_xprtsec_modes = 0; + for (i = 0; i < listsize; i++) { + err = get_uint(mesg, &mode); + if (err) + return err; + if (mode > NFSEXP_XPRTSEC_MTLS) + return -EINVAL; + exp->ex_xprtsec_modes |= mode; + } + return 0; +} + static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { @@ -608,11 +630,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; + exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ - err = -EINVAL; - exp.h.expiry_time = get_expiry(&mesg); - if (exp.h.expiry_time == 0) + err = get_expiry(&mesg, &exp.h.expiry_time); + if (err) goto out3; /* flags */ @@ -624,7 +646,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err || an_int < 0) goto out3; exp.ex_flags= an_int; - + /* anon uid */ err = get_int(&mesg, &an_int); if (err) @@ -650,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); + else if (strcmp(buf, "xprtsec") == 0) + err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set @@ -663,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; + /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the @@ -824,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } + new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) @@ -1035,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd, __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f; - struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + goto ok; + } + goto denied; +ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return 0; @@ -1062,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) if (nfsd4_spo_must_allow(rqstp)) return 0; +denied: return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index d03f7f6a8642..2df8ae25aad3 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -77,6 +77,7 @@ struct svc_export { struct cache_detail *cd; struct rcu_head ex_rcu; struct export_stats ex_stats; + unsigned long ex_xprtsec_modes; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 6e8712bd7c99..ee9c923192e0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -74,70 +74,9 @@ static struct list_lru nfsd_file_lru; static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static struct delayed_work nfsd_filecache_laundrette; -static struct rhashtable nfsd_file_rhash_tbl +static struct rhltable nfsd_file_rhltable ____cacheline_aligned_in_smp; -enum nfsd_file_lookup_type { - NFSD_FILE_KEY_INODE, - NFSD_FILE_KEY_FULL, -}; - -struct nfsd_file_lookup_key { - struct inode *inode; - struct net *net; - const struct cred *cred; - unsigned char need; - bool gc; - enum nfsd_file_lookup_type type; -}; - -/* - * The returned hash value is based solely on the address of an in-code - * inode, a pointer to a slab-allocated object. The entropy in such a - * pointer is concentrated in its middle bits. - */ -static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) -{ - unsigned long ptr = (unsigned long)inode; - u32 k; - - k = ptr >> L1_CACHE_SHIFT; - k &= 0x00ffffff; - return jhash2(&k, 1, seed); -} - -/** - * nfsd_file_key_hashfn - Compute the hash value of a lookup key - * @data: key on which to compute the hash value - * @len: rhash table's key_len parameter (unused) - * @seed: rhash table's random seed of the day - * - * Return value: - * Computed 32-bit hash value - */ -static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nfsd_file_lookup_key *key = data; - - return nfsd_file_inode_hash(key->inode, seed); -} - -/** - * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file - * @data: object on which to compute the hash value - * @len: rhash table's key_len parameter (unused) - * @seed: rhash table's random seed of the day - * - * Return value: - * Computed 32-bit hash value - */ -static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nfsd_file *nf = data; - - return nfsd_file_inode_hash(nf->nf_inode, seed); -} - static bool nfsd_match_cred(const struct cred *c1, const struct cred *c2) { @@ -158,53 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2) return true; } -/** - * nfsd_file_obj_cmpfn - Match a cache item against search criteria - * @arg: search criteria - * @ptr: cache item to check - * - * Return values: - * %0 - Item matches search criteria - * %1 - Item does not match search criteria - */ -static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct nfsd_file_lookup_key *key = arg->key; - const struct nfsd_file *nf = ptr; - - switch (key->type) { - case NFSD_FILE_KEY_INODE: - if (nf->nf_inode != key->inode) - return 1; - break; - case NFSD_FILE_KEY_FULL: - if (nf->nf_inode != key->inode) - return 1; - if (nf->nf_may != key->need) - return 1; - if (nf->nf_net != key->net) - return 1; - if (!nfsd_match_cred(nf->nf_cred, key->cred)) - return 1; - if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc) - return 1; - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) - return 1; - break; - } - return 0; -} - static const struct rhashtable_params nfsd_file_rhash_params = { .key_len = sizeof_field(struct nfsd_file, nf_inode), .key_offset = offsetof(struct nfsd_file, nf_inode), - .head_offset = offsetof(struct nfsd_file, nf_rhash), - .hashfn = nfsd_file_key_hashfn, - .obj_hashfn = nfsd_file_obj_hashfn, - .obj_cmpfn = nfsd_file_obj_cmpfn, - /* Reduce resizing churn on light workloads */ - .min_size = 512, /* buckets */ + .head_offset = offsetof(struct nfsd_file, nf_rlist), + + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. + */ + .min_size = 256, .automatic_shrinking = true, }; @@ -307,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) } static struct nfsd_file * -nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) +nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, + bool want_gc) { struct nfsd_file *nf; nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); - if (nf) { - INIT_LIST_HEAD(&nf->nf_lru); - nf->nf_birthtime = ktime_get(); - nf->nf_file = NULL; - nf->nf_cred = get_current_cred(); - nf->nf_net = key->net; - nf->nf_flags = 0; - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); - if (key->gc) - __set_bit(NFSD_FILE_GC, &nf->nf_flags); - nf->nf_inode = key->inode; - refcount_set(&nf->nf_ref, 1); - nf->nf_may = key->need; - nf->nf_mark = NULL; - } + if (unlikely(!nf)) + return NULL; + + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_birthtime = ktime_get(); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = want_gc ? + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); + nf->nf_inode = inode; + refcount_set(&nf->nf_ref, 1); + nf->nf_may = need; + nf->nf_mark = NULL; return nf; } @@ -352,8 +254,8 @@ static void nfsd_file_hash_remove(struct nfsd_file *nf) { trace_nfsd_file_unhash(nf); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); } static bool @@ -380,10 +282,8 @@ nfsd_file_free(struct nfsd_file *nf) if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { - get_file(nf->nf_file); - filp_close(nf->nf_file, NULL); nfsd_file_check_write_error(nf); - fput(nf->nf_file); + filp_close(nf->nf_file, NULL); } /* @@ -402,13 +302,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf) struct file *file = nf->nf_file; struct address_space *mapping; - if (!file || !(file->f_mode & FMODE_WRITE)) + /* File not open for write? */ + if (!(file->f_mode & FMODE_WRITE)) + return false; + + /* + * Some filesystems (e.g. NFS) flush all dirty data on close. + * On others, there is no need to wait for writeback. + */ + if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE)) return false; + mapping = file->f_mapping; return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } + static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); @@ -492,49 +402,26 @@ nfsd_file_dispose_list(struct list_head *dispose) } } -static void -nfsd_file_list_remove_disposal(struct list_head *dst, - struct nfsd_fcache_disposal *l) -{ - spin_lock(&l->lock); - list_splice_init(&l->freeme, dst); - spin_unlock(&l->lock); -} - -static void -nfsd_file_list_add_disposal(struct list_head *files, struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfsd_fcache_disposal *l = nn->fcache_disposal; - - spin_lock(&l->lock); - list_splice_tail_init(files, &l->freeme); - spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); -} - -static void -nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src, - struct net *net) -{ - struct nfsd_file *nf, *tmp; - - list_for_each_entry_safe(nf, tmp, src, nf_lru) { - if (nf->nf_net == net) - list_move_tail(&nf->nf_lru, dst); - } -} - +/** + * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list + * @dispose: list of nfsd_files to be disposed + * + * Transfers each file to the "freeme" list for its nfsd_net, to eventually + * be disposed of by the per-net garbage collector. + */ static void nfsd_file_dispose_list_delayed(struct list_head *dispose) { - LIST_HEAD(list); - struct nfsd_file *nf; - while(!list_empty(dispose)) { - nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - nfsd_file_list_add_pernet(&list, dispose, nf->nf_net); - nfsd_file_list_add_disposal(&list, nf->nf_net); + struct nfsd_file *nf = list_first_entry(dispose, + struct nfsd_file, nf_lru); + struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + spin_lock(&l->lock); + list_move_tail(&nf->nf_lru, &l->freeme); + spin_unlock(&l->lock); + queue_work(nfsd_filecache_wq, &l->work); } } @@ -678,8 +565,8 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) * @inode: inode on which to close out nfsd_files * @dispose: list on which to gather nfsd_files to close out * - * An nfsd_file represents a struct file being held open on behalf of nfsd. An - * open file however can block other activity (such as leases), or cause + * An nfsd_file represents a struct file being held open on behalf of nfsd. + * An open file however can block other activity (such as leases), or cause * undesirable behavior (e.g. spurious silly-renames when reexporting NFS). * * This function is intended to find open nfsd_files when this sort of @@ -692,20 +579,17 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) static void nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_INODE, - .inode = inode, - }; + struct rhlist_head *tmp, *list; struct nfsd_file *nf; rcu_read_lock(); - do { - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); - if (!nf) - break; + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (!test_bit(NFSD_FILE_GC, &nf->nf_flags)) + continue; nfsd_file_cond_queue(nf, dispose); - } while (1); + } rcu_read_unlock(); } @@ -758,8 +642,8 @@ nfsd_file_close_inode_sync(struct inode *inode) * nfsd_file_delayed_close - close unused nfsd_files * @work: dummy * - * Walk the LRU list and destroy any entries that have not been used since - * the last scan. + * Scrape the freeme list for this nfsd_net, and then dispose of them + * all. */ static void nfsd_file_delayed_close(struct work_struct *work) @@ -768,7 +652,10 @@ nfsd_file_delayed_close(struct work_struct *work) struct nfsd_fcache_disposal *l = container_of(work, struct nfsd_fcache_disposal, work); - nfsd_file_list_remove_disposal(&head, l); + spin_lock(&l->lock); + list_splice_init(&l->freeme, &head); + spin_unlock(&l->lock); + nfsd_file_dispose_list(&head); } @@ -829,7 +716,7 @@ nfsd_file_cache_init(void) if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) return 0; - ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); if (ret) return ret; @@ -897,7 +784,7 @@ out_err: nfsd_file_mark_slab = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; - rhashtable_destroy(&nfsd_file_rhash_tbl); + rhltable_destroy(&nfsd_file_rhltable); goto out; } @@ -906,7 +793,8 @@ out_err: * @net: net-namespace to shut down the cache (may be NULL) * * Walk the nfsd_file cache and close out any that match @net. If @net is NULL, - * then close out everything. Called when an nfsd instance is being shut down. + * then close out everything. Called when an nfsd instance is being shut down, + * and when the exports table is flushed. */ static void __nfsd_file_cache_purge(struct net *net) @@ -915,7 +803,7 @@ __nfsd_file_cache_purge(struct net *net) struct nfsd_file *nf; LIST_HEAD(dispose); - rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); + rhltable_walk_enter(&nfsd_file_rhltable, &iter); do { rhashtable_walk_start(&iter); @@ -1021,7 +909,7 @@ nfsd_file_cache_shutdown(void) nfsd_file_mark_slab = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; - rhashtable_destroy(&nfsd_file_rhash_tbl); + rhltable_destroy(&nfsd_file_rhltable); for_each_possible_cpu(i) { per_cpu(nfsd_file_cache_hits, i) = 0; @@ -1032,6 +920,35 @@ nfsd_file_cache_shutdown(void) } } +static struct nfsd_file * +nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, + struct inode *inode, unsigned char need, + bool want_gc) +{ + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; + + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (nf->nf_may != need) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, cred)) + continue; + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) + continue; + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) + continue; + + if (!nfsd_file_get(nf)) + continue; + return nf; + } + return NULL; +} + /** * nfsd_file_is_cached - are there any cached open files for this inode? * @inode: inode to check @@ -1046,15 +963,20 @@ nfsd_file_cache_shutdown(void) bool nfsd_file_is_cached(struct inode *inode) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_INODE, - .inode = inode, - }; + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; bool ret = false; - if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params) != NULL) - ret = true; + rcu_read_lock(); + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) { + ret = true; + break; + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, (int)ret); return ret; } @@ -1064,14 +986,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct file *file, struct nfsd_file **pnf, bool want_gc) { - struct nfsd_file_lookup_key key = { - .type = NFSD_FILE_KEY_FULL, - .need = may_flags & NFSD_FILE_MAY_MASK, - .net = SVC_NET(rqstp), - .gc = want_gc, - }; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + struct net *net = SVC_NET(rqstp); + struct nfsd_file *new, *nf; + const struct cred *cred; bool open_retry = true; - struct nfsd_file *nf; + struct inode *inode; __be32 status; int ret; @@ -1079,80 +999,88 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, may_flags|NFSD_MAY_OWNER_OVERRIDE); if (status != nfs_ok) return status; - key.inode = d_inode(fhp->fh_dentry); - key.cred = get_current_cred(); + inode = d_inode(fhp->fh_dentry); + cred = get_current_cred(); retry: rcu_read_lock(); - nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, - nfsd_file_rhash_params); - nf = nfsd_file_get(nf); + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); rcu_read_unlock(); if (nf) { + /* + * If the nf is on the LRU then it holds an extra reference + * that must be put if it's removed. It had better not be + * the last one however, since we should hold another. + */ if (nfsd_file_lru_remove(nf)) WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref)); goto wait_for_construction; } - nf = nfsd_file_alloc(&key, may_flags); - if (!nf) { + new = nfsd_file_alloc(net, inode, need, want_gc); + if (!new) { status = nfserr_jukebox; - goto out_status; + goto out; } - ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl, - &key, &nf->nf_rhash, - nfsd_file_rhash_params); + rcu_read_lock(); + spin_lock(&inode->i_lock); + nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + if (unlikely(nf)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + nfsd_file_slab_free(&new->nf_rcu); + goto wait_for_construction; + } + nf = new; + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (likely(ret == 0)) goto open_file; - nfsd_file_slab_free(&nf->nf_rcu); - nf = NULL; if (ret == -EEXIST) goto retry; - trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret); + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); status = nfserr_jukebox; - goto out_status; + goto construction_err; wait_for_construction: wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); /* Did construction of this file fail? */ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); if (!open_retry) { status = nfserr_jukebox; - goto out; + goto construction_err; } open_retry = false; - if (refcount_dec_and_test(&nf->nf_ref)) - nfsd_file_free(nf); goto retry; } - this_cpu_inc(nfsd_file_cache_hits); status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags)); + if (status != nfs_ok) { + nfsd_file_put(nf); + nf = NULL; + } + out: if (status == nfs_ok) { this_cpu_inc(nfsd_file_acquisitions); nfsd_file_check_write_error(nf); *pnf = nf; - } else { - if (refcount_dec_and_test(&nf->nf_ref)) - nfsd_file_free(nf); - nf = NULL; } - -out_status: - put_cred(key.cred); - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); + put_cred(cred); + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); return status; open_file: trace_nfsd_file_alloc(nf); - nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); + nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode); if (nf->nf_mark) { if (file) { get_file(file); @@ -1170,13 +1098,16 @@ open_file: * If construction failed, or we raced with a call to unlink() * then unhash. */ - if (status == nfs_ok && key.inode->i_nlink == 0) - status = nfserr_jukebox; - if (status != nfs_ok) + if (status != nfs_ok || inode->i_nlink == 0) nfsd_file_unhash(nf); - clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); - smp_mb__after_atomic(); - wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (status == nfs_ok) + goto out; + +construction_err: + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; goto out; } @@ -1192,8 +1123,11 @@ open_file: * seconds after the final nfsd_file_put() in case the caller * wants to re-use it. * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1213,8 +1147,11 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, * but not garbage-collected. The object is unhashed after the * final nfsd_file_put(). * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1235,8 +1172,11 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * and @file is non-NULL, use it to instantiate a new nfsd_file instead of * opening a new one. * - * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in - * network byte order is returned. + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. */ __be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -1267,7 +1207,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) lru = list_lru_count(&nfsd_file_lru); rcu_read_lock(); - ht = &nfsd_file_rhash_tbl; + ht = &nfsd_file_rhltable.ht; count = atomic_read(&ht->nelems); tbl = rht_dereference_rcu(ht->tbl, ht); buckets = tbl->size; @@ -1283,7 +1223,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v) evictions += per_cpu(nfsd_file_evictions, i); } - seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "total inodes: %u\n", count); seq_printf(m, "hash buckets: %u\n", buckets); seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 41516a4263ea..e54165a3224f 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -29,9 +29,8 @@ struct nfsd_file_mark { * never be dereferenced, only used for comparison. */ struct nfsd_file { - struct rhash_head nf_rhash; - struct list_head nf_lru; - struct rcu_head nf_rcu; + struct rhlist_head nf_rlist; + void *nf_inode; struct file *nf_file; const struct cred *nf_cred; struct net *nf_net; @@ -40,10 +39,12 @@ struct nfsd_file { #define NFSD_FILE_REFERENCED (2) #define NFSD_FILE_GC (3) unsigned long nf_flags; - struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; unsigned char nf_may; + struct nfsd_file_mark *nf_mark; + struct list_head nf_lru; + struct rcu_head nf_rcu; ktime_t nf_birthtime; }; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5e9809aff37e..7a806ac13e31 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -240,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; error = -ENOMEM; @@ -408,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; /* ID */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5783209f17fc..bb9d47172162 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -930,6 +930,9 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines * directly. They will be released after the sending has completed. + * + * Return values: Number of bytes consumed, or -EIO if there are no + * remaining pages in rqstp->rq_pages. */ static int nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, @@ -948,7 +951,8 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, */ if (page == *(rqstp->rq_next_page - 1)) continue; - svc_rqst_replace_page(rqstp, page); + if (unlikely(!svc_rqst_replace_page(rqstp, page))) + return -EIO; } if (rqstp->rq_res.page_len == 0) // first call rqstp->rq_res.page_base = offset % PAGE_SIZE; @@ -2164,7 +2168,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + buf = kvmalloc(len, GFP_KERNEL); if (buf == NULL) { err = nfserr_jukebox; goto out; @@ -2227,10 +2231,7 @@ nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp, goto out; } - /* - * We're holding i_rwsem - use GFP_NOFS. - */ - buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + buf = kvmalloc(len, GFP_KERNEL); if (buf == NULL) { err = nfserr_jukebox; goto out; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 41ccd43cd979..5cf30827f244 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -259,10 +259,10 @@ repeat: NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); - if (unlikely(!dfolio)) { + if (unlikely(IS_ERR(dfolio))) { /* No empty page is added to the page cache */ - err = -ENOMEM; folio_unlock(folio); + err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) @@ -311,7 +311,7 @@ repeat: folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); - if (dfolio) { + if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_page(&dfolio->page, &folio->page, 0); diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index a030d00af90c..174fe536a1c0 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -31,16 +31,6 @@ static struct ctl_table ntfs_sysctls[] = { {} }; -/* Define the parent directory /proc/sys/fs. */ -static struct ctl_table sysctls_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = ntfs_sysctls - }, - {} -}; - /* Storage for the sysctls header. */ static struct ctl_table_header *sysctls_root_table; @@ -54,7 +44,7 @@ int ntfs_sysctl(int add) { if (add) { BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl_table(sysctls_root); + sysctls_root_table = register_sysctl("fs", ntfs_sysctls); if (!sysctls_root_table) return -ENOMEM; } else { diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 5e6bafb10f42..0b8bc66377db 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -405,8 +405,8 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; - bool is_mft = - ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; + bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && + !name_len; u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; @@ -531,11 +531,10 @@ add_alloc_in_same_attr_seg: pre_alloc = 0; if (type == ATTR_DATA && !name_len && sbi->options->prealloc) { - pre_alloc = - bytes_to_cluster( - sbi, - get_pre_allocated(new_size)) - - new_alen; + pre_alloc = bytes_to_cluster( + sbi, get_pre_allocated( + new_size)) - + new_alen; } /* Get the last LCN to allocate from. */ @@ -573,8 +572,8 @@ add_alloc_in_same_attr_seg: err = attr_allocate_clusters( sbi, run, vcn, lcn, to_allocate, &pre_alloc, is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, - is_mft ? 0 - : (sbi->record_size - + is_mft ? 0 : + (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 723fb64e6531..9a6c6a09d70c 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -40,9 +40,9 @@ static struct kmem_cache *ntfs_enode_cachep; int __init ntfs3_init_bitmap(void) { - ntfs_enode_cachep = - kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, - SLAB_RECLAIM_ACCOUNT, NULL); + ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache", + sizeof(struct e_node), 0, + SLAB_RECLAIM_ACCOUNT, NULL); return ntfs_enode_cachep ? 0 : -ENOMEM; } @@ -286,9 +286,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, if (wnd->uptodated != 1) { /* Check bits before 'bit'. */ ib = wnd->zone_bit == wnd->zone_end || - bit < wnd->zone_end - ? 0 - : wnd->zone_end; + bit < wnd->zone_end ? + 0 : + wnd->zone_end; while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { bit -= 1; @@ -297,9 +297,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, /* Check bits after 'end_in'. */ ib = wnd->zone_bit == wnd->zone_end || - end_in > wnd->zone_bit - ? wnd->nbits - : wnd->zone_bit; + end_in > wnd->zone_bit ? + wnd->nbits : + wnd->zone_bit; while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { end_in += 1; @@ -417,8 +417,8 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) return; n3 = rb_first(&wnd->count_tree); wnd->extent_max = - n3 ? rb_entry(n3, struct e_node, count.node)->count.key - : 0; + n3 ? rb_entry(n3, struct e_node, count.node)->count.key : + 0; return; } @@ -658,7 +658,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) if (!wnd->bits_last) wnd->bits_last = wbits; - wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + wnd->free_bits = + kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index e9bdc1ff08c9..9a3d55c367d9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -22,20 +22,21 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) { struct fstrim_range __user *user_range; struct fstrim_range range; + struct block_device *dev; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) + dev = sbi->sb->s_bdev; + if (!bdev_max_discard_sectors(dev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(u32, range.minlen, - bdev_discard_granularity(sbi->sb->s_bdev)); + range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev)); err = ntfs_trim_fs(sbi, &range); if (err < 0) @@ -190,8 +191,8 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; - to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) - : PAGE_SIZE; + to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) : + PAGE_SIZE; iblock = page_off >> inode->i_blkbits; page = find_or_create_page(mapping, idx, @@ -223,16 +224,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(REQ_OP_READ, bh); - - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + err = bh_read(bh, 0); + if (err < 0) { unlock_page(page); put_page(page); - err = -EIO; goto out; } } @@ -570,13 +565,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_unlock(ni); } else { /* Check new size. */ + u8 cluster_bits = sbi->cluster_bits; /* generic/213: expected -ENOSPC instead of -EFBIG. */ if (!is_supported_holes) { loff_t to_alloc = new_size - inode_get_bytes(inode); if (to_alloc > 0 && - (to_alloc >> sbi->cluster_bits) > + (to_alloc >> cluster_bits) > wnd_zeroes(&sbi->used.bitmap)) { err = -ENOSPC; goto out; @@ -597,7 +593,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) } if (is_supported_holes) { - CLST vcn = vbo >> sbi->cluster_bits; + CLST vcn = vbo >> cluster_bits; CLST cend = bytes_to_cluster(sbi, end); CLST cend_v = bytes_to_cluster(sbi, ni->i_valid); CLST lcn, clen; @@ -660,22 +656,12 @@ out: int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { - struct super_block *sb = dentry->d_sb; - struct ntfs_sb_info *sbi = sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); u32 ia_valid = attr->ia_valid; umode_t mode = inode->i_mode; int err; - if (sbi->options->noacsrules) { - /* "No access rules" - Force any changes of time etc. */ - attr->ia_valid |= ATTR_FORCE; - /* and disable for editing some attributes. */ - attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE); - ia_valid = attr->ia_valid; - } - err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -719,7 +705,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) - ntfs_save_wsl_perm(inode); + ntfs_save_wsl_perm(inode, NULL); mark_inode_dirty(inode); out: return err; @@ -1065,8 +1051,8 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out; - ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) - : __generic_file_write_iter(iocb, from); + ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) : + __generic_file_write_iter(iocb, from); out: inode_unlock(inode); @@ -1118,8 +1104,9 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && - atomic_read(&inode->i_writecount) == 1)) { + if (sbi->options->prealloc && + ((file->f_mode & FMODE_WRITE) && + atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); @@ -1159,8 +1146,7 @@ const struct inode_operations ntfs_file_inode_operations = { .getattr = ntfs_getattr, .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, - .permission = ntfs_permission, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index f1df52dfab74..2bfcf1a989c9 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -76,8 +76,8 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) const struct ATTRIB *attr; attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); - return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) - : NULL; + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : + NULL; } /* @@ -91,8 +91,8 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); - return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) - : NULL; + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : + NULL; } /* @@ -102,7 +102,7 @@ void ni_clear(struct ntfs_inode *ni) { struct rb_node *node; - if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec)) + if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec)) ni_delete_all(ni); al_destroy(ni); @@ -1439,8 +1439,8 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, int err; CLST plen; struct ATTRIB *attr; - bool is_ext = - (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn; + bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && + !svcn; u32 name_size = ALIGN(name_len * sizeof(short), 8); u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT; u32 run_off = name_off + name_size; @@ -1645,7 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; - struct le_str *fns; + struct le_str *fns; if (le) *le = NULL; @@ -1756,9 +1756,9 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) } /* Resize nonresident empty attribute in-place only. */ - new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) - ? (SIZEOF_NONRESIDENT_EX + 8) - : (SIZEOF_NONRESIDENT + 8); + new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? + (SIZEOF_NONRESIDENT_EX + 8) : + (SIZEOF_NONRESIDENT + 8); if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) return -EOPNOTSUPP; @@ -2965,14 +2965,14 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, { struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr; - u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0; + u16 de_key_size; switch (undo_step) { case 4: + de_key_size = le16_to_cpu(de2->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, - &attr, NULL, NULL)) { + &attr, NULL, NULL)) return false; - } memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size); mi_get_ref(&ni->mi, &de2->ref); @@ -2981,19 +2981,16 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, de2->flags = 0; de2->res = 0; - if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, - 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1)) return false; - } fallthrough; case 2: de_key_size = le16_to_cpu(de->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, - &attr, NULL, NULL)) { + &attr, NULL, NULL)) return false; - } memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); @@ -3162,9 +3159,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, u64 data_size = le64_to_cpu(attr->nres.data_size); __le64 valid_le; - dup->alloc_size = is_attr_ext(attr) - ? attr->nres.total_size - : attr->nres.alloc_size; + dup->alloc_size = is_attr_ext(attr) ? + attr->nres.total_size : + attr->nres.alloc_size; dup->data_size = attr->nres.data_size; if (new_valid > data_size) @@ -3258,6 +3255,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) return 0; } + if (!ni->mi.mrec) + goto out; + if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; @@ -3360,7 +3360,7 @@ out: ni_unlock(ni); if (err) { - ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err); + ntfs_inode_err(inode, "%s failed, %d.", hint, err); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); return err; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index c6eb371a3695..57762c5fe68b 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -827,10 +827,10 @@ static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, memcpy(rt + 1, tbl + 1, esize * used); - rt->free_goal = free_goal == ~0u - ? cpu_to_le32(~0u) - : cpu_to_le32(sizeof(struct RESTART_TABLE) + - free_goal * esize); + rt->free_goal = free_goal == ~0u ? + cpu_to_le32(~0u) : + cpu_to_le32(sizeof(struct RESTART_TABLE) + + free_goal * esize); if (tbl->first_free) { rt->first_free = tbl->first_free; @@ -1089,9 +1089,9 @@ static inline u64 base_lsn(struct ntfs_log *log, (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) << log->file_data_bits) + ((((is_log_record_end(hdr) && - h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) - ? le16_to_cpu(hdr->record_hdr.next_record_off) - : log->page_size) + + h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ? + le16_to_cpu(hdr->record_hdr.next_record_off) : + log->page_size) + lsn) >> 3); @@ -1298,9 +1298,9 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 - ? 0x22 * page_size - : ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? + 0x22 * page_size : + ((sys_page_size << 1) + (page_size << 1)); log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1512,20 +1512,19 @@ static u32 current_log_avail(struct ntfs_log *log) * have to compute the free range. * If there is no oldest lsn then start at the first page of the file. */ - oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) - ? log->first_page - : (log->oldest_lsn_off & ~log->sys_page_mask); + oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ? + log->first_page : + (log->oldest_lsn_off & ~log->sys_page_mask); /* * We will use the next log page offset to compute the next free page. * If we are going to reuse this page go to the next page. * If we are at the first page then use the end of the file. */ - next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) - ? log->next_page + log->page_size - : log->next_page == log->first_page - ? log->l_size - : log->next_page; + next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ? + log->next_page + log->page_size : + log->next_page == log->first_page ? log->l_size : + log->next_page; /* If the two offsets are the same then there is no available space. */ if (oldest_off == next_free_off) @@ -1535,9 +1534,9 @@ static u32 current_log_avail(struct ntfs_log *log) * this range from the total available pages. */ free_bytes = - oldest_off < next_free_off - ? log->total_avail_pages - (next_free_off - oldest_off) - : oldest_off - next_free_off; + oldest_off < next_free_off ? + log->total_avail_pages - (next_free_off - oldest_off) : + oldest_off - next_free_off; free_bytes >>= log->page_bits; return free_bytes * log->reserved; @@ -1671,8 +1670,8 @@ next_tail: } best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; - best_lsn2 = - second_tail ? base_lsn(log, second_tail, second_file_off) : 0; + best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) : + 0; if (first_tail && second_tail) { if (best_lsn1 > best_lsn2) { @@ -1767,8 +1766,8 @@ tail_read: page_cnt = page_pos = 1; - curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) - : log->next_page; + curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) : + log->next_page; wrapped_file = curpage_off == log->first_page && @@ -1826,9 +1825,9 @@ use_cur_page: le64_to_cpu(cur_page->record_hdr.last_end_lsn) && ((lsn_cur >> log->file_data_bits) + ((curpage_off < - (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) - ? 1 - : 0)) != expected_seq) { + (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ? + 1 : + 0)) != expected_seq) { goto check_tail; } @@ -2575,7 +2574,7 @@ static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) return find_log_rec(log, *lsn, lcb); } -static inline bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) { __le16 mask; u32 min_de, de_off, used, total; @@ -2642,9 +2641,10 @@ static inline bool check_index_root(const struct ATTRIB *attr, { bool ret; const struct INDEX_ROOT *root = resident_data(attr); - u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size - ? sbi->cluster_bits - : SECTOR_SHIFT; + u8 index_bits = le32_to_cpu(root->index_block_size) >= + sbi->cluster_size ? + sbi->cluster_bits : + SECTOR_SHIFT; u8 block_clst = root->index_block_clst; if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || @@ -3683,7 +3683,8 @@ move_data: if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, + 0); if (err) goto out; } @@ -3768,11 +3769,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (!log) return -ENOMEM; - memset(&rst_info, 0, sizeof(struct restart_info)); - log->ni = ni; log->l_size = l_size; log->one_page_buf = kmalloc(page_size, GFP_NOFS); + if (!log->one_page_buf) { err = -ENOMEM; goto out; @@ -3783,6 +3783,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) log->page_bits = blksize_bits(page_size); /* Look for a restart area on the disk. */ + memset(&rst_info, 0, sizeof(struct restart_info)); err = log_read_rst(log, l_size, true, &rst_info); if (err) goto out; @@ -3859,10 +3860,10 @@ check_restart_area: log->init_ra = !!rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page - ? Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) - : NULL; + ra2 = rst_info.valid_page ? + Add2Ptr(rst_info.r_page, + le16_to_cpu(rst_info.r_page->ra_off)) : + NULL; if (rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { @@ -4256,6 +4257,10 @@ check_attribute_names: rec_len -= t32; attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS); + if (!attr_names) { + err = -ENOMEM; + goto out; + } lcb_put(lcb); lcb = NULL; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 567563771bf8..28cc421102e5 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -172,8 +172,8 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, u16 sample, fo, fn; fo = le16_to_cpu(rhdr->fix_off); - fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) - : le16_to_cpu(rhdr->fix_num); + fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) : + le16_to_cpu(rhdr->fix_num); /* Check errors. */ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || @@ -223,7 +223,7 @@ int ntfs_extend_init(struct ntfs_sb_info *sbi) inode = ntfs_iget5(sb, &ref, &NAME_EXTEND); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load $Extend."); + ntfs_err(sb, "Failed to load $Extend (%d).", err); inode = NULL; goto out; } @@ -282,7 +282,7 @@ int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) /* Check for 4GB. */ if (ni->vfs_inode.i_size >= 0x100000000ull) { - ntfs_err(sb, "\x24LogFile is too big"); + ntfs_err(sb, "\x24LogFile is large than 4G."); err = -EINVAL; goto out; } @@ -646,13 +646,13 @@ next: NULL, 0, NULL, NULL)) goto next; - __clear_bit_le(ir - MFT_REC_RESERVED, + __clear_bit(ir - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } } /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */ - zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap, + zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, MFT_REC_FREE, MFT_REC_RESERVED); if (zbit >= MFT_REC_FREE) { sbi->mft.next_reserved = MFT_REC_FREE; @@ -720,7 +720,7 @@ found: if (*rno >= MFT_REC_FREE) wnd_set_used(wnd, *rno, 1); else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) - __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); out: if (!mft) @@ -748,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) else wnd_set_free(wnd, rno, 1); } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { - __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } if (rno < wnd_zone_bit(wnd)) @@ -846,18 +846,16 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) { int err; struct super_block *sb = sbi->sb; - u32 blocksize; + u32 blocksize, bytes; sector_t block1, block2; - u32 bytes; - if (!sb) + /* + * sb can be NULL here. In this case sbi->flags should be 0 too. + */ + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) return; blocksize = sb->s_blocksize; - - if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) - return; - bytes = sbi->mft.recs_mirr << sbi->record_bits; block1 = sbi->mft.lbo >> sb->s_blocksize_bits; block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; @@ -925,6 +923,7 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) struct VOLUME_INFO *info; struct mft_inode *mi; struct ntfs_inode *ni; + __le16 info_flags; /* * Do not change state if fs was real_dirty. @@ -957,6 +956,8 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) goto out; } + info_flags = info->flags; + switch (dirty) { case NTFS_DIRTY_ERROR: ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors"); @@ -970,8 +971,10 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) break; } /* Cache current volume flags. */ - sbi->volume.flags = info->flags; - mi->dirty = true; + if (info_flags != info->flags) { + sbi->volume.flags = info->flags; + mi->dirty = true; + } err = 0; out: @@ -1683,6 +1686,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) out: if (err) { + make_bad_inode(inode); iput(inode); ni = ERR_PTR(err); } @@ -1859,7 +1863,7 @@ int ntfs_security_init(struct ntfs_sb_info *sbi) inode = ntfs_iget5(sb, &ref, &NAME_SECURE); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load $Secure."); + ntfs_err(sb, "Failed to load $Secure (%d).", err); inode = NULL; goto out; } @@ -1870,41 +1874,43 @@ int ntfs_security_init(struct ntfs_sb_info *sbi) attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME, ARRAY_SIZE(SDH_NAME), NULL, NULL); - if (!attr) { - err = -EINVAL; - goto out; - } - - root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); - if (root_sdh->type != ATTR_ZERO || + if (!attr || + !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || + root_sdh->type != ATTR_ZERO || root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH || - offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) { + offsetof(struct INDEX_ROOT, ihdr) + + le32_to_cpu(root_sdh->ihdr.used) > + le32_to_cpu(attr->res.data_size)) { + ntfs_err(sb, "$Secure::$SDH is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err); goto out; + } attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME, ARRAY_SIZE(SII_NAME), NULL, NULL); - if (!attr) { - err = -EINVAL; - goto out; - } - - root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); - if (root_sii->type != ATTR_ZERO || + if (!attr || + !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || + root_sii->type != ATTR_ZERO || root_sii->rule != NTFS_COLLATION_TYPE_UINT || - offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) { + offsetof(struct INDEX_ROOT, ihdr) + + le32_to_cpu(root_sii->ihdr.used) > + le32_to_cpu(attr->res.data_size)) { + ntfs_err(sb, "$Secure::$SII is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err); goto out; + } fnd_sii = fnd_get(); if (!fnd_sii) { @@ -2594,8 +2600,10 @@ static inline bool is_reserved_name(struct ntfs_sb_info *sbi, if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) { port_digit = le16_to_cpu(name[3]); if (port_digit >= '1' && port_digit <= '9') - if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) || - !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false)) + if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, + false) || + !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, + false)) return true; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 51ab75954640..0a48d2d67219 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -431,8 +431,9 @@ next_run: if (vbo + blocksize > data_size) nbits = 8 * (data_size - vbo); - ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) - : false; + ok = nbits > from ? + (*fn)((ulong *)bh->b_data, from, nbits, ret) : + false; put_bh(bh); if (ok) { @@ -725,9 +726,13 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u32 total = le32_to_cpu(hdr->total); u16 offs[128]; fill_table: + if (end > total) + return NULL; + if (off + sizeof(struct NTFS_DE) > end) return NULL; @@ -760,8 +765,7 @@ binary_search: return NULL; max_idx = 0; - table_size = min(table_size * 2, - (int)ARRAY_SIZE(offs)); + table_size = min(table_size * 2, (int)ARRAY_SIZE(offs)); goto fill_table; } } else if (diff2 < 0) { @@ -844,6 +848,10 @@ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, u32 off = PtrOffset(hdr, re); int bytes = used - (off + esize); + /* check INDEX_HDR valid before using INDEX_HDR */ + if (!check_index_header(hdr, le32_to_cpu(hdr->total))) + return NULL; + if (off >= used || esize < sizeof(struct NTFS_DE) || bytes < sizeof(struct NTFS_DE)) return NULL; @@ -986,6 +994,7 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *a; const struct INDEX_NAMES *in = &s_index_names[indx->type]; + struct INDEX_ROOT *root; a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, mi); @@ -995,7 +1004,16 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, if (attr) *attr = a; - return resident_data_ex(a, sizeof(struct INDEX_ROOT)); + root = resident_data_ex(a, sizeof(struct INDEX_ROOT)); + + /* length check */ + if (root && + offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) > + le32_to_cpu(a->res.data_size)) { + return NULL; + } + + return root; } static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, @@ -1085,7 +1103,8 @@ ok: } /* check for index header length */ - if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) { + if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) > + bytes) { err = -EINVAL; goto out; } @@ -1151,8 +1170,10 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &node); - if (err) + if (err) { + /* io error? */ return err; + } /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, @@ -1654,9 +1675,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, mi->dirty = true; /* Create alloc and bitmap attributes (if not). */ - err = run_is_empty(&indx->alloc_run) - ? indx_create_allocate(indx, ni, &new_vbn) - : indx_add_allocate(indx, ni, &new_vbn); + err = run_is_empty(&indx->alloc_run) ? + indx_create_allocate(indx, ni, &new_vbn) : + indx_add_allocate(indx, ni, &new_vbn); /* Layout of record may be changed, so rescan root. */ root = indx_get_root(indx, ni, &attr, &mi); @@ -1759,10 +1780,11 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, struct indx_node *n1 = fnd->nodes[level]; struct INDEX_HDR *hdr1 = &n1->index->ihdr; struct INDEX_HDR *hdr2; - u32 to_copy, used; + u32 to_copy, used, used1; CLST new_vbn; __le64 t_vbn, *sub_vbn; u16 sp_size; + void *hdr1_saved = NULL; /* Try the most easy case. */ e = fnd->level - 1 == level ? fnd->de[level] : NULL; @@ -1795,6 +1817,13 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, return -ENOMEM; memcpy(up_e, sp, sp_size); + used1 = le32_to_cpu(hdr1->used); + hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS); + if (!hdr1_saved) { + err = -ENOMEM; + goto out; + } + if (!hdr1->flags) { up_e->flags |= NTFS_IE_HAS_SUBNODES; up_e->size = cpu_to_le16(sp_size + sizeof(u64)); @@ -1827,7 +1856,7 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, hdr_insert_head(hdr2, de_t, to_copy); /* Remove all entries (sp including) from hdr1. */ - used = le32_to_cpu(hdr1->used) - to_copy - sp_size; + used = used1 - to_copy - sp_size; memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); hdr1->used = cpu_to_le32(used); @@ -1838,9 +1867,9 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, hdr_insert_de(indx, (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), up_e + 1, le16_to_cpu(up_e->key_size), - ctx) < 0 - ? hdr2 - : hdr1, + ctx) < 0 ? + hdr2 : + hdr1, new_de, NULL, ctx); indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); @@ -1857,8 +1886,6 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, if (!level) { /* Insert in root. */ err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); - if (err) - goto out; } else { /* * The target buffer's parent is another index buffer. @@ -1866,12 +1893,20 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, */ err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, level - 1, fnd); - if (err) - goto out; + } + + if (err) { + /* + * Undo critical operations. + */ + indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits); + memcpy(hdr1, hdr1_saved, used1); + indx_write(indx, ni, n1, 0); } out: kfree(up_e); + kfree(hdr1_saved); return err; } @@ -1930,16 +1965,12 @@ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, */ err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, fnd, undo); - if (err) - goto out; } else { /* * Found a leaf buffer, so we'll insert the new entry into it. */ err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, fnd->level - 1, fnd); - if (err) - goto out; } out: @@ -2308,8 +2339,8 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = level ? indx_insert_into_buffer(indx, ni, root, re, ctx, fnd->level - 1, - fnd) - : indx_insert_into_root(indx, ni, re, e, + fnd) : + indx_insert_into_root(indx, ni, re, e, ctx, fnd, 0); kfree(re); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 309d9b46b5d5..6c560245eef4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -100,6 +100,12 @@ static struct inode *ntfs_read_mft(struct inode *inode, /* Record should contain $I30 root. */ is_dir = rec->flags & RECORD_FLAG_DIR; + /* MFT_REC_MFT is not a dir */ + if (is_dir && ino == MFT_REC_MFT) { + err = -EINVAL; + goto out; + } + inode->i_generation = le16_to_cpu(rec->seq); /* Enumerate all struct Attributes MFT. */ @@ -131,7 +137,13 @@ next_attr: rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); asize = le32_to_cpu(attr->size); - if (le16_to_cpu(attr->name_off) + attr->name_len > asize) + /* + * Really this check was done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr'. + * There not critical to check this case again + */ + if (attr->name_len && + sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) > + asize) goto out; if (attr->non_res) { @@ -250,8 +262,8 @@ next_attr: if (!attr->nres.alloc_size) goto next_attr; - run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run - : &ni->file.run; + run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run : + &ni->file.run; break; case ATTR_ROOT: @@ -259,7 +271,6 @@ next_attr: goto out; root = Add2Ptr(attr, roff); - is_root = true; if (attr->name_len != ARRAY_SIZE(I30_NAME) || memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) @@ -272,15 +283,16 @@ next_attr: if (!is_dir) goto next_attr; + is_root = true; ni->ni_flags |= NI_FLAG_DIR; err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); if (err) goto out; - mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) - : (S_IFDIR | 0777); + mode = sb->s_root ? + (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : + (S_IFDIR | 0777); goto next_attr; case ATTR_ALLOC: @@ -437,8 +449,8 @@ end_enum: ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; - inode->i_mapping->a_ops = - is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : + &ntfs_aops; if (ino != MFT_REC_MFT) init_rwsem(&ni->file.run_lock); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || @@ -636,6 +648,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); + err = bh_read(bh, 0); if (err < 0) goto out; @@ -773,8 +786,8 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } ret = blockdev_direct_IO(iocb, inode, iter, - wr ? ntfs_get_block_direct_IO_W - : ntfs_get_block_direct_IO_R); + wr ? ntfs_get_block_direct_IO_W : + ntfs_get_block_direct_IO_R); if (ret > 0) end = vbo + ret; @@ -833,7 +846,7 @@ out: } static int ntfs_resident_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, void *data) { struct address_space *mapping = data; struct ntfs_inode *ni = ntfs_i(mapping->host); @@ -874,8 +887,8 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, *pagep = NULL; if (is_resident(ni)) { - struct page *page = grab_cache_page_write_begin( - mapping, pos >> PAGE_SHIFT); + struct page *page = + grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT); if (!page) { err = -ENOMEM; @@ -907,9 +920,8 @@ out: /* * ntfs_write_end - Address_space_operations::write_end. */ -int ntfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, u32 copied, struct page *page, - void *fsdata) +int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, + u32 len, u32 copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); @@ -1307,8 +1319,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = - current_time(inode); + ni->i_crtime = current_time(inode); rec = ni->mi.mrec; rec->hard_links = cpu_to_le16(1); @@ -1349,10 +1360,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, attr->res.data_size = cpu_to_le32(dsize); std5->cr_time = std5->m_time = std5->c_time = std5->a_time = - kernel2nt(&inode->i_atime); + kernel2nt(&ni->i_crtime); - ni->std_fa = fa; - std5->fa = fa; + std5->fa = ni->std_fa = fa; attr = Add2Ptr(attr, asize); @@ -1551,11 +1561,15 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); + /* Write non resident data. */ + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, + nsize, 0); + if (err) + goto out5; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - nsize = 0; } /* Size of symlink equals the length of input string. */ inode->i_size = size; @@ -1576,19 +1590,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8); rec->next_attr_id = cpu_to_le16(aid); - /* Step 2: Add new name in index. */ - err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); - if (err) - goto out6; - - /* Unlock parent directory before ntfs_init_acl. */ - if (!fnd) - ni_unlock(dir_ni); - inode->i_generation = le16_to_cpu(rec->seq); - dir->i_mtime = dir->i_ctime = inode->i_atime; - if (S_ISDIR(mode)) { inode->i_op = &ntfs_dir_inode_operations; inode->i_fop = &ntfs_dir_operations; @@ -1601,8 +1604,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; - inode->i_mapping->a_ops = - is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : + &ntfs_aops; init_rwsem(&ni->file.run_lock); } else { inode->i_op = &ntfs_special_inode_operations; @@ -1613,41 +1616,58 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(idmap, inode, dir); if (err) - goto out7; + goto out5; } else #endif { inode->i_flags |= S_NOSEC; } - /* Write non resident data. */ - if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); - if (err) - goto out7; + /* + * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute. + * The packed size of extended attribute is stored in direntry too. + * 'fname' here points to inside new_de. + */ + ntfs_save_wsl_perm(inode, &fname->dup.ea_size); + + /* + * update ea_size in file_name attribute too. + * Use ni_find_attr cause layout of MFT record may be changed + * in ntfs_init_acl and ntfs_save_wsl_perm. + */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL); + if (attr) { + struct ATTR_FILE_NAME *fn; + + fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (fn) + fn->dup.ea_size = fname->dup.ea_size; } + /* We do not need to update parent directory later */ + ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; + + /* Step 2: Add new name in index. */ + err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); + if (err) + goto out6; + /* * Call 'd_instantiate' after inode->i_op is set * but before finish_open. */ d_instantiate(dentry, inode); - ntfs_save_wsl_perm(inode); + /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ + inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime = + dir->i_ctime = ni->i_crtime; + mark_inode_dirty(dir); mark_inode_dirty(inode); /* Normal exit. */ goto out2; -out7: - - /* Undo 'indx_insert_entry'. */ - if (!fnd) - ni_lock_dir(dir_ni); - indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, - le16_to_cpu(new_de->key_size), sbi); - /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1669,11 +1689,11 @@ out2: kfree(rp); out1: - if (err) { - if (!fnd) - ni_unlock(dir_ni); + if (!fnd) + ni_unlock(dir_ni); + + if (err) return ERR_PTR(err); - } unlock_new_inode(inode); @@ -1770,9 +1790,6 @@ void ntfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_nlink) - _ni_write_inode(inode, inode_needs_sync(inode)); - invalidate_inode_buffers(inode); clear_inode(inode); @@ -2057,7 +2074,6 @@ const struct inode_operations ntfs_link_inode_operations = { .get_link = ntfs_get_link, .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, - .permission = ntfs_permission, }; const struct address_space_operations ntfs_aops = { diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index 28f654561f27..61e161c7c567 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -296,8 +296,8 @@ next: */ struct lznt *get_lznt_ctx(int level) { - struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) - : sizeof(struct lznt), + struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) : + sizeof(struct lznt), GFP_NOFS); if (r) @@ -392,9 +392,9 @@ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc, unc_use = err; } else { /* This chunk does not contain compressed data. */ - unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end - ? unc_end - unc_chunk - : LZNT_CHUNK_SIZE; + unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ? + unc_end - unc_chunk : + LZNT_CHUNK_SIZE; if (cmpr_chunk + sizeof(chunk_hdr) + unc_use > cmpr_end) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 407fe92394e2..9736b1e4a0f6 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -88,6 +88,16 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, __putname(uni); } + /* + * Check for a null pointer + * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL. + * This causes null pointer dereference in d_splice_alias(). + */ + if (!IS_ERR_OR_NULL(inode) && !inode->i_op) { + iput(inode); + inode = ERR_PTR(-EINVAL); + } + return d_splice_alias(inode, dentry); } @@ -423,8 +433,8 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, NULL, 0, fnd); - err = IS_ERR(inode) ? PTR_ERR(inode) - : finish_open(file, dentry, ntfs_file_open); + err = IS_ERR(inode) ? PTR_ERR(inode) : + finish_open(file, dentry, ntfs_file_open); dput(d); out2: @@ -597,8 +607,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .rmdir = ntfs_rmdir, .mknod = ntfs_mknod, .rename = ntfs_rename, - .permission = ntfs_permission, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -611,7 +620,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86ea1826d099..90151e56c122 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -435,9 +435,6 @@ static inline u64 attr_svcn(const struct ATTRIB *attr) return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; } -/* The size of resident attribute by its resident size. */ -#define BYTES_PER_RESIDENT(b) (0x18 + (b)) - static_assert(sizeof(struct ATTRIB) == 0x48); static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 80072e5f96f7..eb01f7e76479 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -100,7 +100,6 @@ struct ntfs_mount_options { unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */ unsigned force : 1; /* RW mount dirty volume. */ - unsigned noacsrules : 1; /* Exclude acs rules. */ unsigned prealloc : 1; /* Preallocate space when file is growing. */ unsigned nocase : 1; /* case insensitive. */ }; @@ -164,7 +163,6 @@ struct wnd_bitmap { size_t zone_bit; size_t zone_end; - bool set_tail; // Not necessary in driver. bool inited; }; @@ -340,7 +338,7 @@ enum ntfs_inode_mutex_lock_class { }; /* - * sturct ntfs_inode + * struct ntfs_inode * * Ntfs inode - extends linux inode. consists of one or more MFT inodes. */ @@ -581,6 +579,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool ni_is_dirty(struct inode *inode); /* Globals from fslog.c */ +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ @@ -700,9 +699,8 @@ int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct page **pagep, void **fsdata); -int ntfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, u32 copied, struct page *page, - void *fsdata); +int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, + u32 len, u32 copied, struct page *page, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, @@ -858,23 +856,22 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL -struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type); int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, - struct inode *dir); + struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); -int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, - int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *ntfs_xattr_handlers[]; -int ntfs_save_wsl_perm(struct inode *inode); +int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); /* globals from lznt.c */ diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index defce6a5c8e1..2a281cead2bc 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -221,7 +221,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) } if (off + asize < off) { - /* overflow check */ + /* Overflow check. */ return NULL; } @@ -247,8 +247,8 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if ((t32 & 0xf) || (t32 > 0x100)) return NULL; - /* Check boundary. */ - if (off + asize > used) + /* Check overflow and boundary. */ + if (off + asize < off || off + asize > used) return NULL; /* Check size of attribute. */ @@ -419,10 +419,9 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, struct ntfs_sb_info *sbi = mi->sbi; u32 used = le32_to_cpu(rec->used); const u16 *upcase = sbi->upcase; - int diff; /* Can we insert mi attribute? */ - if (used + asize > mi->sbi->record_size) + if (used + asize > sbi->record_size) return NULL; /* @@ -431,7 +430,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, */ attr = NULL; while ((attr = mi_enum_attr(mi, attr))) { - diff = compare_attr(attr, type, name, name_len, upcase); + int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) continue; @@ -442,9 +441,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, } if (!attr) { - tail = 8; /* Not used, just to suppress warning. */ + /* Append. */ + tail = 8; attr = Add2Ptr(rec, used - 8); } else { + /* Insert before 'attr'. */ tail = used - PtrOffset(rec, attr); } diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index a5af71cd8d14..47612d16c027 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -433,9 +433,9 @@ requires_new_range: should_add_tail = Tovcn < r->len; if (should_add_tail) { - tail_lcn = r->lcn == SPARSE_LCN - ? SPARSE_LCN - : (r->lcn + Tovcn); + tail_lcn = r->lcn == SPARSE_LCN ? + SPARSE_LCN : + (r->lcn + Tovcn); tail_vcn = r->vcn + Tovcn; tail_len = r->len - Tovcn; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index ef4ea3f21905..5158dd31fd97 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -39,10 +39,10 @@ * To mount large volumes as ntfs one should use large cluster size (up to 2M) * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P * - * ntfs limits, cluster size is 2M (2^31) + * ntfs limits, cluster size is 2M (2^21) * ----------------------------------------------------------------------------- - * | < 8P, 2^54 | < 2^32 | yes | yes | yes | yes | yes | - * | > 8P, 2^54 | > 2^32 | no | no | yes | yes | yes | + * | < 8P, 2^53 | < 2^32 | yes | yes | yes | yes | yes | + * | > 8P, 2^53 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * */ @@ -115,9 +115,9 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) return; /* Use static allocated buffer, if possible. */ - name = atomic_dec_and_test(&s_name_buf_cnt) - ? s_name_buf - : kmalloc(sizeof(s_name_buf), GFP_NOFS); + name = atomic_dec_and_test(&s_name_buf_cnt) ? + s_name_buf : + kmalloc(sizeof(s_name_buf), GFP_NOFS); if (name) { struct dentry *de = d_find_alias(inode); @@ -253,7 +253,6 @@ enum Opt { Opt_acl, Opt_iocharset, Opt_prealloc, - Opt_noacsrules, Opt_nocase, Opt_err, }; @@ -271,12 +270,11 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_flag_no("hidden", Opt_nohidden), fsparam_flag_no("hide_dot_files", Opt_hide_dot_files), fsparam_flag_no("windows_names", Opt_windows_names), - fsparam_flag_no("acl", Opt_acl), fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("acl", Opt_acl), + fsparam_string("iocharset", Opt_iocharset), fsparam_flag_no("prealloc", Opt_prealloc), - fsparam_flag_no("acsrules", Opt_noacsrules), fsparam_flag_no("nocase", Opt_nocase), - fsparam_string("iocharset", Opt_iocharset), {} }; @@ -366,19 +364,20 @@ static int ntfs_fs_parse_param(struct fs_context *fc, case Opt_windows_names: opts->windows_names = result.negated ? 0 : 1; break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; case Opt_acl: if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else - return invalf(fc, "ntfs3: Support for ACL not compiled in!"); + return invalf( + fc, "ntfs3: Support for ACL not compiled in!"); #endif else fc->sb_flags &= ~SB_POSIXACL; break; - case Opt_showmeta: - opts->showmeta = result.negated ? 0 : 1; - break; case Opt_iocharset: kfree(opts->nls_name); opts->nls_name = param->string; @@ -387,9 +386,6 @@ static int ntfs_fs_parse_param(struct fs_context *fc, case Opt_prealloc: opts->prealloc = result.negated ? 0 : 1; break; - case Opt_noacsrules: - opts->noacsrules = result.negated ? 1 : 0; - break; case Opt_nocase: opts->nocase = result.negated ? 1 : 0; break; @@ -409,24 +405,29 @@ static int ntfs_fs_reconfigure(struct fs_context *fc) ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + errorf(fc, + "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); return -EINVAL; } new_opts->nls = ntfs_load_nls(new_opts->nls_name); if (IS_ERR(new_opts->nls)) { new_opts->nls = NULL; - errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + errorf(fc, "ntfs3: Cannot load iocharset %s", + new_opts->nls_name); return -EINVAL; } if (new_opts->nls != sbi->options->nls) - return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + return invalf( + fc, + "ntfs3: Cannot use different iocharset when remounting!"); sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && !new_opts->force) { - errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + errorf(fc, + "ntfs3: Volume is dirty and \"force\" flag is not set!"); return -EINVAL; } @@ -544,40 +545,38 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); - if (opts->fmask) - seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); + seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid)); if (opts->dmask) seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff); - if (opts->nls) - seq_printf(m, ",iocharset=%s", opts->nls->charset); - else - seq_puts(m, ",iocharset=utf8"); + if (opts->fmask) + seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->force) + seq_puts(m, ",force"); if (opts->sparse) seq_puts(m, ",sparse"); - if (opts->showmeta) - seq_puts(m, ",showmeta"); if (opts->nohidden) seq_puts(m, ",nohidden"); - if (opts->windows_names) - seq_puts(m, ",windows_names"); if (opts->hide_dot_files) seq_puts(m, ",hide_dot_files"); - if (opts->force) - seq_puts(m, ",force"); - if (opts->noacsrules) - seq_puts(m, ",noacsrules"); - if (opts->prealloc) - seq_puts(m, ",prealloc"); + if (opts->windows_names) + seq_puts(m, ",windows_names"); + if (opts->showmeta) + seq_puts(m, ",showmeta"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); + if (opts->nls) + seq_printf(m, ",iocharset=%s", opts->nls->charset); + else + seq_puts(m, ",iocharset=utf8"); + if (opts->prealloc) + seq_puts(m, ",prealloc"); + if (opts->nocase) + seq_puts(m, ",nocase"); return 0; } @@ -706,7 +705,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ - return 1U << -(s8)boot->sectors_per_clusters; + return 1U << (-(s8)boot->sectors_per_clusters); return -EINVAL; } @@ -724,6 +723,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct buffer_head *bh; struct MFT_REC *rec; u16 fn, ao; + u8 cluster_bits; sbi->volume.blocks = dev_size >> PAGE_SHIFT; @@ -734,48 +734,81 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, err = -EINVAL; boot = (struct NTFS_BOOT *)bh->b_data; - if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) + if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { + ntfs_err(sb, "Boot's signature is not NTFS."); goto out; + } /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/ /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1]) * goto out; */ - boot_sector_size = (u32)boot->bytes_per_sector[1] << 8; - if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE || + boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) | + boot->bytes_per_sector[0]; + if (boot_sector_size < SECTOR_SIZE || !is_power_of_2(boot_sector_size)) { + ntfs_err(sb, "Invalid bytes per sector %u.", boot_sector_size); goto out; } /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); - if ((int)sct_per_clst < 0) - goto out; - if (!is_power_of_2(sct_per_clst)) + if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) { + ntfs_err(sb, "Invalid sectors per cluster %u.", sct_per_clst); goto out; + } + + sbi->cluster_size = boot_sector_size * sct_per_clst; + sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size); + sbi->cluster_mask = sbi->cluster_size - 1; + sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; mlcn = le64_to_cpu(boot->mft_clst); mlcn2 = le64_to_cpu(boot->mft2_clst); sectors = le64_to_cpu(boot->sectors_per_volume); - if (mlcn * sct_per_clst >= sectors) + if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) { + ntfs_err( + sb, + "Start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", + mlcn, mlcn2, sectors); goto out; + } - if (mlcn2 * sct_per_clst >= sectors) - goto out; + sbi->record_size = record_size = + boot->record_size < 0 ? 1 << (-boot->record_size) : + (u32)boot->record_size << cluster_bits; + sbi->record_bits = blksize_bits(record_size); + sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes /* Check MFT record size. */ - if ((boot->record_size < 0 && - SECTOR_SIZE > (2U << (-boot->record_size))) || - (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) { + if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) { + ntfs_err(sb, "Invalid bytes per MFT record %u (%d).", + record_size, boot->record_size); + goto out; + } + + if (record_size > MAXIMUM_BYTES_PER_MFT) { + ntfs_err(sb, "Unsupported bytes per MFT record %u.", + record_size); goto out; } + sbi->index_size = boot->index_size < 0 ? + 1u << (-boot->index_size) : + (u32)boot->index_size << cluster_bits; + /* Check index record size. */ - if ((boot->index_size < 0 && - SECTOR_SIZE > (2U << (-boot->index_size))) || - (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) { + if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { + ntfs_err(sb, "Invalid bytes per index %u(%d).", sbi->index_size, + boot->index_size); + goto out; + } + + if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) { + ntfs_err(sb, "Unsupported bytes per index %u.", + sbi->index_size); goto out; } @@ -791,53 +824,36 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (boot_sector_size != sector_size) { ntfs_warn( sb, - "Different NTFS' sector size (%u) and media sector size (%u)", + "Different NTFS sector size (%u) and media sector size (%u).", boot_sector_size, sector_size); dev_size += sector_size - 1; } - sbi->cluster_size = boot_sector_size * sct_per_clst; - sbi->cluster_bits = blksize_bits(sbi->cluster_size); - - sbi->mft.lbo = mlcn << sbi->cluster_bits; - sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; + sbi->mft.lbo = mlcn << cluster_bits; + sbi->mft.lbo2 = mlcn2 << cluster_bits; /* Compare boot's cluster and sector. */ - if (sbi->cluster_size < boot_sector_size) + if (sbi->cluster_size < boot_sector_size) { + ntfs_err(sb, "Invalid bytes per cluster (%u).", + sbi->cluster_size); goto out; + } /* Compare boot's cluster and media sector. */ if (sbi->cluster_size < sector_size) { /* No way to use ntfs_get_block in this case. */ ntfs_err( sb, - "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).", sbi->cluster_size, sector_size); goto out; } - sbi->cluster_mask = sbi->cluster_size - 1; - sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; - sbi->record_size = record_size = boot->record_size < 0 - ? 1 << (-boot->record_size) - : (u32)boot->record_size - << sbi->cluster_bits; - - if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE) - goto out; - - sbi->record_bits = blksize_bits(record_size); - sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes - sbi->max_bytes_per_attr = record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) - ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - ALIGN(sizeof(enum ATTR_TYPE), 8); - sbi->index_size = boot->index_size < 0 - ? 1u << (-boot->index_size) - : (u32)boot->index_size << sbi->cluster_bits; - sbi->volume.ser_num = le64_to_cpu(boot->serial_num); /* Warning if RAW volume. */ @@ -847,18 +863,18 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, gb0 = format_size_gb(dev_size, &mb0); ntfs_warn( sb, - "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only", + "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.", gb, mb, gb0, mb0); sb->s_flags |= SB_RDONLY; } - clusters = sbi->volume.size >> sbi->cluster_bits; + clusters = sbi->volume.size >> cluster_bits; #ifndef CONFIG_NTFS3_64BIT_CLUSTER /* 32 bits per cluster. */ if (clusters >> 32) { ntfs_notice( sb, - "NTFS %u.%02u Gb is too big to use 32 bits per cluster", + "NTFS %u.%02u Gb is too big to use 32 bits per cluster.", gb, mb); goto out; } @@ -892,17 +908,17 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits; /* Maximum size for normal files. */ - sbi->maxbytes = (clusters << sbi->cluster_bits) - 1; + sbi->maxbytes = (clusters << cluster_bits) - 1; #ifdef CONFIG_NTFS3_64BIT_CLUSTER - if (clusters >= (1ull << (64 - sbi->cluster_bits))) + if (clusters >= (1ull << (64 - cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ - sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; + sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits; #endif /* @@ -910,7 +926,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, * It would be nice if we are able to allocate 1/8 of * total clusters for MFT but not more then 512 MB. */ - sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3); + sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3); err = 0; @@ -928,6 +944,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; + struct ntfs_mount_options *options; struct inode *inode; struct ntfs_inode *ni; size_t i, tt, bad_len, bad_frags; @@ -942,7 +959,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.high = 0; sbi->sb = sb; - sbi->options = fc->fs_private; + sbi->options = options = fc->fs_private; fc->fs_private = NULL; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -950,12 +967,12 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL; + sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL; - sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); - if (IS_ERR(sbi->options->nls)) { - sbi->options->nls = NULL; - errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + options->nls = ntfs_load_nls(options->nls_name); + if (IS_ERR(options->nls)) { + options->nls = NULL; + errorf(fc, "Cannot load nls %s", options->nls_name); err = -EINVAL; goto out; } @@ -980,8 +997,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $Volume."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Volume (%d).", err); goto out; } @@ -1007,13 +1024,9 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); - if (!attr || is_attr_ext(attr)) { - err = -EINVAL; - goto put_inode_out; - } - - info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); - if (!info) { + if (!attr || is_attr_ext(attr) || + !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) { + ntfs_err(sb, "$Volume is corrupted."); err = -EINVAL; goto put_inode_out; } @@ -1028,13 +1041,13 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $MFTMirr."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $MFTMirr (%d).", err); goto out; } - sbi->mft.recs_mirr = - ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits; + sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >> + sbi->record_bits; iput(inode); @@ -1043,8 +1056,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load \x24LogFile."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load \x24LogFile (%d).", err); goto out; } @@ -1064,7 +1077,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!sb_rdonly(sb) && !sbi->options->force) { + if (!sb_rdonly(sb) && !options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1079,8 +1092,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $MFT."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $MFT (%d).", err); goto out; } @@ -1095,8 +1108,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto put_inode_out; err = ni_load_all_mi(ni); - if (err) + if (err) { + ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err); goto put_inode_out; + } sbi->mft.ni = ni; @@ -1105,8 +1120,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $Bitmap."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Bitmap (%d).", err); goto out; } @@ -1120,22 +1135,25 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Check bitmap boundary. */ tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { + ntfs_err(sb, "$Bitmap is corrupted."); err = -EINVAL; goto put_inode_out; } - /* Not necessary. */ - sbi->used.bitmap.set_tail = true; err = wnd_init(&sbi->used.bitmap, sb, tt); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err); goto put_inode_out; + } iput(inode); /* Compute the MFT zone. */ err = ntfs_refresh_zone(sbi); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize MFT zone (%d).", err); goto out; + } /* Load $BadClus. */ ref.low = cpu_to_le32(MFT_REC_BADCLUST); @@ -1180,15 +1198,23 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_ATTR); inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $AttrDef -> %d", err); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $AttrDef (%d)", err); goto out; } - if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { + /* + * Typical $AttrDef contains up to 20 entries. + * Check for extremely large/small size. + */ + if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) || + inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) { + ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).", + inode->i_size); err = -EINVAL; goto put_inode_out; } + bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); if (!t) { @@ -1202,6 +1228,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (IS_ERR(page)) { err = PTR_ERR(page); + ntfs_err(sb, "Failed to read $AttrDef (%d).", err); goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), @@ -1209,6 +1236,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ntfs_unmap_page(page); if (!idx && ATTR_STD != t->type) { + ntfs_err(sb, "$AttrDef is corrupted."); err = -EINVAL; goto put_inode_out; } @@ -1243,13 +1271,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $UpCase (%d).", err); goto out; } if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; + ntfs_err(sb, "$UpCase is corrupted."); goto put_inode_out; } @@ -1260,6 +1289,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (IS_ERR(page)) { err = PTR_ERR(page); + ntfs_err(sb, "Failed to read $UpCase (%d).", err); goto put_inode_out; } @@ -1285,23 +1315,31 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (is_ntfs3(sbi)) { /* Load $Secure. */ err = ntfs_security_init(sbi); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure (%d).", err); goto out; + } /* Load $Extend. */ err = ntfs_extend_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend."); goto load_root; + } - /* Load $Extend\$Reparse. */ + /* Load $Extend/$Reparse. */ err = ntfs_reparse_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend/$Reparse."); goto load_root; + } - /* Load $Extend\$ObjId. */ + /* Load $Extend/$ObjId. */ err = ntfs_objid_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend/$ObjId."); goto load_root; + } } load_root: @@ -1309,12 +1347,21 @@ load_root: ref.low = cpu_to_le32(MFT_REC_ROOT); ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); - if (IS_ERR(inode) || !inode->i_op) { - ntfs_err(sb, "Failed to load root."); - err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load root (%d).", err); goto out; } + /* + * Final check. Looks like this case should never occurs. + */ + if (!inode->i_op) { + err = -EINVAL; + ntfs_err(sb, "Failed to load root (%d).", err); + goto put_inode_out; + } + sb->s_root = d_make_root(inode); if (!sb->s_root) { err = -ENOMEM; @@ -1434,7 +1481,7 @@ static const struct fs_context_operations ntfs_context_ops = { }; /* - * ntfs_init_fs_context - Initialize spi and opts + * ntfs_init_fs_context - Initialize sbi and opts * * This will called when mount/remount. We will first initialize * options so that if remount we can use just that. @@ -1507,7 +1554,8 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL)) pr_info("ntfs3: Enabled Linux POSIX ACLs support\n"); if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER)) - pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); + pr_notice( + "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); @@ -1550,7 +1598,9 @@ MODULE_DESCRIPTION("ntfs3 read/write filesystem"); MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); #endif #ifdef CONFIG_NTFS3_64BIT_CLUSTER -MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this"); +MODULE_INFO( + cluster, + "Warning: Activated 64 bits per cluster. Windows does not support this"); #endif #ifdef CONFIG_NTFS3_LZX_XPRESS MODULE_INFO(compression, "Read-only lzx/xpress compression included"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7324cf924932..c3de60a4543f 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -23,8 +23,8 @@ static inline size_t unpacked_ea_size(const struct EA_FULL *ea) { - return ea->size ? le32_to_cpu(ea->size) - : ALIGN(struct_size(ea, name, + return ea->size ? le32_to_cpu(ea->size) : + ALIGN(struct_size(ea, name, 1 + ea->name_len + le16_to_cpu(ea->elength)), 4); @@ -296,7 +296,8 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags, bool locked) + size_t val_size, int flags, bool locked, + __le16 *ea_size) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -410,7 +411,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, /* * 1. Check ea_info.size_pack for overflow. - * 2. New attibute size must fit value from $AttrDef + * 2. New attribute size must fit value from $AttrDef */ if (new_pack > 0xffff || size > sbi->ea_max_size) { ntfs_inode_warn( @@ -504,6 +505,8 @@ update_ea: if (ea_info.size_pack != size_pack) ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + if (ea_size) + *ea_size = ea_info.size_pack; mark_inode_dirty(&ni->vfs_inode); out: @@ -517,9 +520,14 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, - int locked) + +/* + * ntfs_get_acl - inode_operations::get_acl + */ +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) { + struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); const char *name; size_t name_len; @@ -542,13 +550,11 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; } - if (!locked) - ni_lock(ni); + ni_lock(ni); err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req); - if (!locked) - ni_unlock(ni); + ni_unlock(ni); /* Translate extended attribute to acl. */ if (err >= 0) { @@ -567,17 +573,6 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, return acl; } -/* - * ntfs_get_acl - inode_operations::get_acl - */ -struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) -{ - if (rcu) - return ERR_PTR(-ECHILD); - - return ntfs_get_acl_ex(inode, type, 0); -} - static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) @@ -633,7 +628,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0, NULL); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. */ if (!err) { @@ -712,20 +707,6 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) } /* - * ntfs_permission - inode_operations::permission - */ -int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, - int mask) -{ - if (ntfs_sb(inode->i_sb)->options->noacsrules) { - /* "No access rules" mode - Allow all changes. */ - return 0; - } - - return generic_permission(idmap, inode, mask); -} - -/* * ntfs_listxattr - inode_operations::listxattr */ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -780,7 +761,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, err = sizeof(u32); *(u32 *)buffer = le32_to_cpu(ni->std_fa); if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) - *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer); + *(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer); } goto out; } @@ -857,7 +838,7 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, if (size != sizeof(u32)) goto out; if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) - new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value)); + new_fa = cpu_to_le32(be32_to_cpu(*(__be32 *)value)); else new_fa = cpu_to_le32(*(u32 *)value); @@ -937,7 +918,8 @@ set_new_fa: } /* Deal with NTFS extended attribute. */ - err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0); + err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0, + NULL); out: inode->i_ctime = current_time(inode); @@ -951,7 +933,7 @@ out: * * save uid/gid/mode in xattr */ -int ntfs_save_wsl_perm(struct inode *inode) +int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size) { int err; __le32 value; @@ -960,26 +942,26 @@ int ntfs_save_wsl_perm(struct inode *inode) ni_lock(ni); value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, true); /* true == already locked. */ + sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; } diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 811a6ea374bb..b1550ba73f96 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -803,8 +803,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static noinline_for_stack int +ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; @@ -840,27 +840,26 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - int new_clusters; - int status; - struct ocfs2_space_resv sr; - struct ocfs2_new_group_input input; - struct reflink_arguments args; - const char __user *old_path; - const char __user *new_path; - bool preserve; - struct ocfs2_info info; void __user *argp = (void __user *)arg; + int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: + { + struct ocfs2_space_resv sr; + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); + } case OCFS2_IOC_GROUP_EXTEND: + { + int new_clusters; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -873,8 +872,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + { + struct ocfs2_new_group_input input; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -887,7 +890,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_REFLINK: + { + struct reflink_arguments args; + const char __user *old_path; + const char __user *new_path; + bool preserve; + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; @@ -895,11 +905,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + } case OCFS2_IOC_INFO: + { + struct ocfs2_info info; + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); + } case FITRIM: { struct super_block *sb = inode->i_sb; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aefdf1d3be7c..9014bbcc8031 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac) struct iov_iter iter; struct inode *inode = rac->mapping->host; struct xarray *i_pages; - struct page *page; + struct folio *folio; loff_t new_start = readahead_pos(rac); int ret; size_t new_len = 0; @@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac) ret = 0; /* clean up. */ - while ((page = readahead_page(rac))) { - page_endio(page, false, ret); - put_page(page); + while ((folio = readahead_folio(rac))) { + if (!ret) + folio_mark_uptodate(folio); + folio_unlock(folio); } } diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b0315d34c58..d35bbf35a874 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,6 +91,7 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" @@ -219,6 +220,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); + + seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0'); } void render_sigset_t(struct seq_file *m, const char *header, @@ -423,6 +426,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } +static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -438,6 +446,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); + task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index 5e0e0ccd47aa..05452c3b9872 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> +#include <linux/ksm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -699,7 +700,6 @@ int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return error; setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); return 0; } @@ -3207,6 +3207,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8379593fa4bb..42ae38ff6e7e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -127,7 +127,6 @@ static int proc_notify_change(struct mnt_idmap *idmap, return error; setattr_copy(&nop_mnt_idmap, inode, iattr); - mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 71157ee35c1a..25b44b303b35 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -24,7 +24,7 @@ #include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> @@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, *i = ALIGN(*i + descsz, 4); } -static ssize_t -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { - char *buf = file->private_data; + loff_t *fpos = &iocb->ki_pos; size_t phdrs_offset, notes_offset, data_offset; size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; @@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) size_t tsz; int nphdr; unsigned long start; + size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; int ret = 0; @@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) { ret = -EFAULT; goto out; } - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, - tsz)) { + if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, + iter) != tsz) { kfree(phdrs); ret = -EFAULT; goto out; } kfree(phdrs); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; goto out; } kfree(notes); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } if (!m) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) switch (m->type) { case KCORE_VMALLOC: - vread(buf, (char *)start, tsz); - /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; + { + const char *src = (char *)start; + size_t read = 0, left = tsz; + + /* + * vmalloc uses spinlocks, so we optimistically try to + * read memory. If this fails, fault pages in and try + * again until we are done. + */ + while (true) { + read += vread_iter(iter, src, left); + if (read == tsz) + break; + + src += read; + left -= read; + + if (fault_in_iov_iter_writeable(iter, left)) { + ret = -EFAULT; + goto out; + } } break; + } case KCORE_USER: /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) { + if (copy_to_iter((char *)start, tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (!page || PageOffline(page) || is_page_hwpoison(page) || !pfn_is_ram(pfn)) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) case KCORE_VMEMMAP: case KCORE_TEXT: /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. + * We use _copy_to_iter() to bypass usermode hardening + * which would otherwise prevent this operation. */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + if (_copy_to_iter((char *)start, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; } break; default: pr_warn_once("Unhandled KCORE type: %d\n", m->type); - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) skip: buflen -= tsz; *fpos += tsz; - buffer += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } @@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp) if (ret) return ret; - filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } -static int release_kcore(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - return 0; -} - static const struct proc_ops kcore_proc_ops = { - .proc_read = read_kcore, + .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, - .proc_release = release_kcore, .proc_lseek = default_llseek, }; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 440960110a42..b43d0bd42762 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -6,6 +6,7 @@ #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> @@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); +#ifdef CONFIG_MEMTEST + if (early_memtest_done) { + unsigned long early_memtest_bad_size_kb; + + early_memtest_bad_size_kb = early_memtest_bad_size>>10; + if (early_memtest_bad_size && !early_memtest_bad_size_kb) + early_memtest_bad_size_kb = 1; + /* When 0 is reported, it means there actually was a successful test */ + seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); + } +#endif + #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5851eb5bc726..8038833ff5b0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -841,7 +841,6 @@ static int proc_sys_setattr(struct mnt_idmap *idmap, return error; setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); return 0; } @@ -1283,11 +1282,43 @@ out: return err; } +/* Find the directory for the ctl_table. If one is not found create it. */ +static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) +{ + const char *name, *nextname; + + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + break; + } + return dir; +} + /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * @table: the top-level table structure without any child. This table + * should not be free'd after registration. So it should not be + * used on stack. It can either be a global or dynamically allocated + * by the caller and free'd later after sysctl unregistration. * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1308,9 +1339,12 @@ out: * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. + * under /proc; non-leaf nodes (where child is not NULL) are not allowed, + * sysctl_check_table() verifies this. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - @@ -1331,7 +1365,6 @@ struct ctl_table_header *__register_sysctl_table( { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; - const char *name, *nextname; struct ctl_dir *dir; struct ctl_table *entry; struct ctl_node *node; @@ -1352,28 +1385,13 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); dir = &set->dir; - /* Reference moved down the diretory tree get_subdir */ + /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); - /* Find the directory for the ctl_table */ - for (name = path; name; name = nextname) { - int namelen; - nextname = strchr(name, '/'); - if (nextname) { - namelen = nextname - name; - nextname++; - } else { - namelen = strlen(name); - } - if (namelen == 0) - continue; - - dir = get_subdir(dir, name, namelen); - if (IS_ERR(dir)) - goto fail; - } - + dir = sysctl_mkdir_p(dir, path); + if (IS_ERR(dir)) + goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; @@ -1394,8 +1412,15 @@ fail: /** * register_sysctl - register a sysctl table - * @path: The path to the directory the sysctl table is in. - * @table: the table structure + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The calller must ensure the life of the @table + * will be kept during the lifetime use of the syctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the syctl + * registration. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1411,8 +1436,11 @@ EXPORT_SYMBOL(register_sysctl); /** * __register_sysctl_init() - register sysctl table to path - * @path: path name for sysctl base - * @table: This is the sysctl table that needs to be registered to the path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. + * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @@ -1424,10 +1452,7 @@ EXPORT_SYMBOL(register_sysctl); * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * - * Context: Can only be called after your respective sysctl base path has been - * registered. So for instance, most base directories are registered early on - * init before init levels are processed through proc_sys_init() and - * sysctl_init_bases(). + * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1550,24 +1575,18 @@ out: } /** - * __register_sysctl_paths - register a sysctl table hierarchy - * @set: Sysctl tree to register on - * @path: The path to the directory the sysctl table is in. + * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_table for more details. + * We are slowly deprecating this call so avoid its use. */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_set *set, - const struct ctl_path *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) { struct ctl_table *ctl_table_arg = table; int nr_subheaders = count_subheaders(table); struct ctl_table_header *header = NULL, **subheaders, **subheader; - const struct ctl_path *component; char *new_path, *pos; pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); @@ -1575,11 +1594,6 @@ struct ctl_table_header *__register_sysctl_paths( return NULL; pos[0] = '\0'; - for (component = path; component->procname; component++) { - pos = append_path(new_path, pos, component->procname); - if (!pos) - goto out; - } while (table->procname && table->child && !table[1].procname) { pos = append_path(new_path, pos, table->procname); if (!pos) @@ -1587,7 +1601,7 @@ struct ctl_table_header *__register_sysctl_paths( table = table->child; } if (nr_subheaders == 1) { - header = __register_sysctl_table(set, new_path, table); + header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table); if (header) header->ctl_table_arg = ctl_table_arg; } else { @@ -1601,7 +1615,7 @@ struct ctl_table_header *__register_sysctl_paths( header->ctl_table_arg = ctl_table_arg; if (register_leaf_sysctl_tables(new_path, pos, &subheader, - set, table)) + &sysctl_table_root.default_set, table)) goto err_register_leaves; } @@ -1620,40 +1634,6 @@ err_register_leaves: header = NULL; goto out; } - -/** - * register_sysctl_paths - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root.default_set, - path, table); -} -EXPORT_SYMBOL(register_sysctl_paths); - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} EXPORT_SYMBOL(register_sysctl_table); int __register_sysctl_base(struct ctl_table *base_table) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 4fb8729a68d4..da60956b2915 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,30 +22,6 @@ #define arch_irq_stat() 0 #endif -#ifdef arch_idle_time - -u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 idle; - - idle = kcs->cpustat[CPUTIME_IDLE]; - if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); - return idle; -} - -static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 iowait; - - iowait = kcs->cpustat[CPUTIME_IOWAIT]; - if (cpu_online(cpu) && nr_iowait_cpu(cpu)) - iowait += arch_idle_time(cpu); - return iowait; -} - -#else - u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -78,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) return iowait; } -#endif - static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6a96e1713fd5..420510f6a545 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (start >= vma->vm_end) return; -#ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, ops = &smaps_shmem_walk_ops; } } -#endif + /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); @@ -1689,8 +1688,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; - if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) - start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); + mmap_read_unlock(mm); + } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 12af614f33ce..03f5963914a1 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -339,7 +339,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) return acc; } - /* Read Elf note segment */ + /* Read ELF note segment */ if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; @@ -1109,7 +1109,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1152,7 +1152,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1188,7 +1188,7 @@ static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, loff_t vmcore_off; struct vmcore *m; - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { @@ -1213,7 +1213,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -1269,7 +1269,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -1376,12 +1376,12 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, } /** - * vmcoredd_update_program_headers - Update all Elf program headers + * vmcoredd_update_program_headers - Update all ELF program headers * @elfptr: Pointer to elf header * @elfnotesz: Size of elf notes aligned to page size * @vmcoreddsz: Size of device dumps to be added to elf note header * - * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Determine type of ELF header (Elf64 or Elf32) and update the elf note size. * Also update the offsets of all the program headers after the elf note header. */ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, @@ -1439,10 +1439,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, /** * vmcoredd_update_size - Update the total size of the device dumps and update - * Elf header + * ELF header * @dump_size: Size of the current device dump to be added to total size * - * Update the total size of all the device dumps and update the Elf program + * Update the total size of all the device dumps and update the ELF program * headers. Calculate the new offsets for the vmcore list and update the * total vmcore size. */ @@ -1466,7 +1466,7 @@ static void vmcoredd_update_size(size_t dump_size) * @data: dump info. * * Allocate a buffer and invoke the calling driver's dump collect routine. - * Write Elf note at the beginning of the buffer to indicate vmcore device + * Write ELF note at the beginning of the buffer to indicate vmcore device * dump and add the dump to global list. */ int vmcore_add_device_dump(struct vmcoredd_data *data) diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index ab82e5f05346..55f139afa327 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -7,10 +7,9 @@ #include <linux/device.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/rtmutex.h> #include "internal.h" -static DEFINE_RT_MUTEX(pmsg_lock); +static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -29,9 +28,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, if (!access_ok(buf, count)) return -EFAULT; - rt_mutex_lock(&pmsg_lock); + mutex_lock(&pmsg_lock); ret = psinfo->write_user(&record, buf); - rt_mutex_unlock(&pmsg_lock); + mutex_unlock(&pmsg_lock); return ret ? ret : count; } @@ -64,7 +63,7 @@ void pstore_register_pmsg(void) goto err; } - pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + pmsg_class = class_create(PMSG_NAME); if (IS_ERR(pmsg_class)) { pr_err("device class file already in use\n"); goto err_class; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2f67516bb9bf..9fbb9b5256f7 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index ace133cf6072..3b43a51e6f7e 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -327,17 +327,18 @@ struct smb2_tree_connect_req { #define SMB2_SHAREFLAG_NO_CACHING 0x00000030 #define SHI1005_FLAGS_DFS 0x00000001 #define SHI1005_FLAGS_DFS_ROOT 0x00000002 -#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 -#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 -#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 -#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 -#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 -#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SMB2_SHAREFLAG_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SMB2_SHAREFLAG_FORCE_SHARED_DELETE 0x00000200 +#define SMB2_SHAREFLAG_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SMB2_SHAREFLAG_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SMB2_SHAREFLAG_FORCE_LEVELII_OPLOCK 0x00001000 +#define SMB2_SHAREFLAG_ENABLE_HASH_V1 0x00002000 +#define SMB2_SHAREFLAG_ENABLE_HASH_V2 0x00004000 #define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 #define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ #define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0014FF33 +#define SMB2_SHAREFLAG_ISOLATED_TRANSPORT 0x00200000 +#define SHI1005_FLAGS_ALL 0x0034FF33 /* Possible share capabilities */ #define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ @@ -1171,6 +1172,34 @@ struct create_posix { __u32 Reserved; } __packed; +/* See MS-SMB2 2.2.13.2.3 and MS-SMB2 2.2.13.2.4 */ +struct create_durable { + struct create_context ccontext; + __u8 Name[8]; + union { + __u8 Reserved[16]; + struct { + __u64 PersistentFileId; + __u64 VolatileFileId; + } Fid; + } Data; +} __packed; + +/* See MS-SMB2 2.2.13.2.5 */ +struct create_mxac_req { + struct create_context ccontext; + __u8 Name[8]; + __le64 Timestamp; +} __packed; + +/* See MS-SMB2 2.2.14.2.5 */ +struct create_mxac_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le32 QueryStatus; + __le32 MaximalAccess; +} __packed; + #define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) #define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) #define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) @@ -1180,6 +1209,7 @@ struct create_posix { #define SMB2_LEASE_KEY_SIZE 16 +/* See MS-SMB2 2.2.13.2.8 */ struct lease_context { __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; @@ -1187,6 +1217,7 @@ struct lease_context { __le64 LeaseDuration; } __packed; +/* See MS-SMB2 2.2.13.2.10 */ struct lease_context_v2 { __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; @@ -1210,6 +1241,15 @@ struct create_lease_v2 { __u8 Pad[4]; } __packed; +/* See MS-SMB2 2.2.14.2.9 */ +struct create_disk_id_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le64 DiskFileId; + __le64 VolumeId; + __u8 Reserved[16]; +} __packed; + /* See MS-SMB2 2.2.31 and 2.2.32 */ struct smb2_ioctl_req { struct smb2_hdr hdr; diff --git a/fs/super.c b/fs/super.c index 04bc62ab7dfe..34afe411cf2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 3a92e6af69b2..75461777c466 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -217,7 +217,6 @@ static void compr_exit(struct ubifs_compressor *compr) { if (compr->capi_name) crypto_free_comp(compr->cc); - return; } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 1505539f6fe9..ef0499edc248 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -358,7 +358,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) umode_t mode = S_IFCHR | WHITEOUT_MODE; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct fscrypt_name nm; /* * Create an inode('nlink = 1') for whiteout without updating journal, @@ -369,10 +368,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); - if (err) - return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -395,7 +390,6 @@ out_inode: make_bad_inode(inode); iput(inode); out_free: - fscrypt_free_filename(&nm); ubifs_err(c, "cannot create whiteout file, error %d", err); return ERR_PTR(err); } @@ -492,6 +486,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); return finish_open_simple(file, 0); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 2469f72eeaab..6b7d95b65f4b 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -44,6 +44,33 @@ enum { NOT_ON_MEDIA = 3, }; +static void do_insert_old_idx(struct ubifs_info *c, + struct ubifs_old_idx *old_idx) +{ + struct ubifs_old_idx *o; + struct rb_node **p, *parent = NULL; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (old_idx->lnum < o->lnum) + p = &(*p)->rb_left; + else if (old_idx->lnum > o->lnum) + p = &(*p)->rb_right; + else if (old_idx->offs < o->offs) + p = &(*p)->rb_left; + else if (old_idx->offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err(c, "old idx added twice!"); + kfree(old_idx); + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); +} + /** * insert_old_idx - record an index node obsoleted since the last commit start. * @c: UBIFS file-system description object @@ -69,35 +96,15 @@ enum { */ static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) { - struct ubifs_old_idx *old_idx, *o; - struct rb_node **p, *parent = NULL; + struct ubifs_old_idx *old_idx; old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); if (unlikely(!old_idx)) return -ENOMEM; old_idx->lnum = lnum; old_idx->offs = offs; + do_insert_old_idx(c, old_idx); - p = &c->old_idx.rb_node; - while (*p) { - parent = *p; - o = rb_entry(parent, struct ubifs_old_idx, rb); - if (lnum < o->lnum) - p = &(*p)->rb_left; - else if (lnum > o->lnum) - p = &(*p)->rb_right; - else if (offs < o->offs) - p = &(*p)->rb_left; - else if (offs > o->offs) - p = &(*p)->rb_right; - else { - ubifs_err(c, "old idx added twice!"); - kfree(old_idx); - return 0; - } - } - rb_link_node(&old_idx->rb, parent, p); - rb_insert_color(&old_idx->rb, &c->old_idx); return 0; } @@ -199,23 +206,6 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); - ubifs_assert(c, !ubifs_zn_obsolete(znode)); - __set_bit(OBSOLETE_ZNODE, &znode->flags); - - if (znode->level != 0) { - int i; - const int n = zn->child_cnt; - - /* The children now have new parent */ - for (i = 0; i < n; i++) { - struct ubifs_zbranch *zbr = &zn->zbranch[i]; - - if (zbr->znode) - zbr->znode->parent = zn; - } - } - - atomic_long_inc(&c->dirty_zn_cnt); return zn; } @@ -234,6 +224,42 @@ static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) } /** + * replace_znode - replace old znode with new znode. + * @c: UBIFS file-system description object + * @new_zn: new znode + * @old_zn: old znode + * @zbr: the branch of parent znode + * + * Replace old znode with new znode in TNC. + */ +static void replace_znode(struct ubifs_info *c, struct ubifs_znode *new_zn, + struct ubifs_znode *old_zn, struct ubifs_zbranch *zbr) +{ + ubifs_assert(c, !ubifs_zn_obsolete(old_zn)); + __set_bit(OBSOLETE_ZNODE, &old_zn->flags); + + if (old_zn->level != 0) { + int i; + const int n = new_zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *child = &new_zn->zbranch[i]; + + if (child->znode) + child->znode->parent = new_zn; + } + } + + zbr->znode = new_zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + atomic_long_inc(&c->dirty_zn_cnt); +} + +/** * dirty_cow_znode - ensure a znode is not being committed. * @c: UBIFS file-system description object * @zbr: branch of znode to check @@ -265,28 +291,32 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, return zn; if (zbr->len) { - err = insert_old_idx(c, zbr->lnum, zbr->offs); - if (unlikely(err)) - /* - * Obsolete znodes will be freed by tnc_destroy_cnext() - * or free_obsolete_znodes(), copied up znodes should - * be added back to tnc and freed by - * ubifs_destroy_tnc_subtree(). - */ + struct ubifs_old_idx *old_idx; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) { + err = -ENOMEM; goto out; + } + old_idx->lnum = zbr->lnum; + old_idx->offs = zbr->offs; + err = add_idx_dirt(c, zbr->lnum, zbr->len); - } else - err = 0; + if (err) { + kfree(old_idx); + goto out; + } -out: - zbr->znode = zn; - zbr->lnum = 0; - zbr->offs = 0; - zbr->len = 0; + do_insert_old_idx(c, old_idx); + } + + replace_znode(c, zn, znode, zbr); - if (unlikely(err)) - return ERR_PTR(err); return zn; + +out: + kfree(zn); + return ERR_PTR(err); } /** diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 67aaadc3ab07..8395066341a4 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -214,4 +214,3 @@ void utf8_unload(struct unicode_map *um) } EXPORT_SYMBOL(utf8_unload); -MODULE_LICENSE("GPL v2"); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 40f9e1a2ebdd..0fd96d6e39ce 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -32,7 +32,22 @@ #include <linux/swapops.h> #include <linux/miscdevice.h> -int sysctl_unprivileged_userfaultfd __read_mostly; +static int sysctl_unprivileged_userfaultfd __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_userfaultfd_table[] = { + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; +#endif static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -108,6 +123,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. + */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { @@ -1629,7 +1659,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) - uffd_wp_range(mm, vma, start, vma_end - start, false); + uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, @@ -1714,6 +1744,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_copy = (struct uffdio_copy __user *) arg; @@ -1740,10 +1771,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, &ctx->mmap_changing, + flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1793,9 +1826,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1875,6 +1908,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1899,13 +1933,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1973,6 +2010,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; @@ -2180,6 +2218,9 @@ static int __init userfaultfd_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once_userfaultfd_ctx); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_userfaultfd_table); +#endif return 0; } __initcall(userfaultfd_init); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 9fac5ea8d0e4..52e1823241fb 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -47,6 +47,33 @@ config XFS_SUPPORT_V4 To continue supporting the old V4 format (crc=0), say Y. To close off an attack surface, say N. +config XFS_SUPPORT_ASCII_CI + bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" + depends on XFS_FS + default y + help + The ASCII case insensitivity filesystem feature only works correctly + on systems that have been coerced into using ISO 8859-1, and it does + not work on extended attributes. The kernel has no visibility into + the locale settings in userspace, so it corrupts UTF-8 names. + Enabling this feature makes XFS vulnerable to mixed case sensitivity + attacks. Because of this, the feature is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect such a filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "ascii-ci=". If the string "ascii-ci=1" is found, the + filesystem is a case-insensitive filesystem. If no such string is + found, please upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + feature will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting case-insensitivity (ascii-ci=1), say Y. + To close off an attack surface, say N. + config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS @@ -93,10 +120,15 @@ config XFS_RT If unsure, say N. +config XFS_DRAIN_INTENTS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 92d88dc3c9f7..16e4eb431230 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y) xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif +xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) @@ -146,6 +148,7 @@ xfs-y += $(addprefix scrub/, \ agheader.o \ alloc.o \ attr.o \ + bitmap.o \ bmap.o \ btree.o \ common.o \ @@ -156,6 +159,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ parent.o \ + readdir.o \ refcount.o \ rmap.o \ scrub.o \ @@ -169,7 +173,6 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ - bitmap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 86696a1c6891..1b078bbbf225 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -81,6 +81,19 @@ xfs_perag_get_tag( return pag; } +/* Get a passive reference to the given perag. */ +struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + ASSERT(atomic_read(&pag->pag_ref) > 0 || + atomic_read(&pag->pag_active_ref) > 0); + + trace_xfs_perag_hold(pag, _RET_IP_); + atomic_inc(&pag->pag_ref); + return pag; +} + void xfs_perag_put( struct xfs_perag *pag) @@ -247,6 +260,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); @@ -372,6 +386,7 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; @@ -408,6 +423,7 @@ xfs_initialize_perag( return 0; out_remove_pag: + xfs_defer_drain_free(&pag->pag_intents_drain); radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: kmem_free(pag); @@ -418,6 +434,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); kmem_free(pag); } return error; @@ -1043,10 +1060,8 @@ xfs_ag_extend_space( if (error) return error; - error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, - be32_to_cpu(agf->agf_length) - len), - len, &XFS_RMAP_OINFO_SKIP_UPDATE, - XFS_AG_RESV_NONE); + error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len, + len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 5e18536dfdce..2e0aef87d633 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -101,6 +101,14 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain pag_intents_drain; #endif /* __KERNEL__ */ }; @@ -134,6 +142,7 @@ void xfs_free_perag(struct xfs_mount *mp); struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); +struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); void xfs_perag_put(struct xfs_perag *pag); /* Active AG references */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203f16c48c19..fdfa08cbf4db 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -233,6 +233,52 @@ xfs_alloc_update( return xfs_btree_update(cur, &rec); } +/* Convert the ondisk btree record to its incore representation. */ +void +xfs_alloc_btrec_to_irec( + const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec) +{ + irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); +} + +/* Simple checks for free space records. */ +xfs_failaddr_t +xfs_alloc_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_perag *pag = cur->bc_ag.pag; + + if (irec->ar_blockcount == 0) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_alloc_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; + + xfs_warn(mp, + "%s Freespace BTree record corruption in AG %d detected at %pS!", + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + cur->bc_ag.pag->pag_agno, fa); + xfs_warn(mp, + "start block 0x%x block count 0x%x", irec->ar_startblock, + irec->ar_blockcount); + return -EFSCORRUPTED; +} + /* * Get the data from the pointed-to record. */ @@ -243,35 +289,23 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat) /* output: success/failure */ { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_alloc_rec_incore irec; union xfs_btree_rec *rec; + xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !(*stat)) return error; - *bno = be32_to_cpu(rec->alloc.ar_startblock); - *len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (*len == 0) - goto out_bad_rec; - - /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, *bno, *len)) - goto out_bad_rec; + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); + *bno = irec.ar_startblock; + *len = irec.ar_blockcount; return 0; - -out_bad_rec: - xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", - pag->pag_agno); - xfs_warn(mp, - "start block 0x%x block count 0x%x", *bno, *len); - return -EFSCORRUPTED; } /* @@ -2405,6 +2439,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } @@ -2421,8 +2456,8 @@ __xfs_free_extent_later( bool skip_discard) { struct xfs_extent_free_item *xefi; -#ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; +#ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -2456,9 +2491,11 @@ __xfs_free_extent_later( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(tp->t_mountp, + trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } @@ -3596,7 +3633,8 @@ xfs_free_extent_fix_freelist( int __xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, @@ -3604,12 +3642,9 @@ __xfs_free_extent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); struct xfs_agf *agf; int error; unsigned int busy_flags = 0; - struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3618,10 +3653,9 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - pag = xfs_perag_get(mp, agno); error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - goto err; + return error; agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { @@ -3635,20 +3669,18 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); + error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, + type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); - xfs_perag_put(pag); return 0; err_release: xfs_trans_brelse(tp, agbp); -err: - xfs_perag_put(pag); return error; } @@ -3666,9 +3698,13 @@ xfs_alloc_query_range_helper( { struct xfs_alloc_query_range_info *query = priv; struct xfs_alloc_rec_incore irec; + xfs_failaddr_t fa; + + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); - irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); - irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); return query->fn(cur, &irec, query->priv); } @@ -3709,13 +3745,16 @@ xfs_alloc_query_all( return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the free space and tell us if the area has no + * records, is fully mapped by records, or is partially filled. + */ int -xfs_alloc_has_record( +xfs_alloc_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -3725,7 +3764,7 @@ xfs_alloc_has_record( memset(&high, 0xFF, sizeof(high)); high.a.ar_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2b246d74c189..5dbb25546d0b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -141,7 +141,8 @@ int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args, int /* error */ __xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, /* length of extent */ const struct xfs_owner_info *oinfo, /* extent owner */ enum xfs_ag_resv_type type, /* block reservation type */ @@ -150,12 +151,13 @@ __xfs_free_extent( static inline int xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - return __xfs_free_extent(tp, bno, len, oinfo, type, false); + return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false); } int /* error */ @@ -179,6 +181,12 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +union xfs_btree_rec; +void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec); +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec); + int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -205,8 +213,8 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); -int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool *exist); +int xfs_alloc_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, void *priv); @@ -235,9 +243,13 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + struct xfs_perag *xefi_pag; unsigned int xefi_flags; }; +void xfs_extent_free_get_group(struct xfs_mount *mp, + struct xfs_extent_free_item *xefi); + #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 0f29c7b1b39f..c65228efed4a 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -260,20 +260,27 @@ STATIC int64_t xfs_bnobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->alloc.ar_startblock); + return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int64_t xfs_cntbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { int64_t diff; + ASSERT(!mask || (mask->alloc.ar_blockcount && + mask->alloc.ar_startblock)); + diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); if (diff) @@ -423,6 +430,19 @@ xfs_cntbt_recs_inorder( be32_to_cpu(r2->alloc.ar_startblock)); } +STATIC enum xbtree_key_contig +xfs_allocbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->alloc.ar_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->alloc.ar_startblock), + be32_to_cpu(key2->alloc.ar_startblock)); +} + static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -443,6 +463,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, + .keys_contiguous = xfs_allocbt_keys_contiguous, }; static const struct xfs_btree_ops xfs_cntbt_ops = { @@ -465,6 +486,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, + .keys_contiguous = NULL, /* not needed right now */ }; /* Allocate most of a new allocation btree cursor. */ @@ -492,9 +514,7 @@ xfs_allocbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 34de6e6898c4..b512de0540d5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1083,6 +1083,34 @@ struct xfs_iread_state { xfs_extnum_t loaded; }; +int +xfs_bmap_complain_bad_rec( + struct xfs_inode *ip, + int whichfork, + xfs_failaddr_t fa, + const struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + const char *forkname; + + switch (whichfork) { + case XFS_DATA_FORK: forkname = "data"; break; + case XFS_ATTR_FORK: forkname = "attr"; break; + case XFS_COW_FORK: forkname = "CoW"; break; + default: forkname = "???"; break; + } + + xfs_warn(mp, + "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!", + ip->i_ino, forkname, fa); + xfs_warn(mp, + "Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x", + irec->br_startoff, irec->br_startblock, irec->br_blockcount, + irec->br_state); + + return -EFSCORRUPTED; +} + /* Stuff every bmbt record from this block into the incore extent map. */ static int xfs_iread_bmbt_block( @@ -1125,7 +1153,8 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, fa, + &new); } xfs_iext_insert(ip, &ir->icur, &new, xfs_bmap_fork_to_state(whichfork)); @@ -1171,6 +1200,12 @@ xfs_iread_extents( goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); + /* + * Use release semantics so that we can use acquire semantics in + * xfs_need_iread_extents and be guaranteed to see a valid mapping tree + * after that load. + */ + smp_store_release(&ifp->if_needextents, 0); return 0; out: xfs_iext_destroy(ifp); @@ -3505,7 +3540,6 @@ xfs_bmap_btalloc_at_eof( * original non-aligned state so the caller can proceed on allocation * failure as if this function was never called. */ - args->fsbno = ap->blkno; args->alignment = 1; return 0; } @@ -6075,6 +6109,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; + xfs_bmap_update_get_group(tp->t_mountp, bi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index dd08361ca5a6..e33470e39728 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { BMAP_COWFORK, "COW" } /* Return true if the extent is an allocated extent, written or not. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) { return irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && @@ -238,9 +238,13 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; + struct xfs_perag *bi_pag; struct xfs_bmbt_irec bi_bmap; }; +void xfs_bmap_update_get_group(struct xfs_mount *mp, + struct xfs_bmap_intent *bi); + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); @@ -261,6 +265,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, + xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index b8ad95050c9b..1b40e5f8b1ec 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -382,11 +382,14 @@ STATIC int64_t xfs_bmbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + ASSERT(!mask || mask->bmbt.br_startoff); + /* * Note: This routine previously casted a and b to int64 and subtracted * them to generate a result. This lead to problems if b was the @@ -500,6 +503,19 @@ xfs_bmbt_recs_inorder( xfs_bmbt_disk_get_startoff(&r2->bmbt); } +STATIC enum xbtree_key_contig +xfs_bmbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->bmbt.br_startoff); + + return xbtree_key_contig(be64_to_cpu(key1->bmbt.br_startoff), + be64_to_cpu(key2->bmbt.br_startoff)); +} + static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), @@ -520,6 +536,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, + .keys_contiguous = xfs_bmbt_keys_contiguous, }; /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c4649cc624e1..6a6503ab0cd7 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2067,8 +2067,7 @@ xfs_btree_get_leaf_keys( for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { rec = xfs_btree_rec_addr(cur, n, block); cur->bc_ops->init_high_key_from_rec(&hkey, rec); - if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) - > 0) + if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) max_hkey = hkey; } @@ -2096,7 +2095,7 @@ xfs_btree_get_node_keys( max_hkey = xfs_btree_high_key_addr(cur, 1, block); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { hkey = xfs_btree_high_key_addr(cur, n, block); - if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) + if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) max_hkey = hkey; } @@ -2183,8 +2182,8 @@ __xfs_btree_updkeys( nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && - !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || - cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) + xfs_btree_keycmp_eq(cur, nlkey, lkey) && + xfs_btree_keycmp_eq(cur, nhkey, hkey)) break; xfs_btree_copy_keys(cur, nlkey, lkey, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); @@ -4716,7 +4715,6 @@ xfs_btree_simple_query_range( { union xfs_btree_rec *recp; union xfs_btree_key rec_key; - int64_t diff; int stat; bool firstrec = true; int error; @@ -4746,20 +4744,17 @@ xfs_btree_simple_query_range( if (error || !stat) break; - /* Skip if high_key(rec) < low_key. */ + /* Skip if low_key > high_key(rec). */ if (firstrec) { cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; - diff = cur->bc_ops->diff_two_keys(cur, low_key, - &rec_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) goto advloop; } - /* Stop if high_key < low_key(rec). */ + /* Stop if low_key(rec) > high_key. */ cur->bc_ops->init_key_from_rec(&rec_key, recp); - diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) break; /* Callback */ @@ -4813,8 +4808,6 @@ xfs_btree_overlapped_query_range( union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - int64_t ldiff; - int64_t hdiff; int level; struct xfs_buf *bp; int i; @@ -4854,25 +4847,23 @@ pop_up: block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); - ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, - low_key); - cur->bc_ops->init_key_from_rec(&rec_key, recp); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, - &rec_key); /* + * If (query's high key < record's low key), then there + * are no more interesting records in this block. Pop + * up to the leaf level to find more record blocks. + * * If (record's high key >= query's low key) and * (query's high key >= record's low key), then * this record overlaps the query range; callback. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { error = fn(cur, recp, priv); if (error) break; - } else if (hdiff < 0) { - /* Record is larger than high key; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; continue; @@ -4884,15 +4875,18 @@ pop_up: block); pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); - ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); - /* + * If (query's high key < pointer's low key), then there are no + * more interesting keys in this block. Pop up one leaf level + * to continue looking for records. + * * If (pointer's high key >= query's low key) and * (query's high key >= pointer's low key), then * this record overlaps the query range; follow pointer. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, lkp)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { level--; error = xfs_btree_lookup_get_block(cur, level, pp, &block); @@ -4907,9 +4901,6 @@ pop_up: #endif cur->bc_levels[level].ptr = 1; continue; - } else if (hdiff < 0) { - /* The low key is larger than the upper range; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; } @@ -4937,6 +4928,19 @@ out: return error; } +static inline void +xfs_btree_key_from_irec( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + const union xfs_btree_irec *irec) +{ + union xfs_btree_rec rec; + + cur->bc_rec = *irec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(key, &rec); +} + /* * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the @@ -4951,21 +4955,15 @@ xfs_btree_query_range( xfs_btree_query_range_fn fn, void *priv) { - union xfs_btree_rec rec; union xfs_btree_key low_key; union xfs_btree_key high_key; /* Find the keys of both ends of the interval. */ - cur->bc_rec = *high_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&high_key, &rec); + xfs_btree_key_from_irec(cur, &high_key, high_rec); + xfs_btree_key_from_irec(cur, &low_key, low_rec); - cur->bc_rec = *low_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&low_key, &rec); - - /* Enforce low key < high key. */ - if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) + /* Enforce low key <= high key. */ + if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -5027,34 +5025,132 @@ xfs_btree_diff_two_ptrs( return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } -/* If there's an extent, we're done. */ +struct xfs_btree_has_records { + /* Keys for the start and end of the range we want to know about. */ + union xfs_btree_key start_key; + union xfs_btree_key end_key; + + /* Mask for key comparisons, if desired. */ + const union xfs_btree_key *key_mask; + + /* Highest record key we've seen so far. */ + union xfs_btree_key high_key; + + enum xbtree_recpacking outcome; +}; + STATIC int -xfs_btree_has_record_helper( +xfs_btree_has_records_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { - return -ECANCELED; + union xfs_btree_key rec_key; + union xfs_btree_key rec_high_key; + struct xfs_btree_has_records *info = priv; + enum xbtree_key_contig key_contig; + + cur->bc_ops->init_key_from_rec(&rec_key, rec); + + if (info->outcome == XBTREE_RECPACKING_EMPTY) { + info->outcome = XBTREE_RECPACKING_SPARSE; + + /* + * If the first record we find does not overlap the start key, + * then there is a hole at the start of the search range. + * Classify this as sparse and stop immediately. + */ + if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, + info->key_mask)) + return -ECANCELED; + } else { + /* + * If a subsequent record does not overlap with the any record + * we've seen so far, there is a hole in the middle of the + * search range. Classify this as sparse and stop. + * If the keys overlap and this btree does not allow overlap, + * signal corruption. + */ + key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, + &rec_key, info->key_mask); + if (key_contig == XBTREE_KEY_OVERLAP && + !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return -EFSCORRUPTED; + if (key_contig == XBTREE_KEY_GAP) + return -ECANCELED; + } + + /* + * If high_key(rec) is larger than any other high key we've seen, + * remember it for later. + */ + cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); + if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, + info->key_mask)) + info->high_key = rec_high_key; /* struct copy */ + + return 0; } -/* Is there a record covering a given range of keys? */ +/* + * Scan part of the keyspace of a btree and tell us if that keyspace does not + * map to any records; is fully mapped to records; or is partially mapped to + * records. This is the btree record equivalent to determining if a file is + * sparse. + * + * For most btree types, the record scan should use all available btree key + * fields to compare the keys encountered. These callers should pass NULL for + * @mask. However, some callers (e.g. scanning physical space in the rmapbt) + * want to ignore some part of the btree record keyspace when performing the + * comparison. These callers should pass in a union xfs_btree_key object with + * the fields that *should* be a part of the comparison set to any nonzero + * value, and the rest zeroed. + */ int -xfs_btree_has_record( +xfs_btree_has_records( struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, - bool *exists) + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome) { + struct xfs_btree_has_records info = { + .outcome = XBTREE_RECPACKING_EMPTY, + .key_mask = mask, + }; int error; - error = xfs_btree_query_range(cur, low, high, - &xfs_btree_has_record_helper, NULL); - if (error == -ECANCELED) { - *exists = true; - return 0; + /* Not all btrees support this operation. */ + if (!cur->bc_ops->keys_contiguous) { + ASSERT(0); + return -EOPNOTSUPP; } - *exists = false; - return error; + + xfs_btree_key_from_irec(cur, &info.start_key, low); + xfs_btree_key_from_irec(cur, &info.end_key, high); + + error = xfs_btree_query_range(cur, low, high, + xfs_btree_has_records_helper, &info); + if (error == -ECANCELED) + goto out; + if (error) + return error; + + if (info.outcome == XBTREE_RECPACKING_EMPTY) + goto out; + + /* + * If the largest high_key(rec) we saw during the walk is greater than + * the end of the search range, classify this as full. Otherwise, + * there is a hole at the end of the search range. + */ + if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, + mask)) + info.outcome = XBTREE_RECPACKING_FULL; + +out: + *outcome = info.outcome; + return 0; } /* Are there more records in this btree? */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 29c4b4ccb909..a2aa36b23e25 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -90,6 +90,27 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) +enum xbtree_key_contig { + XBTREE_KEY_GAP = 0, + XBTREE_KEY_CONTIGUOUS, + XBTREE_KEY_OVERLAP, +}; + +/* + * Decide if these two numeric btree key fields are contiguous, overlapping, + * or if there's a gap between them. @x should be the field from the high + * key and @y should be the field from the low key. + */ +static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) +{ + x++; + if (x < y) + return XBTREE_KEY_GAP; + if (x == y) + return XBTREE_KEY_CONTIGUOUS; + return XBTREE_KEY_OVERLAP; +} + struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; @@ -140,11 +161,14 @@ struct xfs_btree_ops { /* * Difference between key2 and key1 -- positive if key1 > key2, - * negative if key1 < key2, and zero if equal. + * negative if key1 < key2, and zero if equal. If the @mask parameter + * is non NULL, each key field to be used in the comparison must + * contain a nonzero value. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, - const union xfs_btree_key *key2); + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; @@ -157,6 +181,22 @@ struct xfs_btree_ops { int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); + + /* + * Are these two btree keys immediately adjacent? + * + * Given two btree keys @key1 and @key2, decide if it is impossible for + * there to be a third btree key K satisfying the relationship + * @key1 < K < @key2. To determine if two btree records are + * immediately adjacent, @key1 should be the high key of the first + * record and @key2 should be the low key of the second record. + * If the @mask parameter is non NULL, each key field to be used in the + * comparison must contain a nonzero value. + */ + enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); }; /* @@ -540,12 +580,105 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, +typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); + +int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, - const union xfs_btree_irec *high, bool *exists); + const union xfs_btree_irec *high, + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome); + bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); +/* Key comparison helpers */ +static inline bool +xfs_btree_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; +} + +static inline bool +xfs_btree_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; +} + +static inline bool +xfs_btree_keycmp_eq( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; +} + +static inline bool +xfs_btree_keycmp_le( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_gt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_lt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ne( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_eq(cur, key1, key2); +} + +/* Masked key comparison helpers */ +static inline bool +xfs_btree_masked_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; +} + +static inline bool +xfs_btree_masked_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; +} + +static inline bool +xfs_btree_masked_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); +} + /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5a321b783398..bcfb6a4203cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -397,6 +397,7 @@ xfs_defer_cancel_list( list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); @@ -476,6 +477,7 @@ xfs_defer_finish_one( list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; + trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; @@ -623,7 +625,7 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops; + const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); @@ -636,7 +638,6 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; if (dfp->dfp_type != type || (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; @@ -653,6 +654,7 @@ xfs_defer_add( } list_add_tail(li, &dfp->dfp_work); + trace_xfs_defer_add_item(tp->t_mountp, dfp, li); dfp->dfp_count++; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 92bac3373f1f..f5462fd582d5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -64,7 +64,7 @@ xfs_ascii_ci_hashname( int i; for (i = 0, hash = 0; i < name->len; i++) - hash = tolower(name->name[i]) ^ rol32(hash, 7); + hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } @@ -85,7 +85,8 @@ xfs_ascii_ci_compname( for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; - if (tolower(args->name[i]) != tolower(name[i])) + if (xfs_ascii_ci_xfrm(args->name[i]) != + xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index dd39f17dd9a9..19af22a16c41 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); bool xfs_dir2_namecheck(const void *name, size_t length); +/* + * The "ascii-ci" feature was created to speed up case-insensitive lookups for + * a Samba product. Because of the inherent problems with CI and UTF-8 + * encoding, etc, it was decided that Samba would be configured to export + * latin1/iso 8859-1 encodings as that covered >90% of the target markets for + * the product. Hence the "ascii-ci" casefolding code could be encoded into + * the XFS directory operations and remove all the overhead of casefolding from + * Samba. + * + * To provide consistent hashing behavior between the userspace and kernel, + * these functions prepare names for hashing by transforming specific bytes + * to other bytes. Robustness with other encodings is not guaranteed. + */ +static inline bool xfs_ascii_ci_need_xfrm(unsigned char c) +{ + if (c >= 0x41 && c <= 0x5a) /* A-Z */ + return true; + if (c >= 0xc0 && c <= 0xd6) /* latin A-O with accents */ + return true; + if (c >= 0xd8 && c <= 0xde) /* latin O-Y with accents */ + return true; + return false; +} + +static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) +{ + if (xfs_ascii_ci_need_xfrm(c)) + c -= 'A' - 'a'; + return c; +} + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7ee292aecbeb..a16d5de16933 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,33 +95,25 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_inobt_get_rec( - struct xfs_btree_cur *cur, - struct xfs_inobt_rec_incore *irec, - int *stat) +/* Simple checks for inode records. */ +xfs_failaddr_t +xfs_inobt_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec) { - struct xfs_mount *mp = cur->bc_mp; - union xfs_btree_rec *rec; - int error; uint64_t realfree; - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || *stat == 0) - return error; - - xfs_inobt_btrec_to_irec(mp, rec, irec); - + /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) - goto out_bad_rec; + return __this_address; + if (!xfs_verify_agino(cur->bc_ag.pag, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) + return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; /* if there are no holes, return the first available offset */ if (!xfs_inobt_issparse(irec->ir_holemask)) @@ -129,15 +121,23 @@ xfs_inobt_get_rec( else realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); if (hweight64(realfree) != irec->ir_freecount) - goto out_bad_rec; + return __this_address; - return 0; + return NULL; +} + +static inline int +xfs_inobt_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected!", + "%s Inode BTree record corruption in AG %d detected at %pS!", cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno); + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -146,6 +146,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. + */ +int +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *irec, + int *stat) +{ + struct xfs_mount *mp = cur->bc_mp; + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || *stat == 0) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, irec); + fa = xfs_inobt_check_irec(cur, irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, irec); + + return 0; +} + +/* * Insert a single inobt record. Cursor must already point to desired location. */ int @@ -1952,8 +1978,6 @@ xfs_difree_inobt( */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - struct xfs_perag *pag = agbp->b_pag; - xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino); @@ -2617,44 +2641,50 @@ xfs_ialloc_read_agi( return 0; } -/* Is there an inode record covering a given range of inode numbers? */ -int -xfs_ialloc_has_inode_record( - struct xfs_btree_cur *cur, - xfs_agino_t low, - xfs_agino_t high, - bool *exists) +/* How many inodes are backed by inode clusters ondisk? */ +STATIC int +xfs_ialloc_count_ondisk( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + unsigned int *allocated) { struct xfs_inobt_rec_incore irec; - xfs_agino_t agino; - uint16_t holemask; - int has_record; - int i; - int error; + unsigned int ret = 0; + int has_record; + int error; - *exists = false; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); - while (error == 0 && has_record) { + if (error) + return error; + + while (has_record) { + unsigned int i, hole_idx; + error = xfs_inobt_get_rec(cur, &irec, &has_record); - if (error || irec.ir_startino > high) + if (error) + return error; + if (irec.ir_startino > high) break; - agino = irec.ir_startino; - holemask = irec.ir_holemask; - for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, - i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { - if (holemask & 1) + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (irec.ir_startino + i < low) continue; - if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && - agino <= high) { - *exists = true; - return 0; - } + if (irec.ir_startino + i > high) + break; + + hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; + if (!(irec.ir_holemask & (1U << hole_idx))) + ret++; } error = xfs_btree_increment(cur, 0, &has_record); + if (error) + return error; } - return error; + + *allocated = ret; + return 0; } /* Is there an inode record covering a given extent? */ @@ -2663,15 +2693,27 @@ xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { - xfs_agino_t low; - xfs_agino_t high; + xfs_agino_t agino; + xfs_agino_t last_agino; + unsigned int allocated; + int error; + + agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; - low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); - high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; + error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); + if (error) + return error; - return xfs_ialloc_has_inode_record(cur, low, high, exists); + if (allocated == 0) + *outcome = XBTREE_RECPACKING_EMPTY; + else if (allocated == last_agino - agino + 1) + *outcome = XBTREE_RECPACKING_FULL; + else + *outcome = XBTREE_RECPACKING_SPARSE; + return 0; } struct xfs_ialloc_count_inodes { @@ -2688,8 +2730,13 @@ xfs_ialloc_count_inodes_rec( { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; + xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + fa = xfs_inobt_check_irec(cur, &irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, &irec); + ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index ab8c30b4ec22..fe824bb04a09 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -93,10 +93,11 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); -int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, - xfs_agino_t high, bool *exists); + xfs_agblock_t bno, xfs_extlen_t len, + enum xbtree_recpacking *outcome); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b28211d5a4c..5a945ae21b5d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -156,9 +156,12 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_fsblock_t fsbno; + xfs_inobt_mod_blockcount(cur, -1); - return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -266,10 +269,13 @@ STATIC int64_t xfs_inobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->inobt.ir_startino); + return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - - be32_to_cpu(k2->inobt.ir_startino); + be32_to_cpu(k2->inobt.ir_startino); } static xfs_failaddr_t @@ -380,6 +386,19 @@ xfs_inobt_recs_inorder( be32_to_cpu(r2->inobt.ir_startino); } +STATIC enum xbtree_key_contig +xfs_inobt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->inobt.ir_startino); + + return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino), + be32_to_cpu(key2->inobt.ir_startino)); +} + static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), @@ -399,6 +418,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; static const struct xfs_btree_ops xfs_finobt_ops = { @@ -420,6 +440,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; /* @@ -447,9 +468,7 @@ xfs_inobt_init_common( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } @@ -607,7 +626,7 @@ xfs_iallocbt_maxlevels_ondisk(void) */ uint64_t xfs_inobt_irec_to_allocmask( - struct xfs_inobt_rec_incore *rec) + const struct xfs_inobt_rec_incore *rec) { uint64_t bitmap = 0; uint64_t inodespbit; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index e859a6e05230..3262c3fe5ebe 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -53,7 +53,7 @@ struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ -uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); +uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec); #if defined(DEBUG) || defined(XFS_WARN) int xfs_inobt_rec_check_count(struct xfs_mount *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 6b21760184d9..5a2e7ddfa76d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -140,7 +140,8 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, + fa, &new); } xfs_iext_insert(ip, &icur, &new, state); @@ -226,10 +227,15 @@ xfs_iformat_data_fork( /* * Initialize the extent count early, as the per-format routines may - * depend on it. + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. */ ip->i_df.if_format = dip->di_format; ip->i_df.if_nextents = xfs_dfork_data_extents(dip); + smp_store_release(&ip->i_df.if_needextents, + ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -282,8 +288,17 @@ xfs_ifork_init_attr( enum xfs_dinode_fmt format, xfs_extnum_t nextents) { + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. + */ ip->i_af.if_format = format; ip->i_af.if_nextents = nextents; + smp_store_release(&ip->i_af.if_needextents, + ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index d3943d6ad0b9..96d307784c85 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -24,6 +24,7 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ + uint8_t if_needextents; /* extents have not been read */ }; /* @@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ -static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) { - return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; + /* see xfs_iformat_{data,attr}_fork() for needextents semantics */ + return smp_load_acquire(&ifp->if_needextents) != 0; } #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bcf46aa0d08b..c1c65774dcc2 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -120,45 +120,41 @@ xfs_refcount_btrec_to_irec( irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_refcount_get_rec( +/* Simple checks for refcount records. */ +xfs_failaddr_t +xfs_refcount_check_irec( struct xfs_btree_cur *cur, - struct xfs_refcount_irec *irec, - int *stat) + const struct xfs_refcount_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - xfs_refcount_btrec_to_irec(rec, irec); if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) - goto out_bad_rec; + return __this_address; if (!xfs_refcount_check_domain(irec)) - goto out_bad_rec; + return __this_address; /* check for valid extent range, including overflow */ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) - goto out_bad_rec; + return __this_address; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) - goto out_bad_rec; + return __this_address; - trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); - return 0; + return NULL; +} + +static inline int +xfs_refcount_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_refcount_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", - pag->pag_agno); + "Refcount BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -166,6 +162,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + xfs_refcount_btrec_to_irec(rec, irec); + fa = xfs_refcount_check_irec(cur, irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, irec); + + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + return 0; +} + +/* * Update the record referred to by cur to the value given * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. @@ -1332,26 +1354,22 @@ xfs_refcount_finish_one( xfs_agblock_t bno; unsigned long nr_ops = 0; int shape_changes = 0; - struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), ri->ri_blockcount); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { nr_ops = rcur->bc_ag.refc.nr_ops; shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); @@ -1359,12 +1377,12 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, - &agbp); + error = xfs_alloc_read_agf(ri->ri_pag, tp, + XFS_ALLOC_FLAG_FREEING, &agbp); if (error) - goto out_drop; + return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1375,7 +1393,7 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_INCREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; @@ -1383,31 +1401,29 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_DECREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; default: ASSERT(0); - error = -EFSCORRUPTED; + return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_blockcount); -out_drop: - xfs_perag_put(pag); return error; } @@ -1435,6 +1451,7 @@ __xfs_refcount_add( ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + xfs_refcount_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); } @@ -1876,7 +1893,8 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (XFS_IS_CORRUPT(cur->bc_mp, + if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); return -EFSCORRUPTED; @@ -1980,14 +1998,17 @@ out_free: return error; } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the refcount records and tell us if the area + * has no records, is fully mapped by records, or is partially filled. + */ int -xfs_refcount_has_record( +xfs_refcount_has_records( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -1998,7 +2019,7 @@ xfs_refcount_has_record( high.rc.rc_startblock = bno + len - 1; low.rc.rc_domain = high.rc.rc_domain = domain; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } int __init diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index c633477ce3ce..783cd89ca195 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -50,6 +50,7 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; + struct xfs_perag *ri_pag; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; @@ -67,6 +68,9 @@ xfs_refcount_check_domain( return true; } +void xfs_refcount_update_get_group(struct xfs_mount *mp, + struct xfs_refcount_intent *ri); + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -107,12 +111,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, +extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, - xfs_extlen_t len, bool *exists); + xfs_extlen_t len, enum xbtree_recpacking *outcome); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index f3b860970b26..d4afc5f4e6a5 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,8 +112,9 @@ xfs_refcountbt_free_block( XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, - XFS_AG_RESV_METADATA); + error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); if (error) return error; @@ -201,10 +202,13 @@ STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->refc.rc_startblock); + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + be32_to_cpu(k2->refc.rc_startblock); } STATIC xfs_failaddr_t @@ -299,6 +303,19 @@ xfs_refcountbt_recs_inorder( be32_to_cpu(r2->refc.rc_startblock); } +STATIC enum xbtree_key_contig +xfs_refcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + static const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), @@ -318,6 +335,7 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { .diff_two_keys = xfs_refcountbt_diff_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, + .keys_contiguous = xfs_refcountbt_keys_contiguous, }; /* @@ -339,10 +357,7 @@ xfs_refcountbt_init_common( cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; cur->bc_ops = &xfs_refcountbt_ops; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index df720041cd3d..f4dc23b3b837 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -193,7 +193,7 @@ done: } /* Convert an internal btree record to an rmap record. */ -int +xfs_failaddr_t xfs_rmap_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) @@ -205,51 +205,74 @@ xfs_rmap_btrec_to_irec( irec); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_rmap_get_rec( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *irec, - int *stat) +/* Simple checks for rmap records. */ +xfs_failaddr_t +xfs_rmap_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - - if (xfs_rmap_btrec_to_irec(rec, irec)) - goto out_bad_rec; + struct xfs_mount *mp = cur->bc_mp; + bool is_inode; + bool is_unwritten; + bool is_bmbt; + bool is_attr; if (irec->rm_blockcount == 0) - goto out_bad_rec; + return __this_address; if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) { if (irec->rm_owner != XFS_RMAP_OWN_FS) - goto out_bad_rec; + return __this_address; if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - goto out_bad_rec; + return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, irec->rm_startblock, - irec->rm_blockcount)) - goto out_bad_rec; + if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; } if (!(xfs_verify_ino(mp, irec->rm_owner) || (irec->rm_owner <= XFS_RMAP_OWN_FS && irec->rm_owner >= XFS_RMAP_OWN_MIN))) - goto out_bad_rec; + return __this_address; + + /* Check flags. */ + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + if (is_bmbt && irec->rm_offset != 0) + return __this_address; + + if (!is_inode && irec->rm_offset != 0) + return __this_address; + + if (is_unwritten && (is_bmbt || !is_inode || is_attr)) + return __this_address; + + if (!is_inode && (is_bmbt || is_unwritten || is_attr)) + return __this_address; + + /* Check for a valid fork offset, if applicable. */ + if (is_inode && !is_bmbt && + !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_rmap_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; - return 0; -out_bad_rec: xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected!", - pag->pag_agno); + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -257,6 +280,32 @@ out_bad_rec: return -EFSCORRUPTED; } +/* + * Get the data from the pointed-to record. + */ +int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + fa = xfs_rmap_btrec_to_irec(rec, irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, irec); + + return 0; +} + struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; @@ -2320,11 +2369,14 @@ xfs_rmap_query_range_helper( { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; - int error; + xfs_failaddr_t fa; + + fa = xfs_rmap_btrec_to_irec(rec, &irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, &irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, &irec); - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (error) - return error; return query->fn(cur, &irec, query->priv); } @@ -2394,7 +2446,6 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; @@ -2402,26 +2453,22 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, ri->ri_bmap.br_state); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } - + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2432,15 +2479,13 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. */ - error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); if (error) - goto out_drop; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - error = -EFSCORRUPTED; - goto out_drop; - } + return error; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) + return -EFSCORRUPTED; - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } *pcur = rcur; @@ -2480,8 +2525,7 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } -out_drop: - xfs_perag_put(pag); + return error; } @@ -2526,6 +2570,7 @@ __xfs_rmap_add( ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + xfs_rmap_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); } @@ -2664,14 +2709,21 @@ xfs_rmap_compare( return 0; } -/* Is there a record covering a given extent? */ +/* + * Scan the physical storage part of the keyspace of the reverse mapping index + * and tell us if the area has no records, is fully mapped by records, or is + * partially filled. + */ int -xfs_rmap_has_record( +xfs_rmap_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { + union xfs_btree_key mask = { + .rmap.rm_startblock = cpu_to_be32(-1U), + }; union xfs_btree_irec low; union xfs_btree_irec high; @@ -2680,68 +2732,144 @@ xfs_rmap_has_record( memset(&high, 0xFF, sizeof(high)); high.r.rm_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, &mask, outcome); } -/* - * Is there a record for this owner completely covering a given physical - * extent? If so, *has_rmap will be set to true. If there is no record - * or the record only covers part of the range, we set *has_rmap to false. - * This function doesn't perform range lookups or offset checks, so it is - * not suitable for checking data fork blocks. - */ -int -xfs_rmap_record_exists( - struct xfs_btree_cur *cur, +struct xfs_rmap_ownercount { + /* Owner that we're looking for. */ + struct xfs_rmap_irec good; + + /* rmap search keys */ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + + struct xfs_rmap_matches *results; + + /* Stop early if we find a nonmatch? */ + bool stop_on_nonmatch; +}; + +/* Does this rmap represent space that can have multiple owners? */ +static inline bool +xfs_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + if (!xfs_has_reflink(mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK)) + return false; + return true; +} + +static inline void +xfs_rmap_ownercount_init( + struct xfs_rmap_ownercount *roc, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + struct xfs_rmap_matches *results) { - uint64_t owner; - uint64_t offset; - unsigned int flags; - int has_record; - struct xfs_rmap_irec irec; - int error; + memset(roc, 0, sizeof(*roc)); + roc->results = results; + + roc->low.rm_startblock = bno; + memset(&roc->high, 0xFF, sizeof(roc->high)); + roc->high.rm_startblock = bno + len - 1; + + memset(results, 0, sizeof(*results)); + roc->good.rm_startblock = bno; + roc->good.rm_blockcount = len; + roc->good.rm_owner = oinfo->oi_owner; + roc->good.rm_offset = oinfo->oi_offset; + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + roc->good.rm_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + roc->good.rm_flags |= XFS_RMAP_BMBT_BLOCK; +} - xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); - ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || - (flags & XFS_RMAP_BMBT_BLOCK)); +/* Figure out if this is a match for the owner. */ +STATIC int +xfs_rmap_count_owners_helper( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_ownercount *roc = priv; + struct xfs_rmap_irec check = *rec; + unsigned int keyflags; + bool filedata; + int64_t delta; + + filedata = !XFS_RMAP_NON_INODE_OWNER(check.rm_owner) && + !(check.rm_flags & XFS_RMAP_BMBT_BLOCK); + + /* Trim the part of check that comes before the comparison range. */ + delta = (int64_t)roc->good.rm_startblock - check.rm_startblock; + if (delta > 0) { + check.rm_startblock += delta; + check.rm_blockcount -= delta; + if (filedata) + check.rm_offset += delta; + } - error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, - &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; + /* Trim the part of check that comes after the comparison range. */ + delta = (check.rm_startblock + check.rm_blockcount) - + (roc->good.rm_startblock + roc->good.rm_blockcount); + if (delta > 0) + check.rm_blockcount -= delta; + + /* Don't care about unwritten status for establishing ownership. */ + keyflags = check.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK); + + if (check.rm_startblock == roc->good.rm_startblock && + check.rm_blockcount == roc->good.rm_blockcount && + check.rm_owner == roc->good.rm_owner && + check.rm_offset == roc->good.rm_offset && + keyflags == roc->good.rm_flags) { + roc->results->matches++; + } else { + roc->results->non_owner_matches++; + if (xfs_rmap_shareable(cur->bc_mp, &roc->good) ^ + xfs_rmap_shareable(cur->bc_mp, &check)) + roc->results->bad_non_owner_matches++; } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && - irec.rm_startblock + irec.rm_blockcount >= bno + len); + if (roc->results->non_owner_matches && roc->stop_on_nonmatch) + return -ECANCELED; + return 0; } -struct xfs_rmap_key_state { - uint64_t owner; - uint64_t offset; - unsigned int flags; -}; - -/* For each rmap given, figure out if it doesn't match the key we want. */ -STATIC int -xfs_rmap_has_other_keys_helper( +/* Count the number of owners and non-owners of this range of blocks. */ +int +xfs_rmap_count_owners( struct xfs_btree_cur *cur, - const struct xfs_rmap_irec *rec, - void *priv) + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + struct xfs_rmap_matches *results) { - struct xfs_rmap_key_state *rks = priv; + struct xfs_rmap_ownercount roc; + int error; - if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && - ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) - return 0; - return -ECANCELED; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, results); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); + if (error) + return error; + + /* + * There can't be any non-owner rmaps that conflict with the given + * owner if we didn't find any rmaps matching the owner. + */ + if (!results->matches) + results->bad_non_owner_matches = 0; + + return 0; } /* @@ -2754,28 +2882,26 @@ xfs_rmap_has_other_keys( xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + bool *has_other) { - struct xfs_rmap_irec low = {0}; - struct xfs_rmap_irec high; - struct xfs_rmap_key_state rks; + struct xfs_rmap_matches res; + struct xfs_rmap_ownercount roc; int error; - xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - *has_rmap = false; - - low.rm_startblock = bno; - memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, &res); + roc.stop_on_nonmatch = true; - error = xfs_rmap_query_range(cur, &low, &high, - xfs_rmap_has_other_keys_helper, &rks); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); if (error == -ECANCELED) { - *has_rmap = true; + *has_other = true; return 0; } + if (error) + return error; - return error; + *has_other = false; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 2dac88cea28d..3c98d9d50afb 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -62,13 +62,14 @@ xfs_rmap_irec_offset_pack( return x; } -static inline int +static inline xfs_failaddr_t xfs_rmap_irec_offset_unpack( __u64 offset, struct xfs_rmap_irec *irec) { if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) - return -EFSCORRUPTED; + return __this_address; + irec->rm_offset = XFS_RMAP_OFF(offset); irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) @@ -77,7 +78,7 @@ xfs_rmap_irec_offset_unpack( irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; if (offset & XFS_RMAP_OFF_UNWRITTEN) irec->rm_flags |= XFS_RMAP_UNWRITTEN; - return 0; + return NULL; } static inline void @@ -162,8 +163,12 @@ struct xfs_rmap_intent { int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; + struct xfs_perag *ri_pag; }; +void xfs_rmap_update_get_group(struct xfs_mount *mp, + struct xfs_rmap_intent *ri); + /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); @@ -188,16 +193,31 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); union xfs_btree_rec; -int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, +xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool *exists); -int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec); + +int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); + +struct xfs_rmap_matches { + /* Number of owner matches. */ + unsigned long long matches; + + /* Number of non-owner matches. */ + unsigned long long non_owner_matches; + + /* Number of non-owner matches that conflict with the owner matches. */ + unsigned long long bad_non_owner_matches; +}; + +int xfs_rmap_count_owners(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + struct xfs_rmap_matches *rmatch); int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + bool *has_other); int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index d3285684bb5e..6c81b20e97d2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs( return cur->bc_mp->m_rmap_mxr[level != 0]; } +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + STATIC void xfs_rmapbt_init_key_from_rec( union xfs_btree_key *key, @@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec( { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); } /* @@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec( key->rmap.rm_startblock = rec->rmap.rm_startblock; be32_add_cpu(&key->rmap.rm_startblock, adj); key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) return; @@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + STATIC int64_t xfs_rmapbt_key_diff( struct xfs_btree_cur *cur, @@ -240,8 +260,8 @@ xfs_rmapbt_key_diff( else if (y > x) return -1; - x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); - y = rec->rm_offset; + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); if (x > y) return 1; else if (y > x) @@ -253,31 +273,43 @@ STATIC int64_t xfs_rmapbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; int64_t d; __u64 x, y; + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + be32_to_cpu(kp2->rm_startblock); if (d) return d; - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } - x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); - y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; return 0; } @@ -387,8 +419,8 @@ xfs_rmapbt_keys_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); if (a <= b) return 1; return 0; @@ -417,13 +449,33 @@ xfs_rmapbt_recs_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); if (a <= b) return 1; return 0; } +STATIC enum xbtree_key_contig +xfs_rmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + static const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), @@ -443,6 +495,7 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .diff_two_keys = xfs_rmapbt_diff_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, }; static struct xfs_btree_cur * @@ -460,10 +513,7 @@ xfs_rmapbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ops = &xfs_rmapbt_ops; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 99cc03a298e2..ba0f17bc1dc0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -72,7 +72,8 @@ xfs_sb_validate_v5_features( } /* - * We support all XFS versions newer than a v4 superblock with V2 directories. + * We current support XFS v5 formats with known features and v4 superblocks with + * at least V2 directories. */ bool xfs_sb_good_version( @@ -86,16 +87,16 @@ xfs_sb_good_version( if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) + return false; + /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; - /* versions prior to v4 are not supported */ - if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) - return false; - /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 5ebdda7e1078..851220021484 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -204,6 +204,18 @@ enum xfs_ag_resv_type { XFS_AG_RESV_RMAPBT, }; +/* Results of scanning a btree keyspace to check occupancy. */ +enum xbtree_recpacking { + /* None of the keyspace maps to records. */ + XBTREE_RECPACKING_EMPTY = 0, + + /* Some, but not all, of the keyspace maps to records. */ + XBTREE_RECPACKING_SPARSE, + + /* The entire keyspace maps to records. */ + XBTREE_RECPACKING_FULL, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 4dd52b15f09c..6c6e5eba42c8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,15 @@ #include "scrub/scrub.h" #include "scrub/common.h" +int +xchk_setup_agheader( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_fs(sc); +} + /* Superblock */ /* Cross-reference with the other btrees. */ @@ -42,8 +51,9 @@ xchk_superblock_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ } @@ -505,9 +515,10 @@ xchk_agf_xref( xchk_agf_xref_freeblks(sc); xchk_agf_xref_cntbt(sc); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_agf_xref_btreeblks(sc); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agf_xref_refcblks(sc); /* scrub teardown will take care of sc->sa for us */ @@ -633,8 +644,9 @@ xchk_agfl_block_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); } /* Scrub an AGFL block. */ @@ -689,8 +701,9 @@ xchk_agfl_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* * Scrub teardown will take care of sc->sa for us. Leave sc->sa @@ -844,8 +857,9 @@ xchk_agi_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); xchk_agi_xref_icounts(sc); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index c37e6d72760b..bbaa65422c4f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -487,10 +487,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - struct xrep_agfl *ra, uint64_t start, - uint64_t len) + uint64_t len, + void *priv) { + struct xrep_agfl *ra = priv; xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -538,7 +539,6 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xbitmap_range *br, *n; int error; ra.sc = sc; @@ -579,11 +579,7 @@ xrep_agfl_collect_blocks( /* Strike out the blocks that are cross-linked. */ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - for_each_xbitmap_extent(br, n, agfl_extents) { - error = xrep_agfl_check_extent(&ra, br->start, br->len); - if (error) - break; - } + error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; @@ -629,21 +625,58 @@ xrep_agfl_update_agf( XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } +struct xrep_agfl_fill { + struct xbitmap used_extents; + struct xfs_scrub *sc; + __be32 *agfl_bno; + xfs_agblock_t flcount; + unsigned int fl_off; +}; + +/* Fill the AGFL with whatever blocks are in this extent. */ +static int +xrep_agfl_fill( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xrep_agfl_fill *af = priv; + struct xfs_scrub *sc = af->sc; + xfs_fsblock_t fsbno = start; + int error; + + while (fsbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = + cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + + trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, + XFS_FSB_TO_AGBNO(sc->mp, start), len); + + error = xbitmap_set(&af->used_extents, start, fsbno - 1); + if (error) + return error; + + if (af->fl_off == af->flcount) + return -ECANCELED; + + return 0; +} + /* Write out a totally new AGFL. */ -STATIC void +STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, struct xbitmap *agfl_extents, xfs_agblock_t flcount) { + struct xrep_agfl_fill af = { + .sc = sc, + .flcount = flcount, + }; struct xfs_mount *mp = sc->mp; - __be32 *agfl_bno; - struct xbitmap_range *br; - struct xbitmap_range *n; struct xfs_agfl *agfl; - xfs_agblock_t agbno; - unsigned int fl_off; + int error; ASSERT(flcount <= xfs_agfl_size(mp)); @@ -662,36 +695,18 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - fl_off = 0; - agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); - for_each_xbitmap_extent(br, n, agfl_extents) { - agbno = XFS_FSB_TO_AGBNO(mp, br->start); - - trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno, - br->len); - - while (br->len > 0 && fl_off < flcount) { - agfl_bno[fl_off] = cpu_to_be32(agbno); - fl_off++; - agbno++; - - /* - * We've now used br->start by putting it in the AGFL, - * so bump br so that we don't reap the block later. - */ - br->start++; - br->len--; - } - - if (br->len) - break; - list_del(&br->list); - kfree(br); - } + xbitmap_init(&af.used_extents); + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xbitmap_disunion(agfl_extents, &af.used_extents); + if (error) + return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); + xbitmap_destroy(&af.used_extents); + return 0; } /* Repair the AGFL. */ @@ -744,7 +759,9 @@ xrep_agfl( * buffers until we know that part works. */ xrep_agfl_update_agf(sc, agf_bp, flcount); - xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + if (error) + goto err; /* * Ok, the AGFL should be ready to go now. Roll the transaction to diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 3b38f4e2a537..279af72b1671 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -24,10 +24,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Free space btree scrubber. */ + +struct xchk_alloc { + /* Previous free space extent. */ + struct xfs_alloc_rec_incore prev; +}; + /* * Ensure there's a corresponding cntbt/bnobt record matching this * bnobt/cntbt record, respectively. @@ -75,9 +84,11 @@ xchk_allocbt_xref_other( STATIC void xchk_allocbt_xref( struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len) + const struct xfs_alloc_rec_incore *irec) { + xfs_agblock_t agbno = irec->ar_startblock; + xfs_extlen_t len = irec->ar_blockcount; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; @@ -85,25 +96,44 @@ xchk_allocbt_xref( xchk_xref_is_not_inode_chunk(sc, agbno, len); xchk_xref_has_no_owner(sc, agbno, len); xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_allocbt_mergeable( + struct xchk_btree *bs, + struct xchk_alloc *ca, + const struct xfs_alloc_rec_incore *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (ca->prev.ar_blockcount > 0 && + ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock && + ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&ca->prev, irec, sizeof(*irec)); } /* Scrub a bnobt/cntbt record. */ STATIC int xchk_allocbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) + struct xchk_btree *bs, + const union xfs_btree_rec *rec) { - struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; + struct xfs_alloc_rec_incore irec; + struct xchk_alloc *ca = bs->private; - bno = be32_to_cpu(rec->alloc.ar_startblock); - len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (!xfs_verify_agbext(pag, bno, len)) + xfs_alloc_btrec_to_irec(rec, &irec); + if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - xchk_allocbt_xref(bs->sc, bno, len); + xchk_allocbt_mergeable(bs, ca, &irec); + xchk_allocbt_xref(bs->sc, &irec); return 0; } @@ -114,10 +144,11 @@ xchk_allocbt( struct xfs_scrub *sc, xfs_btnum_t which) { + struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } int @@ -141,15 +172,15 @@ xchk_xref_is_used_space( xfs_agblock_t agbno, xfs_extlen_t len) { - bool is_freesp; + enum xbtree_recpacking outcome; int error; if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) return; - if (is_freesp) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 31529b9bf389..6c16d9530cca 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -15,11 +15,51 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +/* Free the buffers linked from the xattr buffer. */ +static void +xchk_xattr_buf_cleanup( + void *priv) +{ + struct xchk_xattr_buf *ab = priv; + + kvfree(ab->freemap); + ab->freemap = NULL; + kvfree(ab->usedmap); + ab->usedmap = NULL; + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; +} + +/* + * Allocate the free space bitmap if we're trying harder; there are leaf blocks + * in the attr fork; or we can't tell if there are leaf blocks. + */ +static inline bool +xchk_xattr_want_freemap( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + if (sc->flags & XCHK_TRY_HARDER) + return true; + + if (!sc->ip) + return true; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (!ifp) + return false; + + return xfs_ifork_has_extents(ifp); +} + /* * Allocate enough memory to hold an attr value and attr block bitmaps, * reallocating the buffer if necessary. Buffer contents are not preserved @@ -28,41 +68,49 @@ static int xchk_setup_xattr_buf( struct xfs_scrub *sc, - size_t value_size, - gfp_t flags) + size_t value_size) { - size_t sz; + size_t bmp_sz; struct xchk_xattr_buf *ab = sc->buf; + void *new_val; - /* - * We need enough space to read an xattr value from the file or enough - * space to hold three copies of the xattr free space bitmap. We don't - * need the buffer space for both purposes at the same time. - */ - sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - sz = max_t(size_t, sz, value_size); + bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - /* - * If there's already a buffer, figure out if we need to reallocate it - * to accommodate a larger size. - */ - if (ab) { - if (sz <= ab->sz) - return 0; - kvfree(ab); - sc->buf = NULL; - } + if (ab) + goto resize_value; - /* - * Don't zero the buffer upon allocation to avoid runtime overhead. - * All users must be careful never to read uninitialized contents. - */ - ab = kvmalloc(sizeof(*ab) + sz, flags); + ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS); if (!ab) return -ENOMEM; - - ab->sz = sz; sc->buf = ab; + sc->buf_cleanup = xchk_xattr_buf_cleanup; + + ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->usedmap) + return -ENOMEM; + + if (xchk_xattr_want_freemap(sc)) { + ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->freemap) + return -ENOMEM; + } + +resize_value: + if (ab->value_sz >= value_size) + return 0; + + if (ab->value) { + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; + } + + new_val = kvmalloc(value_size, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + ab->value = new_val; + ab->value_sz = value_size; return 0; } @@ -79,8 +127,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, - XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX); if (error) return error; } @@ -111,11 +158,24 @@ xchk_xattr_listent( int namelen, int valuelen) { + struct xfs_da_args args = { + .op_flags = XFS_DA_OP_NOTIME, + .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = context->dp->i_mount->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = context->dp, + .name = name, + .namelen = namelen, + .hashval = xfs_da_hashname(name, namelen), + .trans = context->tp, + .valuelen = valuelen, + }; + struct xchk_xattr_buf *ab; struct xchk_xattr *sx; - struct xfs_da_args args = { NULL }; int error = 0; sx = container_of(context, struct xchk_xattr, context); + ab = sx->sc->buf; if (xchk_should_terminate(sx->sc, &error)) { context->seen_enough = error; @@ -128,18 +188,32 @@ xchk_xattr_listent( return; } + /* Only one namespace bit allowed. */ + if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + /* Does this name make sense? */ if (!xfs_attr_namecheck(name, namelen)) { xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - return; + goto fail_xref; } /* + * Local xattr values are stored in the attr leaf block, so we don't + * need to retrieve the value from a remote block to detect corruption + * problems. + */ + if (flags & XFS_ATTR_LOCAL) + goto fail_xref; + + /* * Try to allocate enough memory to extrat the attr value. If that * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sx->sc, valuelen); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -147,17 +221,7 @@ xchk_xattr_listent( return; } - args.op_flags = XFS_DA_OP_NOTIME; - args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; - args.geo = context->dp->i_mount->m_attr_geo; - args.whichfork = XFS_ATTR_FORK; - args.dp = context->dp; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.trans = context->tp; - args.value = xchk_xattr_valuebuf(sx->sc); - args.valuelen = valuelen; + args.value = ab->value; error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ @@ -213,25 +277,23 @@ xchk_xattr_set_map( STATIC bool xchk_xattr_check_freemap( struct xfs_scrub *sc, - unsigned long *map, struct xfs_attr3_icleaf_hdr *leafhdr) { - unsigned long *freemap = xchk_xattr_freemap(sc); - unsigned long *dstmap = xchk_xattr_dstmap(sc); + struct xchk_xattr_buf *ab = sc->buf; unsigned int mapsize = sc->mp->m_attr_geo->blksize; int i; /* Construct bitmap of freemap contents. */ - bitmap_zero(freemap, mapsize); + bitmap_zero(ab->freemap, mapsize); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { - if (!xchk_xattr_set_map(sc, freemap, + if (!xchk_xattr_set_map(sc, ab->freemap, leafhdr->freemap[i].base, leafhdr->freemap[i].size)) return false; } /* Look for bits that are set in freemap and are marked in use. */ - return bitmap_and(dstmap, freemap, map, mapsize) == 0; + return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize); } /* @@ -251,7 +313,7 @@ xchk_xattr_entry( __u32 *last_hashval) { struct xfs_mount *mp = ds->state->mp; - unsigned long *usedmap = xchk_xattr_usedmap(ds->sc); + struct xchk_xattr_buf *ab = ds->sc->buf; char *name_end; struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; @@ -291,7 +353,7 @@ xchk_xattr_entry( if (name_end > buf_end) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize)) xchk_da_set_corrupt(ds, level); if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) *usedbytes += namesize; @@ -311,35 +373,26 @@ xchk_xattr_block( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *ent; struct xfs_attr_leaf_entry *entries; - unsigned long *usedmap; + struct xchk_xattr_buf *ab = ds->sc->buf; char *buf_end; size_t off; __u32 last_hashval = 0; unsigned int usedbytes = 0; unsigned int hdrsize; int i; - int error; if (*last_checked == blk->blkno) return 0; - /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); - if (error == -ENOMEM) - return -EDEADLOCK; - if (error) - return error; - usedmap = xchk_xattr_usedmap(ds->sc); - *last_checked = blk->blkno; - bitmap_zero(usedmap, mp->m_attr_geo->blksize); + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); /* Check all the padding. */ if (xfs_has_crc(ds->sc->mp)) { - struct xfs_attr3_leafblock *leaf = bp->b_addr; + struct xfs_attr3_leafblock *leaf3 = bp->b_addr; - if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || - leaf->hdr.info.hdr.pad != 0) + if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 || + leaf3->hdr.info.hdr.pad != 0) xchk_da_set_corrupt(ds, level); } else { if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) @@ -356,7 +409,7 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (leafhdr.firstused < hdrsize) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -370,7 +423,7 @@ xchk_xattr_block( for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { /* Mark the leaf entry itself. */ off = (char *)ent - (char *)leaf; - if (!xchk_xattr_set_map(ds->sc, usedmap, off, + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off, sizeof(xfs_attr_leaf_entry_t))) { xchk_da_set_corrupt(ds, level); goto out; @@ -384,7 +437,7 @@ xchk_xattr_block( goto out; } - if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr)) + if (!xchk_xattr_check_freemap(ds->sc, &leafhdr)) xchk_da_set_corrupt(ds, level); if (leafhdr.usedbytes != usedbytes) @@ -468,38 +521,115 @@ out: return error; } +/* Check space usage of shortform attrs. */ +STATIC int +xchk_xattr_check_sf( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + + bitmap_zero(ab->usedmap, ifp->if_bytes); + sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; + end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + + sfe = &sf->list[0]; + if ((unsigned char *)sfe > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + for (i = 0; i < sf->hdr.count; i++) { + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)sf, + sizeof(struct xfs_attr_sf_entry))) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)name - (char *)sf, + sfe->namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)value - (char *)sf, + sfe->valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + sfe = next; + } + + return 0; +} + /* Scrub the extended attribute metadata. */ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx; + struct xchk_xattr sx = { + .sc = sc, + .context = { + .dp = sc->ip, + .tp = sc->tp, + .resynch = 1, + .put_listent = xchk_xattr_listent, + .allow_incomplete = true, + }, + }; xfs_dablk_t last_checked = -1U; int error = 0; if (!xfs_inode_hasattr(sc->ip)) return -ENOENT; - memset(&sx, 0, sizeof(sx)); - /* Check attribute tree structure */ - error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, - &last_checked); + /* Allocate memory for xattr checking. */ + error = xchk_setup_xattr_buf(sc, 0); + if (error == -ENOMEM) + return -EDEADLOCK; if (error) - goto out; + return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Check the physical structure of the xattr. */ + if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + error = xchk_xattr_check_sf(sc); + else + error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, + &last_checked); + if (error) + return error; - /* Check that every attr key can also be looked up by hash. */ - sx.context.dp = sc->ip; - sx.context.resynch = 1; - sx.context.put_listent = xchk_xattr_listent; - sx.context.tp = sc->tp; - sx.context.allow_incomplete = true; - sx.sc = sc; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; /* - * Look up every xattr in this file by name. + * Look up every xattr in this file by name and hash. * * Use the backend implementation of xfs_attr_list to call * xchk_xattr_listent on every attribute key in this inode. @@ -516,11 +646,11 @@ xchk_xattr( */ error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) - goto out; + return error; /* Did our listent function try to return any errors? */ if (sx.context.seen_enough < 0) - error = sx.context.seen_enough; -out: - return error; + return sx.context.seen_enough; + + return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 3590e10e3e62..48fd9402c432 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_ATTR_H__ #define __XFS_SCRUB_ATTR_H__ @@ -10,59 +10,15 @@ * Temporary storage for online scrub and repair of extended attributes. */ struct xchk_xattr_buf { - /* Size of @buf, in bytes. */ - size_t sz; + /* Bitmap of used space in xattr leaf blocks and shortform forks. */ + unsigned long *usedmap; - /* - * Memory buffer -- either used for extracting attr values while - * walking the attributes; or for computing attr block bitmaps when - * checking the attribute tree. - * - * Each bitmap contains enough bits to track every byte in an attr - * block (rounded up to the size of an unsigned long). The attr block - * used space bitmap starts at the beginning of the buffer; the free - * space bitmap follows immediately after; and we have a third buffer - * for storing intermediate bitmap results. - */ - uint8_t buf[]; -}; - -/* A place to store attribute values. */ -static inline uint8_t * -xchk_xattr_valuebuf( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; - - return ab->buf; -} - -/* A bitmap of space usage computed by walking an attr leaf block. */ -static inline unsigned long * -xchk_xattr_usedmap( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; + /* Bitmap of free space in xattr leaf blocks. */ + unsigned long *freemap; - return (unsigned long *)ab->buf; -} - -/* A bitmap of free space computed by walking attr leaf block free info. */ -static inline unsigned long * -xchk_xattr_freemap( - struct xfs_scrub *sc) -{ - return xchk_xattr_usedmap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} - -/* A bitmap used to hold temporary results. */ -static inline unsigned long * -xchk_xattr_dstmap( - struct xfs_scrub *sc) -{ - return xchk_xattr_freemap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} + /* Memory buffer used to extract xattr values. */ + void *value; + size_t value_sz; +}; #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index a255f09e9f0a..0c959be396ea 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -1,11 +1,12 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -13,27 +14,160 @@ #include "scrub/scrub.h" #include "scrub/bitmap.h" +#include <linux/interval_tree_generic.h> + +struct xbitmap_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint64_t bn_start; + + /* Last set bit of this interval. */ + uint64_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint64_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint64_t parameters. */ + +#define START(node) ((node)->bn_start) +#define LAST(node) ((node)->bn_last) + /* - * Set a range of this bitmap. Caller must ensure the range is not set. - * - * This is the logical equivalent of bitmap |= mask(start, len). + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline void +xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline struct xbitmap_node * +xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, + uint64_t last); + +static inline struct xbitmap_node * +xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, + uint64_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap_clear( + struct xbitmap *bitmap, + uint64_t start, + uint64_t len) +{ + struct xbitmap_node *bn; + struct xbitmap_node *new_bn; + uint64_t last = start + len - 1; + + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint64_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. */ int xbitmap_set( struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_range *bmr; + struct xbitmap_node *left; + struct xbitmap_node *right; + uint64_t last = start + len - 1; + int error; - bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); - if (!bmr) - return -ENOMEM; + /* Is this whole range already set? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); - INIT_LIST_HEAD(&bmr->list); - bmr->start = start; - bmr->len = len; - list_add_tail(&bmr->list, &bitmap->list); + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } return 0; } @@ -43,12 +177,11 @@ void xbitmap_destroy( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; - for_each_xbitmap_extent(bmr, n, bitmap) { - list_del(&bmr->list); - kfree(bmr); + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); } } @@ -57,27 +190,7 @@ void xbitmap_init( struct xbitmap *bitmap) { - INIT_LIST_HEAD(&bitmap->list); -} - -/* Compare two btree extents. */ -static int -xbitmap_range_cmp( - void *priv, - const struct list_head *a, - const struct list_head *b) -{ - struct xbitmap_range *ap; - struct xbitmap_range *bp; - - ap = container_of(a, struct xbitmap_range, list); - bp = container_of(b, struct xbitmap_range, list); - - if (ap->start > bp->start) - return 1; - if (ap->start < bp->start) - return -1; - return 0; + bitmap->xb_root = RB_ROOT_CACHED; } /* @@ -94,118 +207,26 @@ xbitmap_range_cmp( * * This is the logical equivalent of bitmap &= ~sub. */ -#define LEFT_ALIGNED (1 << 0) -#define RIGHT_ALIGNED (1 << 1) int xbitmap_disunion( struct xbitmap *bitmap, struct xbitmap *sub) { - struct list_head *lp; - struct xbitmap_range *br; - struct xbitmap_range *new_br; - struct xbitmap_range *sub_br; - uint64_t sub_start; - uint64_t sub_len; - int state; - int error = 0; + struct xbitmap_node *bn; + int error; - if (list_empty(&bitmap->list) || list_empty(&sub->list)) + if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) return 0; - ASSERT(!list_empty(&sub->list)); - - list_sort(NULL, &bitmap->list, xbitmap_range_cmp); - list_sort(NULL, &sub->list, xbitmap_range_cmp); - - /* - * Now that we've sorted both lists, we iterate bitmap once, rolling - * forward through sub and/or bitmap as necessary until we find an - * overlap or reach the end of either list. We do not reset lp to the - * head of bitmap nor do we reset sub_br to the head of sub. The - * list traversal is similar to merge sort, but we're deleting - * instead. In this manner we avoid O(n^2) operations. - */ - sub_br = list_first_entry(&sub->list, struct xbitmap_range, - list); - lp = bitmap->list.next; - while (lp != &bitmap->list) { - br = list_entry(lp, struct xbitmap_range, list); - - /* - * Advance sub_br and/or br until we find a pair that - * intersect or we run out of extents. - */ - while (sub_br->start + sub_br->len <= br->start) { - if (list_is_last(&sub_br->list, &sub->list)) - goto out; - sub_br = list_next_entry(sub_br, list); - } - if (sub_br->start >= br->start + br->len) { - lp = lp->next; - continue; - } - /* trim sub_br to fit the extent we have */ - sub_start = sub_br->start; - sub_len = sub_br->len; - if (sub_br->start < br->start) { - sub_len -= br->start - sub_br->start; - sub_start = br->start; - } - if (sub_len > br->len) - sub_len = br->len; - - state = 0; - if (sub_start == br->start) - state |= LEFT_ALIGNED; - if (sub_start + sub_len == br->start + br->len) - state |= RIGHT_ALIGNED; - switch (state) { - case LEFT_ALIGNED: - /* Coincides with only the left. */ - br->start += sub_len; - br->len -= sub_len; - break; - case RIGHT_ALIGNED: - /* Coincides with only the right. */ - br->len -= sub_len; - lp = lp->next; - break; - case LEFT_ALIGNED | RIGHT_ALIGNED: - /* Total overlap, just delete ex. */ - lp = lp->next; - list_del(&br->list); - kfree(br); - break; - case 0: - /* - * Deleting from the middle: add the new right extent - * and then shrink the left extent. - */ - new_br = kmalloc(sizeof(struct xbitmap_range), - XCHK_GFP_FLAGS); - if (!new_br) { - error = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&new_br->list); - new_br->start = sub_start + sub_len; - new_br->len = br->start + br->len - new_br->start; - list_add(&new_br->list, &br->list); - br->len = sub_start - br->start; - lp = lp->next; - break; - default: - ASSERT(0); - break; - } + for_each_xbitmap_extent(bn, sub) { + error = xbitmap_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; } -out: - return error; + return 0; } -#undef LEFT_ALIGNED -#undef RIGHT_ALIGNED /* * Record all btree blocks seen while iterating all records of a btree. @@ -242,6 +263,38 @@ out: * For the 300th record we just exit, with the list being [1, 4, 2, 3]. */ +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + /* * Record all the buffers pointed to by the btree cursor. Callers already * engaged in a btree walk should call this function to capture the list of @@ -304,12 +357,97 @@ uint64_t xbitmap_hweight( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; uint64_t ret = 0; - for_each_xbitmap_extent(bmr, n, bitmap) - ret += bmr->len; + for_each_xbitmap_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; return ret; } + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap_walk( + struct xbitmap *bitmap, + xbitmap_walk_fn fn, + void *priv) +{ + struct xbitmap_node *bn; + int error = 0; + + for_each_xbitmap_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +struct xbitmap_walk_bits { + xbitmap_walk_bits_fn fn; + void *priv; +}; + +/* Walk all the bits in a run. */ +static int +xbitmap_walk_bits_in_run( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xbitmap_walk_bits *wb = priv; + uint64_t i; + int error = 0; + + for (i = start; i < start + len; i++) { + error = wb->fn(i, wb->priv); + if (error) + break; + } + + return error; +} + +/* Call a function for every set bit in this bitmap. */ +int +xbitmap_walk_bits( + struct xbitmap *bitmap, + xbitmap_walk_bits_fn fn, + void *priv) +{ + struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; + + return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap_empty( + struct xbitmap *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap_test( + struct xbitmap *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 900646b72de1..84981724ecaf 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -1,31 +1,19 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap_range { - struct list_head list; - uint64_t start; - uint64_t len; -}; - struct xbitmap { - struct list_head list; + struct rb_root_cached xb_root; }; void xbitmap_init(struct xbitmap *bitmap); void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xbitmap_extent(bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) - -#define for_each_xbitmap_block(b, bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) - +int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); int xbitmap_set_btcur_path(struct xbitmap *bitmap, @@ -34,4 +22,93 @@ int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. + */ +typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, + void *priv); + +typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); +int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, + void *priv); + +bool xbitmap_empty(struct xbitmap *bitmap); +bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_set(&bitmap->agbitmap, start, len); +} + +static inline bool +xagb_bitmap_test( + struct xagb_bitmap *bitmap, + xfs_agblock_t start, + xfs_extlen_t *len) +{ + uint64_t biglen = *len; + bool ret; + + ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); + + if (start + biglen >= UINT_MAX) { + ASSERT(0); + biglen = UINT_MAX - start; + } + + *len = biglen; + return ret; +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap_walk_fn fn, void *priv) +{ + return xbitmap_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index dbbc7037074c..87ab9f95a487 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -31,12 +31,15 @@ xchk_setup_inode_bmap( { int error; - error = xchk_get_inode(sc); + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_iget_for_scrubbing(sc); if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); /* * We don't want any ephemeral data fork updates sitting around @@ -47,6 +50,9 @@ xchk_setup_inode_bmap( sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + sc->ilock_flags |= XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + inode_dio_wait(VFS_I(sc->ip)); /* @@ -90,11 +96,23 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + + /* Incore extent tree cursor */ struct xfs_iext_cursor icur; - xfs_fileoff_t lastoff; + + /* Previous fork mapping that we examined */ + struct xfs_bmbt_irec prev_rec; + + /* Is this a realtime fork? */ bool is_rt; + + /* May mappings point to shared space? */ bool is_shared; + + /* Was the incore extent tree loaded? */ bool was_loaded; + + /* Which inode fork are we checking? */ int whichfork; }; @@ -147,49 +165,7 @@ xchk_bmap_get_rmap( return has_rmap; } -static inline bool -xchk_bmap_has_prev( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) - return false; - if (got.br_startoff + got.br_blockcount != irec->br_startoff) - return false; - if (got.br_startblock + got.br_blockcount != irec->br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -static inline bool -xchk_bmap_has_next( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) - return false; - if (irec->br_startoff + irec->br_blockcount != got.br_startoff) - return false; - if (irec->br_startblock + irec->br_blockcount != got.br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -/* Make sure that we have rmapbt records for this extent. */ +/* Make sure that we have rmapbt records for this data/attr fork extent. */ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, @@ -198,41 +174,39 @@ xchk_bmap_xref_rmap( { struct xfs_rmap_irec rmap; unsigned long long rmap_end; - uint64_t owner; + uint64_t owner = info->sc->ip->i_ino; if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) return; - if (info->whichfork == XFS_COW_FORK) - owner = XFS_RMAP_OWN_COW; - else - owner = info->sc->ip->i_ino; - /* Find the rmap record for this irec. */ if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) return; - /* Check the rmap. */ + /* + * The rmap must be an exact match for this incore file mapping record, + * which may have arisen from multiple ondisk records. + */ + if (rmap.rm_startblock != agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap.rm_startblock > agbno || - agbno + irec->br_blockcount > rmap_end) + if (rmap_end != agbno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check the logical offsets if applicable. CoW staging extents - * don't track logical offsets since the mappings only exist in - * memory. - */ - if (info->whichfork != XFS_COW_FORK) { - rmap_end = (unsigned long long)rmap.rm_offset + - rmap.rm_blockcount; - if (rmap.rm_offset > irec->br_startoff || - irec->br_startoff + irec->br_blockcount > rmap_end) - xchk_fblock_xref_set_corrupt(info->sc, - info->whichfork, irec->br_startoff); - } + /* Check the logical offsets. */ + if (rmap.rm_offset != irec->br_startoff) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount; + if (rmap_end != irec->br_startoff + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -244,8 +218,7 @@ xchk_bmap_xref_rmap( * records because the blocks are owned (on-disk) by the refcountbt, * which doesn't track unwritten state. */ - if (owner != XFS_RMAP_OWN_COW && - !!(irec->br_state == XFS_EXT_UNWRITTEN) != + if (!!(irec->br_state == XFS_EXT_UNWRITTEN) != !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -257,34 +230,60 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); +} + +/* Make sure that we have rmapbt records for this COW fork extent. */ +STATIC void +xchk_bmap_xref_rmap_cow( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner = XFS_RMAP_OWN_COW; + + if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + return; + + /* Find the rmap record for this irec. */ + if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; /* - * If the rmap starts before this bmbt record, make sure there's a bmbt - * record for the previous offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * CoW staging extents are owned by the refcount btree, so the rmap + * can start before and end after the physical space allocated to this + * mapping. There are no offsets to check. */ - if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && - !xchk_bmap_has_prev(info, irec)) { + if (rmap.rm_startblock > agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap_end < agbno + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ + if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } /* - * If the rmap ends after this bmbt record, make sure there's a bmbt - * record for the next offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * No flags allowed. Note that the (in-memory) CoW fork distinguishes + * between unwritten and written extents, but we don't track that in + * the rmap records because the blocks are owned (on-disk) by the + * refcountbt, which doesn't track unwritten state. */ - rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (info->whichfork != XFS_COW_FORK && - rmap_end > agbno + irec->br_blockcount && - !xchk_bmap_has_next(info, irec)) { + if (rmap.rm_flags & XFS_RMAP_ATTR_FORK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_UNWRITTEN) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } } /* Cross-reference a single rtdev extent record. */ @@ -305,6 +304,7 @@ xchk_bmap_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; struct xfs_mount *mp = info->sc->mp; xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -322,17 +322,35 @@ xchk_bmap_iextent_xref( xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); - xchk_bmap_xref_rmap(info, irec, agbno); switch (info->whichfork) { case XFS_DATA_FORK: - if (xfs_is_reflink_inode(info->sc->ip)) - break; - fallthrough; + xchk_bmap_xref_rmap(info, irec, agbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + } + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; case XFS_ATTR_FORK: + xchk_bmap_xref_rmap(info, irec, agbno); + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &oinfo); xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); break; case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, agbno); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &XFS_RMAP_OINFO_COW); xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); xchk_xref_is_not_shared(info->sc, agbno, @@ -382,7 +400,8 @@ xchk_bmap_iextent( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. */ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -392,15 +411,7 @@ xchk_bmap_iextent( xchk_bmap_dirattr_extent(ip, info, irec); - /* There should never be a "hole" extent in either extent list. */ - if (irec->br_startblock == HOLESTARTBLOCK) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ - if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); if (info->is_rt && !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -468,6 +479,12 @@ xchk_bmapbt_rec( return 0; xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) { + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; + } + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, &iext_irec) || irec.br_startoff != iext_irec.br_startoff || @@ -618,45 +635,57 @@ xchk_bmap_check_ag_rmaps( return error; } -/* Make sure each rmap has a corresponding bmbt entry. */ -STATIC int -xchk_bmap_check_rmaps( - struct xfs_scrub *sc, - int whichfork) +/* + * Decide if we want to walk every rmap btree in the fs to make sure that each + * rmap for this file fork has corresponding bmbt entries. + */ +static bool +xchk_bmap_want_check_rmaps( + struct xchk_bmap_info *info) { - struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); - struct xfs_perag *pag; - xfs_agnumber_t agno; - bool zero_size; - int error; + struct xfs_scrub *sc = info->sc; + struct xfs_ifork *ifp; - if (!xfs_has_rmapbt(sc->mp) || - whichfork == XFS_COW_FORK || - (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - return 0; + if (!xfs_has_rmapbt(sc->mp)) + return false; + if (info->whichfork == XFS_COW_FORK) + return false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; /* Don't support realtime rmap checks yet. */ - if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) - return 0; - - ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + if (info->is_rt) + return false; /* - * Only do this for complex maps that are in btree format, or for - * situations where we would seem to have a size but zero extents. - * The inode repair code can zap broken iforks, which means we have - * to flag this bmap as corrupt if there are rmaps that need to be - * reattached. + * The inode repair code zaps broken inode forks by resetting them back + * to EXTENTS format and zero extent records. If we encounter a fork + * in this state along with evidence that the fork isn't supposed to be + * empty, we need to scan the reverse mappings to decide if we're going + * to rebuild the fork. Data forks with nonzero file size are scanned. + * xattr forks are never empty of content, so they are always scanned. */ + ifp = xfs_ifork_ptr(sc->ip, info->whichfork); + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { + if (info->whichfork == XFS_DATA_FORK && + i_size_read(VFS_I(sc->ip)) == 0) + return false; - if (whichfork == XFS_DATA_FORK) - zero_size = i_size_read(VFS_I(sc->ip)) == 0; - else - zero_size = false; + return true; + } - if (ifp->if_format != XFS_DINODE_FMT_BTREE && - (zero_size || ifp->if_nextents > 0)) - return 0; + return false; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rmaps( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); @@ -683,7 +712,8 @@ xchk_bmap_iextent_delalloc( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. */ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -697,6 +727,101 @@ xchk_bmap_iextent_delalloc( irec->br_startoff); } +/* Decide if this individual fork mapping is ok. */ +static bool +xchk_bmap_iext_mapping( + struct xchk_bmap_info *info, + const struct xfs_bmbt_irec *irec) +{ + /* There should never be a "hole" extent in either extent list. */ + if (irec->br_startblock == HOLESTARTBLOCK) + return false; + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + return false; + return true; +} + +/* Are these two mappings contiguous with each other? */ +static inline bool +xchk_are_bmaps_contiguous( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't try to combine unallocated mappings. */ + if (!xfs_bmap_is_real_extent(b1)) + return false; + if (!xfs_bmap_is_real_extent(b2)) + return false; + + /* Does b2 come right after b1 in the logical and physical range? */ + if (b1->br_startoff + b1->br_blockcount != b2->br_startoff) + return false; + if (b1->br_startblock + b1->br_blockcount != b2->br_startblock) + return false; + if (b1->br_state != b2->br_state) + return false; + return true; +} + +/* + * Walk the incore extent records, accumulating consecutive contiguous records + * into a single incore mapping. Returns true if @irec has been set to a + * mapping or false if there are no more mappings. Caller must ensure that + * @info.icur is zeroed before the first call. + */ +static int +xchk_bmap_iext_iter( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_filblks_t prev_len; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + /* Advance to the next iextent record and check the mapping. */ + xfs_iext_next(ifp, &info->icur); + if (!xfs_iext_get_extent(ifp, &info->icur, irec)) + return false; + + if (!xchk_bmap_iext_mapping(info, irec)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return false; + } + + /* + * Iterate subsequent iextent records and merge them with the one + * that we just read, if possible. + */ + prev_len = irec->br_blockcount; + while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { + if (!xchk_are_bmaps_contiguous(irec, &got)) + break; + + if (!xchk_bmap_iext_mapping(info, &got)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + got.br_startoff); + return false; + } + + /* + * Notify the user of mergeable records in the data or attr + * forks. CoW forks only exist in memory so we ignore them. + */ + if (info->whichfork != XFS_COW_FORK && + prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + + irec->br_blockcount += got.br_blockcount; + prev_len = got.br_blockcount; + xfs_iext_next(ifp, &info->icur); + } + + return true; +} + /* * Scrub an inode fork's block mappings. * @@ -776,10 +901,15 @@ xchk_bmap( if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) goto out; - /* Scrub extent records. */ - info.lastoff = 0; - ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &info.icur, &irec) { + /* + * Scrub extent records. We use a special iterator function here that + * combines adjacent mappings if they are logically and physically + * contiguous. For large allocations that require multiple bmbt + * records, this reduces the number of cross-referencing calls, which + * reduces runtime. Cross referencing with the rmap is simpler because + * the rmap must match the combined mapping exactly. + */ + while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; @@ -794,12 +924,14 @@ xchk_bmap( xchk_bmap_iextent_delalloc(ip, &info, &irec); else xchk_bmap_iextent(ip, &info, &irec); - info.lastoff = irec.br_startoff + irec.br_blockcount; + memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec)); } - error = xchk_bmap_check_rmaps(sc, whichfork); - if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + if (xchk_bmap_want_check_rmaps(&info)) { + error = xchk_bmap_check_rmaps(sc, whichfork); + if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) + goto out; + } out: return error; } diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 0fd36d5b4646..1935b9ce1885 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -36,6 +36,7 @@ __xchk_btree_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -118,6 +119,16 @@ xchk_btree_xref_set_corrupt( __return_address); } +void +xchk_btree_set_preen( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN, + __return_address); +} + /* * Make sure this record is in order and doesn't stray outside of the parent * keys. @@ -140,29 +151,30 @@ xchk_btree_rec( trace_xchk_btree_rec(bs->sc, cur, 0); - /* If this isn't the first record, are they in order? */ - if (cur->bc_levels[0].ptr > 1 && + /* Are all records across all record blocks in order? */ + if (bs->lastrec_valid && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) xchk_btree_set_corrupt(bs->sc, cur, 0); memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + bs->lastrec_valid = true; if (cur->bc_nlevels == 1) return; - /* Is this at least as large as the parent low key? */ + /* Is low_key(rec) at least as large as the parent low key? */ cur->bc_ops->init_key_from_rec(&key, rec); keyblock = xfs_btree_get_block(cur, 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is high_key(rec) no larger than the parent high key? */ cur->bc_ops->init_high_key_from_rec(&hkey, rec); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, &hkey)) xchk_btree_set_corrupt(bs->sc, cur, 1); } @@ -187,29 +199,30 @@ xchk_btree_key( trace_xchk_btree_key(bs->sc, cur, level); - /* If this isn't the first key, are they in order? */ - if (cur->bc_levels[level].ptr > 1 && - !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key)) + /* Are all low keys across all node blocks in order? */ + if (bs->lastkey[level - 1].valid && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key)) xchk_btree_set_corrupt(bs->sc, cur, level); - memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len); + memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len); + bs->lastkey[level - 1].valid = true; if (level + 1 >= cur->bc_nlevels) return; - /* Is this at least as large as the parent low key? */ + /* Is this block's low key at least as large as the parent low key? */ keyblock = xfs_btree_get_block(cur, level + 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is this block's high key no larger than the parent high key? */ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, key)) xchk_btree_set_corrupt(bs->sc, cur, level); } @@ -389,7 +402,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) bs->cur = NULL; - xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo); + xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL; @@ -519,6 +532,48 @@ xchk_btree_check_minrecs( } /* + * If this btree block has a parent, make sure that the parent's keys capture + * the keyspace contained in this block. + */ +STATIC void +xchk_btree_block_check_keys( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_key; + union xfs_btree_key *block_high_key; + union xfs_btree_key *parent_low_key, *parent_high_key; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level == cur->bc_nlevels - 1) + return; + + xfs_btree_get_keys(cur, block, &block_key); + + /* Make sure the low key of this block matches the parent. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return; + } + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Make sure the high key of this block matches the parent. */ + parent_high_key = xfs_btree_high_key_addr(cur, + cur->bc_levels[level + 1].ptr, parent_block); + block_high_key = xfs_btree_high_key_from_key(cur, &block_key); + if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -569,7 +624,12 @@ xchk_btree_get_block( * Check the block's siblings; this function absorbs error codes * for us. */ - return xchk_btree_block_check_siblings(bs, *pblock); + error = xchk_btree_block_check_siblings(bs, *pblock); + if (error) + return error; + + xchk_btree_block_check_keys(bs, level, *pblock); + return 0; } /* @@ -601,7 +661,7 @@ xchk_btree_block_keys( parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) + if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -612,7 +672,7 @@ xchk_btree_block_keys( high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) + if (xfs_btree_keycmp_ne(cur, high_bk, high_pk)) xchk_btree_set_corrupt(bs->sc, cur, 1); } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index da61a53a0b61..9d7b9ee8bef4 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BTREE_H__ #define __XFS_SCRUB_BTREE_H__ @@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc, /* Check for btree corruption. */ void xchk_btree_set_corrupt(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int level); +void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level); /* Check for btree xref discrepancies. */ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, @@ -29,6 +31,11 @@ typedef int (*xchk_btree_rec_fn)( struct xchk_btree *bs, const union xfs_btree_rec *rec); +struct xchk_btree_key { + union xfs_btree_key key; + bool valid; +}; + struct xchk_btree { /* caller-provided scrub state */ struct xfs_scrub *sc; @@ -38,11 +45,12 @@ struct xchk_btree { void *private; /* internal scrub state */ + bool lastrec_valid; union xfs_btree_rec lastrec; struct list_head to_check; /* this element must come last! */ - union xfs_btree_key lastkey[]; + struct xchk_btree_key lastkey[]; }; /* diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 848a8e32e56f..9aa79665c608 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -75,6 +75,7 @@ __xchk_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), @@ -130,6 +131,7 @@ __xchk_fblock_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -396,26 +398,19 @@ want_ag_read_header_failure( } /* - * Grab the perag structure and all the headers for an AG. + * Grab the AG header buffers for the attached perag structure. * * The headers should be released by xchk_ag_free, but as a fail safe we attach * all the buffers we grab to the scrub transaction so they'll all be freed - * when we cancel it. Returns ENOENT if we can't grab the perag structure. + * when we cancel it. */ -int -xchk_ag_read_headers( +static inline int +xchk_perag_read_headers( struct xfs_scrub *sc, - xfs_agnumber_t agno, struct xchk_ag *sa) { - struct xfs_mount *mp = sc->mp; int error; - ASSERT(!sa->pag); - sa->pag = xfs_perag_get(mp, agno); - if (!sa->pag) - return -ENOENT; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -427,6 +422,104 @@ xchk_ag_read_headers( return 0; } +/* + * Grab the AG headers for the attached perag structure and wait for pending + * intents to drain. + */ +static int +xchk_perag_drain_and_lock( + struct xfs_scrub *sc) +{ + struct xchk_ag *sa = &sc->sa; + int error = 0; + + ASSERT(sa->pag != NULL); + ASSERT(sa->agi_bp == NULL); + ASSERT(sa->agf_bp == NULL); + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + error = xchk_perag_read_headers(sc, sa); + if (error) + return error; + + /* + * If we've grabbed an inode for scrubbing then we assume that + * holding its ILOCK will suffice to coordinate with any intent + * chains involving this inode. + */ + if (sc->ip) + return 0; + + /* + * Decide if this AG is quiet enough for all metadata to be + * consistent with each other. XFS allows the AG header buffer + * locks to cycle across transaction rolls while processing + * chains of deferred ops, which means that there could be + * other threads in the middle of processing a chain of + * deferred ops. For regular operations we are careful about + * ordering operations to prevent collisions between threads + * (which is why we don't need a per-AG lock), but scrub and + * repair have to serialize against chained operations. + * + * We just locked all the AG headers buffers; now take a look + * to see if there are any intents in progress. If there are, + * drop the AG headers and wait for the intents to drain. + * Since we hold all the AG header locks for the duration of + * the scrub, this is the only time we have to sample the + * intents counter; any threads increasing it after this point + * can't possibly be in the middle of a chain of AG metadata + * updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_perag_intent_busy(sa->pag)) + return 0; + + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_perag_intent_drain(sa->pag); + if (error == -ERESTARTSYS) + error = -EINTR; + } while (!error); + + return error; +} + +/* + * Grab the per-AG structure, grab all AG header buffers, and wait until there + * aren't any pending intents. Returns -ENOENT if we can't grab the perag + * structure. + */ +int +xchk_ag_read_headers( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; + + return xchk_perag_drain_and_lock(sc); +} + /* Release all the AG btree cursors. */ void xchk_ag_btcur_free( @@ -550,6 +643,14 @@ xchk_ag_init( /* Per-scrubber setup functions */ +void +xchk_trans_cancel( + struct xfs_scrub *sc) +{ + xfs_trans_cancel(sc->tp); + sc->tp = NULL; +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -625,80 +726,273 @@ xchk_checkpoint_log( return 0; } +/* Verify that an inode is allocated ondisk, then return its cached inode. */ +int +xchk_iget( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_inode **ipp) +{ + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); +} + /* - * Given an inode and the scrub control structure, grab either the - * inode referenced in the control structure or the inode passed in. - * The inode is not locked. + * Try to grab an inode in a manner that avoids races with physical inode + * allocation. If we can't, return the locked AGI buffer so that the caller + * can single-step the loading process to see where things went wrong. + * Callers must have a valid scrub transaction. + * + * If the iget succeeds, return 0, a NULL AGI, and the inode. + * + * If the iget fails, return the error, the locked AGI, and a NULL inode. This + * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are + * no longer allocated; or any other corruption or runtime error. + * + * If the AGI read fails, return the error, a NULL AGI, and NULL inode. + * + * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode. */ int -xchk_get_inode( +xchk_iget_agi( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_buf **agi_bpp, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_perag *pag; + int error; + + ASSERT(sc->tp != NULL); + +again: + *agi_bpp = NULL; + *ipp = NULL; + error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Attach the AGI buffer to the scrub transaction to avoid deadlocks + * in the iget cache miss path. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + xfs_perag_put(pag); + if (error) + return error; + + error = xfs_iget(mp, tp, inum, + XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + if (error == -EAGAIN) { + /* + * The inode may be in core but temporarily unavailable and may + * require the AGI buffer before it can be returned. Drop the + * AGI buffer and retry the lookup. + * + * Incore lookup will fail with EAGAIN on a cache hit if the + * inode is queued to the inactivation list. The inactivation + * worker may remove the inode from the unlinked list and hence + * needs the AGI. + * + * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN + * to allow inodegc to make progress and move the inode to + * IRECLAIMABLE state where xfs_iget will be able to return it + * again if it can lock the inode. + */ + xfs_trans_brelse(tp, *agi_bpp); + delay(1); + goto again; + } + if (error) + return error; + + /* We got the inode, so we can release the AGI. */ + ASSERT(*ipp != NULL); + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + return 0; +} + +/* Install an inode that we opened by handle for scrubbing. */ +int +xchk_install_handle_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + xchk_irele(sc, ip); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + +/* + * In preparation to scrub metadata structures that hang off of an inode, + * grab either the inode referenced in the scrub control structure or the + * inode passed in. If the inumber does not reference an allocated inode + * record, the function returns ENOENT to end the scrub early. The inode + * is not locked. + */ +int +xchk_iget_for_scrubbing( struct xfs_scrub *sc) { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag; + struct xfs_buf *agi_bp; struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + ASSERT(sc->tp == NULL); + /* We want to scan the inode we already had opened. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { sc->ip = ip_in; return 0; } - /* Look up the inode, see if the generation number matches. */ + /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) return -ENOENT; - error = xfs_iget(mp, NULL, sc->sm->sm_ino, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); - switch (error) { - case -ENOENT: - /* Inode doesn't exist, just bail out. */ + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_inode(sc, ip); + if (error == -ENOENT) return error; - case 0: - /* Got an inode, continue. */ - break; - case -EINVAL: + if (error != -EINVAL) + goto out_error; + + /* + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is a + * record, and it says this inode is free. + * + * We want to look up this inode in the inobt to distinguish two + * scenarios: (1) the inobt says the inode is free, in which case + * there's nothing to do; and (2) the inobt says the inode is + * allocated, but loading it failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity + * in this AG. Retry the iget in case someone allocated a new inode + * after the first iget failed. + */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the inode, so install it. */ + xchk_trans_cancel(sc); + return xchk_install_handle_inode(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; + } + + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt thinks this the inode neither can exist inside the + * filesystem nor is allocated, return ENOENT to signal that the check + * can be skipped. + * + * If the lookup returns corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters any other error, exit to userspace. + * + * If the lookup succeeds, something else must be very wrong in the fs + * such that setting up the incore inode failed in some strange way. + * Treat those as corruptions. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; + if (!error) + error = -EFSCORRUPTED; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; +} + +/* Release an inode, possibly dropping it in the process. */ +void +xchk_irele( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (current->journal_info != NULL) { + ASSERT(current->journal_info == sc->tp); + /* - * -EINVAL with IGET_UNTRUSTED could mean one of several - * things: userspace gave us an inode number that doesn't - * correspond to fs space, or doesn't have an inobt entry; - * or it could simply mean that the inode buffer failed the - * read verifiers. + * If we are in a transaction, we /cannot/ drop the inode + * ourselves, because the VFS will trigger writeback, which + * can require a transaction. Clear DONTCACHE to force the + * inode to the LRU, where someone else can take care of + * dropping it. * - * Try just the inode mapping lookup -- if it succeeds, then - * the inode buffer verifier failed and something needs fixing. - * Otherwise, we really couldn't find it so tell userspace - * that it no longer exists. + * Note that when we grabbed our reference to the inode, it + * could have had an active ref and DONTCACHE set if a sysadmin + * is trying to coerce a change in file access mode. icache + * hits do not clear DONTCACHE, so we must do it here. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); - if (pag) { - error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); - xfs_perag_put(pag); - if (error) - return -ENOENT; - } - error = -EFSCORRUPTED; - fallthrough; - default: - trace_xchk_op_error(sc, - XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), - XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), - error, __return_address); - return error; - } - if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { - xfs_irele(ip); - return -ENOENT; + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); + } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { + /* + * If this is the last reference to the inode and the caller + * permits it, set DONTCACHE to avoid thrashing. + */ + d_mark_dontcache(VFS_I(ip)); } - sc->ip = ip; - return 0; + xfs_irele(ip); } -/* Set us up to scrub a file's contents. */ +/* + * Set us up to scrub metadata mapped by a file's fork. Callers must not use + * this to operate on user-accessible regular file data because the MMAPLOCK is + * not taken. + */ int xchk_setup_inode_contents( struct xfs_scrub *sc, @@ -706,13 +1000,14 @@ xchk_setup_inode_contents( { int error; - error = xchk_get_inode(sc); + error = xchk_iget_for_scrubbing(sc); if (error) return error; - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + /* Lock the inode so the VFS cannot touch this file. */ + sc->ilock_flags = XFS_IOLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_trans_alloc(sc, resblks); if (error) goto out; @@ -869,28 +1164,6 @@ xchk_metadata_inode_forks( return 0; } -/* - * Try to lock an inode in violation of the usual locking order rules. For - * example, trying to get the IOLOCK while in transaction context, or just - * plain breaking AG-order or inode-order inode locking rules. Either way, - * the only way to avoid an ABBA deadlock is to use trylock and back off if - * we can't. - */ -int -xchk_ilock_inverted( - struct xfs_inode *ip, - uint lock_mode) -{ - int i; - - for (i = 0; i < 20; i++) { - if (xfs_ilock_nowait(ip, lock_mode)) - return 0; - delay(1); - } - return -EDEADLOCK; -} - /* Pause background reaping of resources. */ void xchk_stop_reaping( @@ -916,3 +1189,25 @@ xchk_start_reaping( } sc->flags &= ~XCHK_REAPING_DISABLED; } + +/* + * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub + * operation. Callers must not hold any locks that intersect with the CPU + * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs + * to change kernel code. + */ +void +xchk_fsgates_enable( + struct xfs_scrub *sc, + unsigned int scrub_fsgates) +{ + ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL)); + ASSERT(!(sc->flags & scrub_fsgates)); + + trace_xchk_fsgates_enable(sc, scrub_fsgates); + + if (scrub_fsgates & XCHK_FSGATES_DRAIN) + xfs_drain_wait_enable(); + + sc->flags |= scrub_fsgates; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index b73648d81d23..18b5f2b62f13 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ @@ -32,6 +32,8 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +void xchk_trans_cancel(struct xfs_scrub *sc); + bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, @@ -72,6 +74,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); /* Setup functions */ +int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); @@ -132,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); -int xchk_get_inode(struct xfs_scrub *sc); +int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); +int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, + struct xfs_buf **agi_bpp, struct xfs_inode **ipp); +void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + /* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. @@ -147,8 +156,21 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } int xchk_metadata_inode_forks(struct xfs_scrub *sc); -int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); void xchk_stop_reaping(struct xfs_scrub *sc); void xchk_start_reaping(struct xfs_scrub *sc); +/* + * Setting up a hook to wait for intents to drain is costly -- we have to take + * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it + * up, and again to tear it down. These costs add up quickly, so we only want + * to enable the drain waiter if the drain actually detected a conflict with + * running intent chains. + */ +static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) +{ + return sc->flags & XCHK_NEED_DRAIN; +} + +void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index d17cee177085..82b150d3b8b7 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -39,6 +39,7 @@ xchk_da_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 1f3515c6d5a8..4f8c2138a1ec 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_DABTREE_H__ #define __XFS_SCRUB_DABTREE_H__ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d1b0f23c2c59..0b491784b759 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" +#include "scrub/readdir.h" /* Set us up to scrub directories. */ int @@ -31,168 +32,114 @@ xchk_setup_directory( /* Scrub a directory entry. */ -struct xchk_dir_ctx { - /* VFS fill-directory iterator */ - struct dir_context dir_iter; - - struct xfs_scrub *sc; -}; - -/* Check that an inode's mode matches a given DT_ type. */ -STATIC int +/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ +STATIC void xchk_dir_check_ftype( - struct xchk_dir_ctx *sdc, + struct xfs_scrub *sc, xfs_fileoff_t offset, - xfs_ino_t inum, - int dtype) + struct xfs_inode *ip, + int ftype) { - struct xfs_mount *mp = sdc->sc->mp; - struct xfs_inode *ip; - int ino_dtype; - int error = 0; + struct xfs_mount *mp = sc->mp; if (!xfs_has_ftype(mp)) { - if (dtype != DT_UNKNOWN && dtype != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - goto out; - } - - /* - * Grab the inode pointed to by the dirent. We release the - * inode before we cancel the scrub transaction. Since we're - * don't know a priori that releasing the inode won't trigger - * eofblocks cleanup (which allocates what would be a nested - * transaction), we can't use DONTCACHE here because DONTCACHE - * inodes can trigger immediate inactive cleanup of the inode. - * - * If _iget returns -EINVAL or -ENOENT then the child inode number is - * garbage and the directory is corrupt. If the _iget returns - * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a - * cross referencing error. Any other error is an operational error. - */ - error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); - if (error == -EINVAL || error == -ENOENT) { - error = -EFSCORRUPTED; - xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error); - goto out; + if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return; } - if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) - goto out; - /* Convert mode to the DT_* values that dir_emit uses. */ - ino_dtype = xfs_dir3_get_dtype(mp, - xfs_mode_to_ftype(VFS_I(ip)->i_mode)); - if (ino_dtype != dtype) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - xfs_irele(ip); -out: - return error; + if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* * Scrub a single directory entry. * - * We use the VFS directory iterator (i.e. readdir) to call this - * function for every directory entry in a directory. Once we're here, - * we check the inode number to make sure it's sane, then we check that - * we can look up this filename. Finally, we check the ftype. + * Check the inode number to make sure it's sane, then we check that we can + * look up this filename. Finally, we check the ftype. */ -STATIC bool +STATIC int xchk_dir_actor( - struct dir_context *dir_iter, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xfs_mount *mp; + struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; - struct xchk_dir_ctx *sdc; - struct xfs_name xname; xfs_ino_t lookup_ino; xfs_dablk_t offset; - bool checked_ftype = false; int error = 0; - sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter); - ip = sdc->sc->ip; - mp = ip->i_mount; offset = xfs_dir2_db_to_da(mp->m_dir_geo, - xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos)); - if (xchk_should_terminate(sdc->sc, &error)) - return !error; + if (xchk_should_terminate(sc, &error)) + return error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } /* Does this name make sense? */ - if (!xfs_dir2_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + if (!xfs_dir2_namecheck(name->name, name->len)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - if (!strncmp(".", name, namelen)) { + if (!strncmp(".", name->name, name->len)) { /* If this is "." then check that the inum matches the dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - } else if (!strncmp("..", name, namelen)) { + if (ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else if (!strncmp("..", name->name, name->len)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); + if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* Verify that we can look up this name by hash. */ - xname.name = name; - xname.len = namelen; - xname.type = XFS_DIR3_FT_UNKNOWN; - - error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + error = xchk_dir_lookup(sc, dp, name, &lookup_ino); /* ENOENT means the hash lookup failed and the dir is corrupt */ if (error == -ENOENT) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) goto out; if (lookup_ino != ino) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - /* Verify the file type. This function absorbs error codes. */ - if (!checked_ftype) { - error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type); - if (error) - goto out; - } -out: /* - * A negative error code returned here is supposed to cause the - * dir_emit caller (xfs_readdir) to abort the directory iteration - * and return zero to xchk_directory. + * Grab the inode pointed to by the dirent. We release the inode + * before we cancel the scrub transaction. + * + * If _iget returns -EINVAL or -ENOENT then the child inode number is + * garbage and the directory is corrupt. If the _iget returns + * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a + * cross referencing error. Any other error is an operational error. */ - if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return false; - return !error; + error = xchk_iget(sc, ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + error = -EFSCORRUPTED; + xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error)) + goto out; + + xchk_dir_check_ftype(sc, offset, ip, name->type); + xchk_irele(sc, ip); +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return error; } /* Scrub a directory btree record. */ @@ -201,6 +148,7 @@ xchk_dir_rec( struct xchk_da_btree *ds, int level) { + struct xfs_name dname = { }; struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_mount *mp = ds->state->mp; struct xfs_inode *dp = ds->dargs.dp; @@ -297,7 +245,11 @@ xchk_dir_rec( xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out_relse; } - calc_hash = xfs_da_hashname(dent->name, dent->namelen); + + /* Does the directory hash match? */ + dname.name = dent->name; + dname.len = dent->namelen; + calc_hash = xfs_dir2_hashname(mp, &dname); if (calc_hash != hash) xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); @@ -803,14 +755,7 @@ int xchk_directory( struct xfs_scrub *sc) { - struct xchk_dir_ctx sdc = { - .dir_iter.actor = xchk_dir_actor, - .dir_iter.pos = 0, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - int error = 0; + int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; @@ -818,7 +763,7 @@ xchk_directory( /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } /* Check directory tree structure */ @@ -827,7 +772,7 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; + return 0; /* Check the freespace. */ error = xchk_directory_blocks(sc); @@ -835,44 +780,11 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - - /* - * Check that every dirent we see can also be looked up by hash. - * Userspace usually asks for a 32k buffer, so we will too. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - sc->ip->i_disk_size); - - /* - * Look up every name in this directory by hash. - * - * Use the xfs_readdir function to call xchk_dir_actor on - * every directory entry in this directory. In _actor, we check - * the name, inode number, and ftype (if applicable) of the - * entry. xfs_readdir uses the VFS filldir functions to provide - * iteration context. - * - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to reuse the _readdir and - * _dir_lookup routines, which do their own ILOCK locking. - */ - oldpos = 0; - sc->ilock_flags &= ~XFS_ILOCK_EXCL; - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - while (true) { - error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out; - if (oldpos == sdc.dir_iter.pos) - break; - oldpos = sdc.dir_iter.pos; - } + return 0; -out: + /* Look up every name in this directory by hash. */ + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); + if (error == -ECANCELED) + error = 0; return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index f0c7f41897b9..faa315be7978 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -130,6 +130,13 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index aa65ec88a0c0..d2b2a1cb6533 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -1,12 +1,14 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index d0b938d3d028..66a273f8585b 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_HEALTH_H__ #define __XFS_SCRUB_HEALTH_H__ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e312be7cd375..575f22a02ebe 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -32,6 +32,8 @@ int xchk_setup_ag_iallocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); } @@ -49,83 +51,237 @@ struct xchk_iallocbt { }; /* - * If we're checking the finobt, cross-reference with the inobt. - * Otherwise we're checking the inobt; if there is an finobt, make sure - * we have a record or not depending on freecount. + * Does the finobt have a record for this inode with the same hole/free state? + * This is a bit complicated because of the following: + * + * - The finobt need not have a record if all inodes in the inobt record are + * allocated. + * - The finobt need not have a record if all inodes in the inobt record are + * free. + * - The finobt need not have a record if the inobt record says this is a hole. + * This likely doesn't happen in practice. */ -static inline void -xchk_iallocbt_chunk_xref_other( +STATIC int +xchk_inobt_xref_finobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + bool free, + bool hole) +{ + struct xfs_inobt_rec_incore frec; + struct xfs_btree_cur *cur = sc->sa.fino_cur; + bool ffree, fhole; + unsigned int frec_idx, fhole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &frec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's a finobt record; free and hole status must match. */ + frec_idx = agino - frec.ir_startino; + ffree = frec.ir_free & (1ULL << frec_idx); + fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec.ir_holemask & (1U << fhole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* inobt record is fully allocated */ + if (irec->ir_free == 0) + return 0; + + /* inobt record is totally unallocated */ + if (irec->ir_free == XFS_INOBT_ALL_FREE) + return 0; + + /* inobt record says this is a hole */ + if (hole) + return 0; + + /* finobt doesn't care about allocated inodes */ + if (!free) + return 0; + + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; +} + +/* + * Make sure that each inode of this part of an inobt record has the same + * sparse and free status as the finobt. + */ +STATIC void +xchk_inobt_chunk_xref_finobt( struct xfs_scrub *sc, struct xfs_inobt_rec_incore *irec, - xfs_agino_t agino) + xfs_agino_t agino, + unsigned int nr_inodes) { - struct xfs_btree_cur **pcur; - bool has_irec; + xfs_agino_t i; + unsigned int rec_idx; int error; - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - pcur = &sc->sa.ino_cur; - else - pcur = &sc->sa.fino_cur; - if (!(*pcur)) - return; - error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec); - if (!xchk_should_check_xref(sc, &error, pcur)) + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT); + + if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) return; - if (((irec->ir_freecount > 0 && !has_irec) || - (irec->ir_freecount == 0 && has_irec))) - xchk_btree_xref_set_corrupt(sc, *pcur, 0); + + for (i = agino, rec_idx = agino - irec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool free, hole; + unsigned int hole_idx; + + free = irec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec->ir_holemask & (1U << hole_idx); + + error = xchk_inobt_xref_finobt(sc, irec, i, free, hole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + } +} + +/* + * Does the inobt have a record for this inode with the same hole/free state? + * The inobt must always have a record if there's a finobt record. + */ +STATIC int +xchk_finobt_xref_inobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *frec, + xfs_agino_t agino, + bool ffree, + bool fhole) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_btree_cur *cur = sc->sa.ino_cur; + bool free, hole; + unsigned int rec_idx, hole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's an inobt record; free and hole status must match. */ + rec_idx = agino - irec.ir_startino; + free = irec.ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec.ir_holemask & (1U << hole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* finobt should never have a record for which the inobt does not */ + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; } -/* Cross-reference with the other btrees. */ +/* + * Make sure that each inode of this part of an finobt record has the same + * sparse and free status as the inobt. + */ STATIC void -xchk_iallocbt_chunk_xref( +xchk_finobt_chunk_xref_inobt( struct xfs_scrub *sc, - struct xfs_inobt_rec_incore *irec, + struct xfs_inobt_rec_incore *frec, xfs_agino_t agino, - xfs_agblock_t agbno, - xfs_extlen_t len) + unsigned int nr_inodes) { - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + xfs_agino_t i; + unsigned int rec_idx; + int error; + + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT); + + if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm)) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_iallocbt_chunk_xref_other(sc, irec, agino); - xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); - xchk_xref_is_not_shared(sc, agbno, len); + for (i = agino, rec_idx = agino - frec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool ffree, fhole; + unsigned int hole_idx; + + ffree = frec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec->ir_holemask & (1U << hole_idx); + + error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + } } -/* Is this chunk worth checking? */ +/* Is this chunk worth checking and cross-referencing? */ STATIC bool xchk_iallocbt_chunk( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec, xfs_agino_t agino, - xfs_extlen_t len) + unsigned int nr_inodes) { + struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_extlen_t len; - bno = XFS_AGINO_TO_AGBNO(mp, agino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize); - if (!xfs_verify_agbext(pag, bno, len)) + if (!xfs_verify_agbext(pag, agbno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; + xchk_xref_is_used_space(sc, agbno, len); + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes); + else + xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes); + xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); return true; } -/* Count the number of free inodes. */ -static unsigned int -xchk_iallocbt_freecount( - xfs_inofree_t freemask) -{ - BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64)); - return hweight64(freemask); -} - /* * Check that an inode's allocation status matches ir_free in the inobt * record. First we try querying the in-core inode state, and if the inode @@ -272,7 +428,7 @@ xchk_iallocbt_check_cluster( return 0; } - xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, + xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ @@ -420,36 +576,22 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agino_t agino; - xfs_extlen_t len; int holecount; int i; int error = 0; - unsigned int real_freecount; uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - - if (irec.ir_count > XFS_INODES_PER_CHUNK || - irec.ir_freecount > XFS_INODES_PER_CHUNK) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - - real_freecount = irec.ir_freecount + - (XFS_INODES_PER_CHUNK - irec.ir_count); - if (real_freecount != xchk_iallocbt_freecount(irec.ir_free)) + if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } agino = irec.ir_startino; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(pag, agino) || - !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - goto out; - } xchk_iallocbt_rec_alignment(bs, &irec); if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -459,12 +601,11 @@ xchk_iallocbt_rec( /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); if (irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) + if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_CHUNK)) goto out; goto check_clusters; } @@ -472,8 +613,6 @@ xchk_iallocbt_rec( /* Check each chunk of a sparse inode cluster. */ holemask = irec.ir_holemask; holecount = 0; - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); holes = ~xfs_inobt_irec_to_allocmask(&irec); if ((holes & irec.ir_free) != holes || irec.ir_freecount > irec.ir_count) @@ -482,8 +621,9 @@ xchk_iallocbt_rec( for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { if (holemask & 1) holecount += XFS_INODES_PER_HOLEMASK_BIT; - else if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) - break; + else if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_HOLEMASK_BIT)) + goto out; holemask >>= 1; agino += XFS_INODES_PER_HOLEMASK_BIT; } @@ -493,6 +633,9 @@ xchk_iallocbt_rec( xchk_btree_set_corrupt(bs->sc, bs->cur, 0); check_clusters: + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -622,18 +765,18 @@ xchk_xref_inode_check( xfs_agblock_t agbno, xfs_extlen_t len, struct xfs_btree_cur **icur, - bool should_have_inodes) + enum xbtree_recpacking expected) { - bool has_inodes; + enum xbtree_recpacking outcome; int error; if (!(*icur) || xchk_skip_xref(sc->sm)) return; - error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, icur)) return; - if (has_inodes != should_have_inodes) + if (outcome != expected) xchk_btree_xref_set_corrupt(sc, *icur, 0); } @@ -644,8 +787,10 @@ xchk_xref_is_not_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false); - xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_EMPTY); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, + XBTREE_RECPACKING_EMPTY); } /* xref check that the extent is covered by inodes */ @@ -655,5 +800,6 @@ xchk_xref_is_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_FULL); } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 7a2f38e5202c..3e1e02e340a6 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -11,8 +11,11 @@ #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_ialloc.h" +#include "xfs_icache.h" #include "xfs_da_format.h" #include "xfs_reflink.h" #include "xfs_rmap.h" @@ -20,45 +23,176 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/trace.h" + +/* Prepare the attached inode for scrubbing. */ +static inline int +xchk_prepare_iscrub( + struct xfs_scrub *sc) +{ + int error; + + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + return 0; +} + +/* Install this scrub-by-handle inode and prepare it for scrubbing. */ +static inline int +xchk_install_handle_iscrub( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + int error; + + error = xchk_install_handle_inode(sc, ip); + if (error) + return error; + + return xchk_prepare_iscrub(sc); +} /* - * Grab total control of the inode metadata. It doesn't matter here if - * the file data is still changing; exclusive access to the metadata is - * the goal. + * Grab total control of the inode metadata. In the best case, we grab the + * incore inode and take all locks on it. If the incore inode cannot be + * constructed due to corruption problems, lock the AGI so that we can single + * step the loading process to fix everything that can go wrong. */ int xchk_setup_inode( struct xfs_scrub *sc) { + struct xfs_imap imap; + struct xfs_inode *ip; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + /* We want to scan the opened inode, so lock it and exit. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return xchk_prepare_iscrub(sc); + } + + /* Reject internal metadata files and obviously bad inode numbers. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_iscrub(sc, ip); + if (error == -ENOENT) + return error; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_error; + /* - * Try to get the inode. If the verifiers fail, we try again - * in raw mode. + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is + * a record, and it says this inode is free. + * + * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but + * some other metadata corruption (e.g. inode forks) prevented + * instantiation of the incore inode. Or it could mean the inobt is + * corrupt. + * + * We want to look up this inode in the inobt directly to distinguish + * three different scenarios: (1) the inobt says the inode is free, + * in which case there's nothing to do; (2) the inobt is corrupt so we + * should flag the corruption and exit to userspace to let it fix the + * inobt; and (3) the inobt says the inode is allocated, but loading it + * failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity in + * this AG. Retry the iget in case someone allocated a new inode after + * the first iget failed. */ - error = xchk_get_inode(sc); - switch (error) { - case 0: - break; - case -EFSCORRUPTED: - case -EFSBADCRC: - return xchk_trans_alloc(sc, 0); - default: - return error; + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the incore inode, so install it and proceed. */ + xchk_trans_cancel(sc); + return xchk_install_handle_iscrub(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; } - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); - error = xchk_trans_alloc(sc, 0); + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt doesn't think this is an allocated inode then we'll + * return ENOENT to signal that the check can be skipped. + * + * If the lookup signals corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters a runtime error, exit to userspace. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; if (error) - goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + goto out_cancel; -out: - /* scrub teardown will unlock and release the inode for us */ + /* + * The lookup succeeded. Chances are the ondisk inode is corrupt and + * preventing iget from reading it. Retain the scrub transaction and + * the AGI buffer to prevent anyone from allocating or freeing inodes. + * This ensures that we preserve the inconsistency between the inobt + * saying the inode is allocated and the icache being unable to load + * the inode until we can flag the corruption in xchk_inode. The + * scrub function has to note the corruption, since we're not really + * supposed to do that from the setup function. + */ + return 0; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; } /* Inode core */ @@ -553,8 +687,9 @@ xchk_inode_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); out_free: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index d8dff3fd8053..58d5dfb7ea21 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -16,6 +16,7 @@ #include "xfs_dir2_priv.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/readdir.h" /* Set us up to scrub parents. */ int @@ -30,122 +31,93 @@ xchk_setup_parent( /* Look for an entry in a parent pointing to this inode. */ struct xchk_parent_ctx { - struct dir_context dc; struct xfs_scrub *sc; - xfs_ino_t ino; xfs_nlink_t nlink; - bool cancelled; }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC bool +STATIC int xchk_parent_actor( - struct dir_context *dc, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xchk_parent_ctx *spc; + struct xchk_parent_ctx *spc = priv; int error = 0; - spc = container_of(dc, struct xchk_parent_ctx, dc); - if (spc->ino == ino) + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name->name, name->len)) + error = -EFSCORRUPTED; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + if (sc->ip->i_ino == ino) spc->nlink++; - /* - * If we're facing a fatal signal, bail out. Store the cancellation - * status separately because the VFS readdir code squashes error codes - * into short directory reads. - */ if (xchk_should_terminate(spc->sc, &error)) - spc->cancelled = true; + return error; - return !error; + return 0; } -/* Count the number of dentries in the parent dir that point to this inode. */ -STATIC int -xchk_parent_count_parent_dentries( - struct xfs_scrub *sc, - struct xfs_inode *parent, - xfs_nlink_t *nlink) +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_ilock_dir( + struct xfs_inode *dp) { - struct xchk_parent_ctx spc = { - .dc.actor = xchk_parent_actor, - .ino = sc->ip->i_ino, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - uint lock_mode; - int error = 0; + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) + return 0; - /* - * If there are any blocks, read-ahead block 0 as we're almost - * certain to have the next operation be a read there. This is - * how we guarantee that the parent's extent map has been loaded, - * if there is one. - */ - lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_df.if_nextents > 0) - error = xfs_dir3_data_readahead(parent, 0, 0); - xfs_iunlock(parent, lock_mode); - if (error) - return error; + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_ILOCK_SHARED; - /* - * Iterate the parent dir to confirm that there is - * exactly one entry pointing back to the inode being - * scanned. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - parent->i_disk_size); - oldpos = 0; - while (true) { - error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); - if (error) - goto out; - if (spc.cancelled) { - error = -EAGAIN; - goto out; - } - if (oldpos == spc.dc.pos) - break; - oldpos = spc.dc.pos; - } - *nlink = spc.nlink; -out: - return error; + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) + return 0; + + return XFS_ILOCK_EXCL; } /* - * Given the inode number of the alleged parent of the inode being - * scrubbed, try to validate that the parent has exactly one directory - * entry pointing back to the inode being scrubbed. + * Given the inode number of the alleged parent of the inode being scrubbed, + * try to validate that the parent has exactly one directory entry pointing + * back to the inode being scrubbed. Returns -EAGAIN if we need to revalidate + * the dotdot entry. */ STATIC int xchk_parent_validate( struct xfs_scrub *sc, - xfs_ino_t dnum, - bool *try_again) + xfs_ino_t parent_ino) { + struct xchk_parent_ctx spc = { + .sc = sc, + .nlink = 0, + }; struct xfs_mount *mp = sc->mp; struct xfs_inode *dp = NULL; xfs_nlink_t expected_nlink; - xfs_nlink_t nlink; + unsigned int lock_mode; int error = 0; - *try_again = false; - - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } /* '..' must not point to ourselves. */ - if (sc->ip->i_ino == dnum) { + if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* @@ -155,106 +127,51 @@ xchk_parent_validate( expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; /* - * Grab this parent inode. We release the inode before we - * cancel the scrub transaction. Since we're don't know a - * priori that releasing the inode won't trigger eofblocks - * cleanup (which allocates what would be a nested transaction) - * if the parent pointer erroneously points to a file, we - * can't use DONTCACHE here because DONTCACHE inodes can trigger - * immediate inactive cleanup of the inode. + * Grab the parent directory inode. This must be released before we + * cancel the scrub transaction. * * If _iget returns -EINVAL or -ENOENT then the parent inode number is * garbage and the directory is corrupt. If the _iget returns * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a * cross referencing error. Any other error is an operational error. */ - error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp); + error = xchk_iget(sc, parent_ino, &dp); if (error == -EINVAL || error == -ENOENT) { error = -EFSCORRUPTED; xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); - goto out; + return error; } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } - /* - * We prefer to keep the inode locked while we lock and search - * its alleged parent for a forward reference. If we can grab - * the iolock, validate the pointers and we're done. We must - * use nowait here to avoid an ABBA deadlock on the parent and - * the child inodes. - */ - if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); - if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out_unlock; - if (nlink != expected_nlink) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out_unlock; - } - - /* - * The game changes if we get here. We failed to lock the parent, - * so we're going to try to verify both pointers while only holding - * one lock so as to avoid deadlocking with something that's actually - * trying to traverse down the directory tree. - */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; - error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED); - if (error) + lock_mode = xchk_parent_ilock_dir(dp); + if (!lock_mode) { + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + error = -EAGAIN; goto out_rele; + } - /* Go looking for our dentry. */ - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); + /* Look for a directory entry in the parent pointing to the child. */ + error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; - /* Drop the parent lock, relock this inode. */ - xfs_iunlock(dp, XFS_IOLOCK_SHARED); - error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); - if (error) - goto out_rele; - sc->ilock_flags = XFS_IOLOCK_EXCL; - - /* - * If we're an unlinked directory, the parent /won't/ have a link - * to us. Otherwise, it should have one link. We have to re-set - * it here because we dropped the lock on sc->ip. - */ - expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; - - /* Look up '..' to see if the inode changed. */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out_rele; - - /* Drat, parent changed. Try again! */ - if (dnum != dp->i_ino) { - xfs_irele(dp); - *try_again = true; - return 0; - } - xfs_irele(dp); - /* - * '..' didn't change, so check that there was only one entry - * for us in the parent. + * Ensure that the parent has as many links to the child as the child + * thinks it has to the parent. */ - if (nlink != expected_nlink) + if (spc.nlink != expected_nlink) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - return error; out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); + xfs_iunlock(dp, lock_mode); out_rele: - xfs_irele(dp); -out: + xchk_irele(sc, dp); return error; } @@ -264,9 +181,7 @@ xchk_parent( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - xfs_ino_t dnum; - bool try_again; - int tries = 0; + xfs_ino_t parent_ino; int error = 0; /* @@ -279,56 +194,29 @@ xchk_parent( /* We're not a special inode, are we? */ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } - /* - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to do directory lookups. - */ - sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - - /* Look up '..' */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; - if (!xfs_verify_dir_ino(mp, dnum)) { - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + do { + if (xchk_should_terminate(sc, &error)) + break; - /* Is this the root dir? Then '..' must point to itself. */ - if (sc->ip == mp->m_rootip) { - if (sc->ip->i_ino != mp->m_sb.sb_rootino || - sc->ip->i_ino != dnum) + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(mp, parent_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + return 0; + } - do { - error = xchk_parent_validate(sc, dnum, &try_again); - if (error) - goto out; - } while (try_again && ++tries < 20); + /* + * Check that the dotdot entry points to a parent directory + * containing a dirent pointing to this subdirectory. + */ + error = xchk_parent_validate(sc, parent_ino); + } while (error == -EAGAIN); - /* - * We gave it our best shot but failed, so mark this scrub - * incomplete. Userspace can decide if it wants to try again. - */ - if (try_again && tries == 20) - xchk_set_incomplete(sc); -out: - /* - * If we failed to lock the parent inode even after a retry, just mark - * this scrub incomplete and return. - */ - if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - error = 0; - xchk_set_incomplete(sc); - } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 9eeac8565394..e6caa358cbda 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -53,6 +53,9 @@ xchk_setup_quota( if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c new file mode 100644 index 000000000000..e51c1544be63 --- /dev/null +++ b/fs/xfs/scrub/readdir.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/readdir.h" + +/* Call a function for every entry in a shortform directory. */ +STATIC int +xchk_dir_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_name name = { + .name = ".", + .len = 1, + .type = XFS_DIR3_FT_DIR, + }; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_hdr *sfp; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + unsigned int i; + int error; + + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + + /* dot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset); + + error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv); + if (error) + return error; + + /* dotdot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); + ino = xfs_dir2_sf_get_parent_ino(sfp); + name.name = ".."; + name.len = 2; + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + /* iterate everything else */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + name.name = sfep->name; + name.len = sfep->namelen; + name.type = xfs_dir2_sf_get_ftype(mp, sfep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); + } + + return 0; +} + +/* Call a function for every entry in a block directory. */ +STATIC int +xchk_dir_walk_block( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp; + unsigned int off, next_off, end; + int error; + + error = xfs_dir3_block_read(sc->tp, dp, &bp); + if (error) + return error; + + /* Walk each directory entry. */ + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + for (off = geo->data_entry_offset; off < end; off = next_off) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup = bp->b_addr + off; + struct xfs_dir2_data_entry *dep = bp->b_addr + off; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + + /* Skip an empty entry. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + next_off = off + be16_to_cpu(dup->length); + continue; + } + + /* Otherwise, find the next entry and report it. */ + next_off = off + xfs_dir2_data_entsize(mp, dep->namelen); + if (next_off > end) + break; + + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off); + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + } + + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Read a leaf-format directory buffer. */ +STATIC int +xchk_read_leaf_dir_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_da_geometry *geo, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec map; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_dablk_t last_da; + xfs_dablk_t map_off; + xfs_dir2_off_t new_off; + + *bpp = NULL; + + /* + * Look for mapped directory blocks at or above the current offset. + * Truncate down to the nearest directory block to start the scanning + * operation. + */ + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff)); + + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) + return 0; + if (map.br_startoff >= last_da) + return 0; + xfs_trim_extent(&map, map_off, last_da - map_off); + + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *curoff) + *curoff = new_off; + + return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); +} + +/* Call a function for every entry in a leaf directory. */ +STATIC int +xchk_dir_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp = NULL; + xfs_dir2_off_t curoff = 0; + unsigned int offset = 0; + int error; + + /* Iterate every directory offset in this directory. */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_entry *dep; + xfs_ino_t ino; + unsigned int length; + xfs_dir2_dataptr_t dapos; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || offset >= geo->blksize) { + if (bp) { + xfs_trans_brelse(sc->tp, bp); + bp = NULL; + } + + error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff, + &bp); + if (error || !bp) + break; + + /* + * Find our position in the block. + */ + offset = geo->data_entry_offset; + curoff += geo->data_entry_offset; + } + + /* Skip an empty entry. */ + dup = bp->b_addr + offset; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + offset += length; + curoff += length; + continue; + } + + /* Otherwise, find the next entry and report it. */ + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + + dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + + /* Advance to the next entry. */ + offset += length; + curoff += length; + } + + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* + * Call a function for every entry in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. + */ +int +xchk_dir_walk( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + }; + bool isblock; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) + return xchk_dir_walk_block(sc, dp, dirent_fn, priv); + + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); +} + +/* + * Look up the inode number for an exact name in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. Names are not + * checked for correctness. + */ +int +xchk_dir_lookup( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *ino) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + .name = name->name, + .namelen = name->len, + .filetype = name->type, + .hashval = xfs_dir2_hashname(dp->i_mount, name), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + bool isblock, isleaf; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_lookup(&args); + goto out_check_rval; + } + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) { + error = xfs_dir2_block_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_isleaf(&args, &isleaf); + if (error) + return error; + + if (isleaf) { + error = xfs_dir2_leaf_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_node_lookup(&args); + +out_check_rval: + if (error == -EEXIST) + error = 0; + if (!error) + *ino = args.inumber; + return error; +} diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h new file mode 100644 index 000000000000..55787f4df123 --- /dev/null +++ b/fs/xfs/scrub/readdir.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_READDIR_H__ +#define __XFS_SCRUB_READDIR_H__ + +typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, const struct xfs_name *name, + xfs_ino_t ino, void *priv); + +int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, void *priv); + +int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, + const struct xfs_name *name, xfs_ino_t *ino); + +#endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index d9c1b3cea4a5..304ea1e1bfb0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -1,21 +1,22 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_ag.h" +#include "scrub/trace.h" /* * Set us up to scrub reference count btrees. @@ -24,6 +25,8 @@ int xchk_setup_ag_refcountbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, false); } @@ -300,8 +303,10 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (irec->rc_refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) { + trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen); xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + } out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { @@ -325,6 +330,107 @@ xchk_refcountbt_xref( xchk_refcountbt_xref_rmap(sc, irec); } +struct xchk_refcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next AG block where we aren't expecting shared extents. */ + xfs_agblock_t next_unshared_agbno; + + /* Number of CoW blocks we expect. */ + xfs_agblock_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +STATIC int +xchk_refcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_agblock_t *next_bno = priv; + + if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. + */ +static inline void +xchk_refcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_refcbt_records *rrc, + xfs_agblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_agblock_t next_bno = NULLAGBLOCK; + int error; + + if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_agbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + xchk_refcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur); +} + +static inline bool +xchk_refcount_mergeable( + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount > 0) + return false; + + if (r1->rc_domain != r2->rc_domain) + return false; + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + MAXREFCEXTLEN) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_refcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_refcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + /* Scrub a refcountbt record. */ STATIC int xchk_refcountbt_rec( @@ -332,27 +438,37 @@ xchk_refcountbt_rec( const union xfs_btree_rec *rec) { struct xfs_refcount_irec irec; - xfs_agblock_t *cow_blocks = bs->private; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - - /* Check the domain and refcount are not incompatible. */ - if (!xfs_refcount_check_domain(&irec)) + if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } if (irec.rc_domain == XFS_REFC_DOMAIN_COW) - (*cow_blocks) += irec.rc_blockcount; - - /* Check the extent. */ - if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->cow_blocks += irec.rc_blockcount; - if (irec.rc_refcount == 0) + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + xchk_refcountbt_check_mergeable(bs, rrc, &irec); xchk_refcountbt_xref(bs->sc, &irec); + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_agbno = irec.rc_startblock + + irec.rc_blockcount; + } + return 0; } @@ -394,15 +510,25 @@ int xchk_refcountbt( struct xfs_scrub *sc) { - xfs_agblock_t cow_blocks = 0; + struct xchk_refcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_agbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; int error; error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, - &XFS_RMAP_OINFO_REFC, &cow_blocks); + &XFS_RMAP_OINFO_REFC, &rrc); if (error) return error; - xchk_refcount_xref_rmap(sc, cow_blocks); + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the AG have at most one reverse mapping. + */ + xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks); + + xchk_refcount_xref_rmap(sc, rrc.cow_blocks); return 0; } @@ -458,16 +584,37 @@ xchk_xref_is_not_shared( xfs_agblock_t agbno, xfs_extlen_t len) { - bool shared; + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_cow_staging( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; int error; if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, - agbno, len, &shared); + error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; - if (shared) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b71174ec0d6..ac6d8803e660 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -60,6 +60,9 @@ xrep_attempt( sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; + case -ECHRNG: + sc->flags |= XCHK_NEED_DRAIN; + return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { @@ -442,6 +445,30 @@ xrep_init_btblock( * buffers associated with @bitmap. */ +static int +xrep_invalidate_block( + uint64_t fsbno, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_buf *bp; + int error; + + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return 0; + + error = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + return 0; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + return 0; +} + /* * Invalidate buffers for per-AG btree blocks we're dumping. This function * is not intended for use with file data repairs; we have bunmapi for that. @@ -451,11 +478,6 @@ xrep_invalidate_blocks( struct xfs_scrub *sc, struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - /* * For each block in each extent, see if there's an incore buffer for * exactly that block; if so, invalidate it. The buffer cache only @@ -464,23 +486,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - continue; - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - continue; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } - - return 0; + return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); } /* Ensure the freelist is the correct size. */ @@ -501,6 +507,15 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } +/* Information about reaping extents after a repair. */ +struct xrep_reap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; +}; + /* * Put a block back on the AGFL. */ @@ -545,17 +560,23 @@ xrep_put_freelist( /* Dispose of a single block. */ STATIC int xrep_reap_block( - struct xfs_scrub *sc, - xfs_fsblock_t fsbno, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type resv) + uint64_t fsbno, + void *priv) { + struct xrep_reap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; xfs_agblock_t agbno; bool has_other_rmap; int error; + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); + trace_xrep_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); @@ -574,7 +595,8 @@ xrep_reap_block( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + &has_other_rmap); xfs_btree_del_cursor(cur, error); if (error) goto out_free; @@ -594,11 +616,12 @@ xrep_reap_block( */ if (has_other_rmap) error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, oinfo); - else if (resv == XFS_AG_RESV_AGFL) + 1, rs->oinfo); + else if (rs->resv == XFS_AG_RESV_AGFL) error = xrep_put_freelist(sc, agbno); else - error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, + rs->resv); if (agf_bp != sc->sa.agf_bp) xfs_trans_brelse(sc->tp, agf_bp); if (error) @@ -622,26 +645,15 @@ xrep_reap_extents( const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - xfs_fsblock_t fsbno; - int error = 0; + struct xrep_reap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; ASSERT(xfs_has_rmapbt(sc->mp)); - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - error = xrep_reap_block(sc, fsbno, oinfo, type); - if (error) - break; - } - - return error; + return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 840f74ec431c..dce791c679ee 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_REPAIR_H__ #define __XFS_SCRUB_REPAIR_H__ @@ -31,6 +31,7 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, const struct xfs_buf_ops *ops); struct xbitmap; +struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 229826b2e1c0..d29a26ecddd6 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -1,21 +1,29 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" +#include "xfs_ag.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/bitmap.h" /* * Set us up to scrub reverse mapping btrees. @@ -24,11 +32,39 @@ int xchk_setup_ag_rmapbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Reverse-mapping scrubber. */ +struct xchk_rmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; + + /* Bitmaps containing all blocks for each type of AG metadata. */ + struct xagb_bitmap fs_owned; + struct xagb_bitmap log_owned; + struct xagb_bitmap ag_owned; + struct xagb_bitmap inobt_owned; + struct xagb_bitmap refcbt_owned; + + /* Did we complete the AG space metadata bitmaps? */ + bool bitmaps_complete; +}; + /* Cross-reference a rmap against the refcount btree. */ STATIC void xchk_rmapbt_xref_refc( @@ -84,80 +120,415 @@ xchk_rmapbt_xref( xchk_rmapbt_xref_refc(sc, irec); } -/* Scrub an rmapbt record. */ -STATIC int -xchk_rmapbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) +/* + * Check for bogus UNWRITTEN flags in the rmapbt node block keys. + * + * In reverse mapping records, the file mapping extent state + * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not + * involved in lookups in any way. In older kernels, the functions that + * convert rmapbt records to keys forgot to filter out the extent state bit, + * even though the key comparison functions have filtered the flag correctly. + * If we spot an rmap key with the unwritten bit set in rm_offset, we should + * mark the btree as needing optimization to rebuild the btree without those + * flags. + */ +STATIC void +xchk_rmapbt_check_unwritten_in_keyflags( + struct xchk_btree *bs) { - struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_rmap_irec irec; - struct xfs_perag *pag = bs->cur->bc_ag.pag; - bool non_inode; - bool is_unwritten; - bool is_bmbt; - bool is_attr; - int error; + struct xfs_scrub *sc = bs->sc; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *keyblock; + union xfs_btree_key *lkey, *hkey; + __be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); + unsigned int level; - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error)) - goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + return; + + for (level = 1; level < cur->bc_nlevels; level++) { + struct xfs_buf *bp; + unsigned int ptr; + + /* Only check the first time we've seen this node block. */ + if (cur->bc_levels[level].ptr > 1) + continue; + + keyblock = xfs_btree_get_block(cur, level, &bp); + for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) { + lkey = xfs_btree_key_addr(cur, ptr, keyblock); + + if (lkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + + hkey = xfs_btree_high_key_addr(cur, ptr, keyblock); + if (hkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + } + } +} + +static inline bool +xchk_rmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_reflink(sc->mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return false; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK | + XFS_RMAP_UNWRITTEN)) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_agblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; - /* Check extent. */ - if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock) + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rmapbt_is_shareable(bs->sc, irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (irec.rm_owner == XFS_RMAP_OWN_FS) { + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rmap_mergeable( + struct xchk_rmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner)) + return true; + /* must be an inode owner below here */ + if (r1->rm_flags != r2->rm_flags) + return false; + if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK) + return true; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Compare an rmap for AG metadata against the metadata walk. */ +STATIC int +xchk_rmapbt_mark_bitmap( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + struct xfs_scrub *sc = bs->sc; + struct xagb_bitmap *bmp = NULL; + xfs_extlen_t fsbcount = irec->rm_blockcount; + + /* + * Skip corrupt records. It is essential that we detect records in the + * btree that cannot overlap but do, flag those as CORRUPT, and skip + * the bitmap comparison to avoid generating false XCORRUPT reports. + */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * If the AG metadata walk didn't complete, there's no point in + * comparing against partial results. + */ + if (!cr->bitmaps_complete) + return 0; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + bmp = &cr->fs_owned; + break; + case XFS_RMAP_OWN_LOG: + bmp = &cr->log_owned; + break; + case XFS_RMAP_OWN_AG: + bmp = &cr->ag_owned; + break; + case XFS_RMAP_OWN_INOBT: + bmp = &cr->inobt_owned; + break; + case XFS_RMAP_OWN_REFC: + bmp = &cr->refcbt_owned; + break; + } + + if (!bmp) + return 0; + + if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) { /* - * xfs_verify_agbno returns false for static fs metadata. - * Since that only exists at the start of the AG, validate - * that by hand. + * The start of this reverse mapping corresponds to a set + * region in the bitmap. If the mapping covers more area than + * the set region, then it covers space that wasn't found by + * the AG metadata walk. */ - if (irec.rm_startblock != 0 || - irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + if (fsbcount < irec->rm_blockcount) + xchk_btree_xref_set_corrupt(bs->sc, + bs->sc->sa.rmap_cur, 0); } else { /* - * Otherwise we must point somewhere past the static metadata - * but before the end of the FS. Run the regular check. + * The start of this reverse mapping does not correspond to a + * completely set region in the bitmap. The region wasn't + * fully set by walking the AG metadata, so this is a + * cross-referencing corruption. */ - if (!xfs_verify_agbno(pag, irec.rm_startblock) || - !xfs_verify_agbno(pag, irec.rm_startblock + - irec.rm_blockcount - 1)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0); } - /* Check flags. */ - non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner); - is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK; - is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK; - is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN; + /* Unset the region so that we can detect missing rmap records. */ + return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount); +} - if (is_bmbt && irec.rm_offset != 0) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Scrub an rmapbt record. */ +STATIC int +xchk_rmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rmap *cr = bs->private; + struct xfs_rmap_irec irec; - if (non_inode && irec.rm_offset != 0) + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rmap_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - if (is_unwritten && (is_bmbt || non_inode || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_rmapbt_check_unwritten_in_keyflags(bs); + xchk_rmapbt_check_mergeable(bs, cr, &irec); + xchk_rmapbt_check_overlapping(bs, cr, &irec); + xchk_rmapbt_xref(bs->sc, &irec); - if (non_inode && (is_bmbt || is_unwritten || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return xchk_rmapbt_mark_bitmap(bs, cr, &irec); +} - if (!non_inode) { - if (!xfs_verify_ino(mp, irec.rm_owner)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - } else { - /* Non-inode owner within the magic values? */ - if (irec.rm_owner <= XFS_RMAP_OWN_MIN || - irec.rm_owner > XFS_RMAP_OWN_FS) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Add an AGFL block to the rmap list. */ +STATIC int +xchk_rmapbt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* + * Set up bitmaps mapping all the AG metadata to compare with the rmapbt + * records. + * + * Grab our own btree cursors here if the scrub setup function didn't give us a + * btree cursor due to reports of poor health. We need to find out if the + * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being + * XCORRUPT. + */ +STATIC int +xchk_rmapbt_walk_ag_metadata( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *cur; + int error; + + /* OWN_FS: AG headers */ + error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp), + XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1); + if (error) + goto out; + + /* OWN_LOG: Internal log */ + if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + error = xagb_bitmap_set(&cr->log_owned, + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), + mp->m_sb.sb_logblocks); + if (error) + goto out; + } + + /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ + cur = sc->sa.bno_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.bno_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + cur = sc->sa.cnt_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.cnt_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur); + if (error) + goto out; + + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto out; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl, + &cr->ag_owned); + xfs_trans_brelse(sc->tp, agfl_bp); + if (error) + goto out; + + /* OWN_INOBT: inobt, finobt */ + cur = sc->sa.ino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, + XFS_BTNUM_INO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.ino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + cur = sc->sa.fino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp, XFS_BTNUM_FINO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.fino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + } + + /* OWN_REFC: refcountbt */ + if (xfs_has_reflink(sc->mp)) { + cur = sc->sa.refc_cur; + if (!cur) + cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp, + sc->sa.agf_bp, sc->sa.pag); + error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur); + if (cur != sc->sa.refc_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; } - xchk_rmapbt_xref(bs->sc, &irec); out: - return error; + /* + * If there's an error, set XFAIL and disable the bitmap + * cross-referencing checks, but proceed with the scrub anyway. + */ + if (error) + xchk_btree_xref_process_error(sc, sc->sa.rmap_cur, + sc->sa.rmap_cur->bc_nlevels - 1, &error); + else + cr->bitmaps_complete = true; + return 0; +} + +/* + * Check for set regions in the bitmaps; if there are any, the rmap records do + * not describe all the AG metadata. + */ +STATIC void +xchk_rmapbt_check_bitmaps( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_btree_cur *cur = sc->sa.rmap_cur; + unsigned int level; + + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XFAIL)) + return; + if (!cur) + return; + level = cur->bc_nlevels - 1; + + /* + * Any bitmap with bits still set indicates that the reverse mapping + * doesn't cover the entire primary structure. + */ + if (xagb_bitmap_hweight(&cr->fs_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->log_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->ag_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->inobt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); } /* Scrub the rmap btree for some AG. */ @@ -165,42 +536,63 @@ int xchk_rmapbt( struct xfs_scrub *sc) { - return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, - &XFS_RMAP_OINFO_AG, NULL); + struct xchk_rmap *cr; + int error; + + cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS); + if (!cr) + return -ENOMEM; + + xagb_bitmap_init(&cr->fs_owned); + xagb_bitmap_init(&cr->log_owned); + xagb_bitmap_init(&cr->ag_owned); + xagb_bitmap_init(&cr->inobt_owned); + xagb_bitmap_init(&cr->refcbt_owned); + + error = xchk_rmapbt_walk_ag_metadata(sc, cr); + if (error) + goto out; + + error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, + &XFS_RMAP_OINFO_AG, cr); + if (error) + goto out; + + xchk_rmapbt_check_bitmaps(sc, cr); + +out: + xagb_bitmap_destroy(&cr->refcbt_owned); + xagb_bitmap_destroy(&cr->inobt_owned); + xagb_bitmap_destroy(&cr->ag_owned); + xagb_bitmap_destroy(&cr->log_owned); + xagb_bitmap_destroy(&cr->fs_owned); + kfree(cr); + return error; } -/* xref check that the extent is owned by a given owner */ -static inline void -xchk_xref_check_owner( +/* xref check that the extent is owned only by a given owner */ +void +xchk_xref_is_only_owned_by( struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len, - const struct xfs_owner_info *oinfo, - bool should_have_rmap) + const struct xfs_owner_info *oinfo) { - bool has_rmap; + struct xfs_rmap_matches res; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, - &has_rmap); + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap != should_have_rmap) + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.non_owner_matches) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); -} - -/* xref check that the extent is owned by a given owner */ -void -xchk_xref_is_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - const struct xfs_owner_info *oinfo) -{ - xchk_xref_check_owner(sc, bno, len, oinfo, true); } /* xref check that the extent is not owned by a given owner */ @@ -211,7 +603,19 @@ xchk_xref_is_not_owned_by( xfs_extlen_t len, const struct xfs_owner_info *oinfo) { - xchk_xref_check_owner(sc, bno, len, oinfo, false); + struct xfs_rmap_matches res; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (res.matches != 0) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } /* xref check that the extent has no reverse mapping at all */ @@ -221,15 +625,15 @@ xchk_xref_has_no_owner( xfs_agblock_t bno, xfs_extlen_t len) { - bool has_rmap; + enum xbtree_recpacking outcome; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); + error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 0a3bde64c675..e7dace7b4be8 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 07a7a75f987f..02819bedc5b1 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -145,6 +145,21 @@ xchk_probe( /* Scrub setup and teardown */ +static inline void +xchk_fsgates_disable( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XCHK_FSGATES_ALL)) + return; + + trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + + if (sc->flags & XCHK_FSGATES_DRAIN) + xfs_drain_wait_disable(); + + sc->flags &= ~XCHK_FSGATES_ALL; +} + /* Free all the resources and finish the transactions. */ STATIC int xchk_teardown( @@ -166,7 +181,7 @@ xchk_teardown( xfs_iunlock(sc->ip, sc->ilock_flags); if (sc->ip != ip_in && !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xfs_irele(sc->ip); + xchk_irele(sc, sc->ip); sc->ip = NULL; } if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) @@ -174,9 +189,14 @@ xchk_teardown( if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->buf) { + if (sc->buf_cleanup) + sc->buf_cleanup(sc->buf); kvfree(sc->buf); + sc->buf_cleanup = NULL; sc->buf = NULL; } + + xchk_fsgates_disable(sc); return error; } @@ -191,25 +211,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_superblock, .repair = xrep_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agf, .repair = xrep_agf, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agfl, .repair = xrep_agfl, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agi, .repair = xrep_agi, }, @@ -491,23 +511,20 @@ retry_op: /* Set up for the operation. */ error = sc->ops->setup(sc); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; if (error) goto out_teardown; /* Scrub for errors. */ error = sc->ops->scrub(sc); - if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - /* - * Scrubbers return -EDEADLOCK to mean 'try harder'. - * Tear down everything we hold, then set up again with - * preparation for worst-case scenarios. - */ - error = xchk_teardown(sc, 0); - if (error) - goto out_sc; - sc->flags |= XCHK_TRY_HARDER; - goto retry_op; - } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; xchk_update_health(sc); @@ -565,4 +582,21 @@ out: error = 0; } return error; +need_drain: + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_NEED_DRAIN; + goto retry_op; +try_harder: + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down + * everything we hold, then set up again with preparation for + * worst-case scenarios. + */ + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; + goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b4d391b4c938..e71903474cd7 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_SCRUB_H__ #define __XFS_SCRUB_SCRUB_H__ @@ -77,7 +77,17 @@ struct xfs_scrub { */ struct xfs_inode *ip; + /* Kernel memory buffer used by scrubbers; freed at teardown. */ void *buf; + + /* + * Clean up resources owned by whatever is in the buffer. Cleanup can + * be deferred with this hook as a means for scrub functions to pass + * data to repair functions. This function must not free the buffer + * itself. + */ + void (*buf_cleanup)(void *buf); + uint ilock_flags; /* See the XCHK/XREP state flags below. */ @@ -96,9 +106,19 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XCHK_REAPING_DISABLED (1 << 1) /* background block reaping paused */ +#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +/* + * The XCHK_FSGATES* flags reflect functionality in the main filesystem that + * are only enabled for this particular online fsck. When not in use, the + * features are gated off via dynamic code patching, which is why the state + * must be enabled during scrub setup and can only be torn down afterwards. + */ +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -152,7 +172,7 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); -void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, +void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); @@ -162,6 +182,8 @@ void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); +void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index c1c99ffe7408..38708fb9a5d7 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index b5f94676c37c..0a975439d2b6 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 93ece6df02e3..68efd6fda61c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. See xfs_trace.h for documentation of @@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -93,6 +96,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } +#define XFS_SCRUB_STATE_STRINGS \ + { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_REAPING_DISABLED, "reaping_disabled" }, \ + { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ + { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_ALREADY_FIXED, "already_fixed" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -139,6 +149,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); +DECLARE_EVENT_CLASS(xchk_fsgate_class, + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags), + TP_ARGS(sc, fsgate_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, fsgate_flags) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->fsgate_flags = fsgate_flags; + ), + TP_printk("dev %d:%d type %s fsgates '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS)) +) + +#define DEFINE_SCRUB_FSHOOK_EVENT(name) \ +DEFINE_EVENT(xchk_fsgate_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \ + TP_ARGS(sc, fsgates_flags)) + +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -657,6 +694,38 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +TRACE_EVENT(xchk_refcount_incorrect, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + xfs_nlink_t seen), + TP_ARGS(pag, irec, seen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_nlink_t, seen) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = irec->rc_domain; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->seen = seen; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->seen) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index 2ceae614ade8..a39befa743ce 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_H__ #define __XFS_SCRUB_H__ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6e2f0013380a..7551c3ec4ea5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_bui_cache; struct kmem_cache *xfs_bud_cache; @@ -363,6 +364,34 @@ xfs_bmap_update_create_done( return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } +/* Take a passive ref to the AG containing the space we're mapping. */ +void +xfs_bmap_update_get_group( + struct xfs_mount *mp, + struct xfs_bmap_intent *bi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); + + /* + * Bump the intent count on behalf of the deferred rmap and refcount + * intent items that that we can queue when we finish this bmap work. + * This new intent item will bump the intent count before the bmap + * intent drops the intent count, ensuring that the intent count + * remains nonzero across the transaction roll. + */ + bi->bi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing mapping work. */ +static inline void +xfs_bmap_update_put_group( + struct xfs_bmap_intent *bi) +{ + xfs_perag_intent_put(bi->bi_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( @@ -381,6 +410,8 @@ xfs_bmap_update_finish_item( ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -393,7 +424,7 @@ xfs_bmap_update_abort_intent( xfs_bui_release(BUI_ITEM(intent)); } -/* Cancel a deferred rmap update. */ +/* Cancel a deferred bmap update. */ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) @@ -401,6 +432,8 @@ xfs_bmap_update_cancel_item( struct xfs_bmap_intent *bi; bi = container_of(item, struct xfs_bmap_intent, bi_list); + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); } @@ -509,10 +542,12 @@ xfs_bui_item_recover( fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_bmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_bmap_update_put_group(&fake); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a09dd2606479..f032d3a4b727 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -314,15 +314,13 @@ xfs_getbmap_report_one( if (isnullstartblock(got->br_startblock) || got->br_startblock == DELAYSTARTBLOCK) { /* - * Delalloc extents that start beyond EOF can occur due to - * speculative EOF allocation when the delalloc extent is larger - * than the largest freespace extent at conversion time. These - * extents cannot be converted by data writeback, so can exist - * here even if we are not supposed to be finding delalloc - * extents. + * Take the flush completion as being a point-in-time snapshot + * where there are no delalloc extents, and if any new ones + * have been created racily, just skip them as being 'after' + * the flush and so don't get reported. */ - if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) - ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) + return 0; p->bmv_oflags |= BMV_OF_DELALLOC; p->bmv_block = -2; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 54c774af6e1c..15d1e5a7c2d3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -286,8 +286,7 @@ xfs_buf_free_pages( if (bp->b_pages[i]) __free_page(bp->b_pages[i]); } - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += bp->b_page_count; + mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index ffa94102094d..43167f543afc 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -943,6 +943,16 @@ xlog_recover_buf_commit_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + + /* + * We're skipping replay of this buffer log item due to the log + * item LSN being behind the ondisk buffer. Verify the buffer + * contents since we aren't going to run the write verifier. + */ + if (bp->b_ops) { + bp->b_ops->verify_read(bp); + error = bp->b_error; + } goto out_release; } diff --git a/fs/xfs/xfs_dahash_test.c b/fs/xfs/xfs_dahash_test.c index 230651ab5ce4..0dab5941e080 100644 --- a/fs/xfs/xfs_dahash_test.c +++ b/fs/xfs/xfs_dahash_test.c @@ -9,6 +9,9 @@ #include "xfs_format.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_dir2_priv.h" #include "xfs_dahash_test.h" /* 4096 random bytes */ @@ -533,108 +536,109 @@ static struct dahash_test { uint16_t start; /* random 12 bit offset in buf */ uint16_t length; /* random 8 bit length of test */ xfs_dahash_t dahash; /* expected dahash result */ + xfs_dahash_t ascii_ci_dahash; /* expected ascii-ci dahash result */ } test[] __initdata = { - {0x0567, 0x0097, 0x96951389}, - {0x0869, 0x0055, 0x6455ab4f}, - {0x0c51, 0x00be, 0x8663afde}, - {0x044a, 0x00fc, 0x98fbe432}, - {0x0f29, 0x0079, 0x42371997}, - {0x08ba, 0x0052, 0x942be4f7}, - {0x01f2, 0x0013, 0x5262687e}, - {0x09e3, 0x00e2, 0x8ffb0908}, - {0x007c, 0x0051, 0xb3158491}, - {0x0854, 0x001f, 0x83bb20d9}, - {0x031b, 0x0008, 0x98970bdf}, - {0x0de7, 0x0027, 0xbfbf6f6c}, - {0x0f76, 0x0005, 0x906a7105}, - {0x092e, 0x00d0, 0x86631850}, - {0x0233, 0x0082, 0xdbdd914e}, - {0x04c9, 0x0075, 0x5a400a9e}, - {0x0b66, 0x0099, 0xae128b45}, - {0x000d, 0x00ed, 0xe61c216a}, - {0x0a31, 0x003d, 0xf69663b9}, - {0x00a3, 0x0052, 0x643c39ae}, - {0x0125, 0x00d5, 0x7c310b0d}, - {0x0105, 0x004a, 0x06a77e74}, - {0x0858, 0x008e, 0x265bc739}, - {0x045e, 0x0095, 0x13d6b192}, - {0x0dab, 0x003c, 0xc4498704}, - {0x00cd, 0x00b5, 0x802a4e2d}, - {0x069b, 0x008c, 0x5df60f71}, - {0x0454, 0x006c, 0x5f03d8bb}, - {0x040e, 0x0032, 0x0ce513b5}, - {0x0874, 0x00e2, 0x6a811fb3}, - {0x0521, 0x00b4, 0x93296833}, - {0x0ddc, 0x00cf, 0xf9305338}, - {0x0a70, 0x0023, 0x239549ea}, - {0x083e, 0x0027, 0x2d88ba97}, - {0x0241, 0x00a7, 0xfe0b32e1}, - {0x0dfc, 0x0096, 0x1a11e815}, - {0x023e, 0x001e, 0xebc9a1f3}, - {0x067e, 0x0066, 0xb1067f81}, - {0x09ea, 0x000e, 0x46fd7247}, - {0x036b, 0x008c, 0x1a39acdf}, - {0x078f, 0x0030, 0x964042ab}, - {0x085c, 0x008f, 0x1829edab}, - {0x02ec, 0x009f, 0x6aefa72d}, - {0x043b, 0x00ce, 0x65642ff5}, - {0x0a32, 0x00b8, 0xbd82759e}, - {0x0d3c, 0x0087, 0xf4d66d54}, - {0x09ec, 0x008a, 0x06bfa1ff}, - {0x0902, 0x0015, 0x755025d2}, - {0x08fe, 0x000e, 0xf690ce2d}, - {0x00fb, 0x00dc, 0xe55f1528}, - {0x0eaa, 0x003a, 0x0fe0a8d7}, - {0x05fb, 0x0006, 0x86281cfb}, - {0x0dd1, 0x00a7, 0x60ab51b4}, - {0x0005, 0x001b, 0xf51d969b}, - {0x077c, 0x00dd, 0xc2fed268}, - {0x0575, 0x00f5, 0x432c0b1a}, - {0x05be, 0x0088, 0x78baa04b}, - {0x0c89, 0x0068, 0xeda9e428}, - {0x0f5c, 0x0068, 0xec143c76}, - {0x06a8, 0x0009, 0xd72651ce}, - {0x060f, 0x008e, 0x765426cd}, - {0x07b1, 0x0047, 0x2cfcfa0c}, - {0x04f1, 0x0041, 0x55b172f9}, - {0x0e05, 0x00ac, 0x61efde93}, - {0x0bf7, 0x0097, 0x05b83eee}, - {0x04e9, 0x00f3, 0x9928223a}, - {0x023a, 0x0005, 0xdfada9bc}, - {0x0acb, 0x000e, 0x2217cecd}, - {0x0148, 0x0060, 0xbc3f7405}, - {0x0764, 0x0059, 0xcbc201b1}, - {0x021f, 0x0059, 0x5d6b2256}, - {0x0f1e, 0x006c, 0xdefeeb45}, - {0x071c, 0x00b9, 0xb9b59309}, - {0x0564, 0x0063, 0xae064271}, - {0x0b14, 0x0044, 0xdb867d9b}, - {0x0e5a, 0x0055, 0xff06b685}, - {0x015e, 0x00ba, 0x1115ccbc}, - {0x0379, 0x00e6, 0x5f4e58dd}, - {0x013b, 0x0067, 0x4897427e}, - {0x0e64, 0x0071, 0x7af2b7a4}, - {0x0a11, 0x0050, 0x92105726}, - {0x0109, 0x0055, 0xd0d000f9}, - {0x00aa, 0x0022, 0x815d229d}, - {0x09ac, 0x004f, 0x02f9d985}, - {0x0e1b, 0x00ce, 0x5cf92ab4}, - {0x08af, 0x00d8, 0x17ca72d1}, - {0x0e33, 0x000a, 0xda2dba6b}, - {0x0ee3, 0x006a, 0xb00048e5}, - {0x0648, 0x001a, 0x2364b8cb}, - {0x0315, 0x0085, 0x0596fd0d}, - {0x0fbb, 0x003e, 0x298230ca}, - {0x0422, 0x006a, 0x78ada4ab}, - {0x04ba, 0x0073, 0xced1fbc2}, - {0x007d, 0x0061, 0x4b7ff236}, - {0x070b, 0x00d0, 0x261cf0ae}, - {0x0c1a, 0x0035, 0x8be92ee2}, - {0x0af8, 0x0063, 0x824dcf03}, - {0x08f8, 0x006d, 0xd289710c}, - {0x021b, 0x00ee, 0x6ac1c41d}, - {0x05b5, 0x00da, 0x8e52f0e2}, + {0x0567, 0x0097, 0x96951389, 0xc153aa0d}, + {0x0869, 0x0055, 0x6455ab4f, 0xd07f69bf}, + {0x0c51, 0x00be, 0x8663afde, 0xf9add90c}, + {0x044a, 0x00fc, 0x98fbe432, 0xbf2abb76}, + {0x0f29, 0x0079, 0x42371997, 0x282588b3}, + {0x08ba, 0x0052, 0x942be4f7, 0x2e023547}, + {0x01f2, 0x0013, 0x5262687e, 0x5266287e}, + {0x09e3, 0x00e2, 0x8ffb0908, 0x1da892f3}, + {0x007c, 0x0051, 0xb3158491, 0xb67f9e63}, + {0x0854, 0x001f, 0x83bb20d9, 0x22bb21db}, + {0x031b, 0x0008, 0x98970bdf, 0x9cd70adf}, + {0x0de7, 0x0027, 0xbfbf6f6c, 0xae3f296c}, + {0x0f76, 0x0005, 0x906a7105, 0x906a7105}, + {0x092e, 0x00d0, 0x86631850, 0xa3f6ac04}, + {0x0233, 0x0082, 0xdbdd914e, 0x5d8c7aac}, + {0x04c9, 0x0075, 0x5a400a9e, 0x12f60711}, + {0x0b66, 0x0099, 0xae128b45, 0x7551310d}, + {0x000d, 0x00ed, 0xe61c216a, 0xc22d3c4c}, + {0x0a31, 0x003d, 0xf69663b9, 0x51960bf8}, + {0x00a3, 0x0052, 0x643c39ae, 0xa93c73a8}, + {0x0125, 0x00d5, 0x7c310b0d, 0xf221cbb3}, + {0x0105, 0x004a, 0x06a77e74, 0xa4ef4561}, + {0x0858, 0x008e, 0x265bc739, 0xd6c36d9b}, + {0x045e, 0x0095, 0x13d6b192, 0x5f5c1d62}, + {0x0dab, 0x003c, 0xc4498704, 0x10414654}, + {0x00cd, 0x00b5, 0x802a4e2d, 0xfbd17c9d}, + {0x069b, 0x008c, 0x5df60f71, 0x91ddca5f}, + {0x0454, 0x006c, 0x5f03d8bb, 0x5c59fce0}, + {0x040e, 0x0032, 0x0ce513b5, 0xa8cd99b1}, + {0x0874, 0x00e2, 0x6a811fb3, 0xca028316}, + {0x0521, 0x00b4, 0x93296833, 0x2c4d4880}, + {0x0ddc, 0x00cf, 0xf9305338, 0x2c94210d}, + {0x0a70, 0x0023, 0x239549ea, 0x22b561aa}, + {0x083e, 0x0027, 0x2d88ba97, 0x5cd8bb9d}, + {0x0241, 0x00a7, 0xfe0b32e1, 0x17b506b8}, + {0x0dfc, 0x0096, 0x1a11e815, 0xee4141bd}, + {0x023e, 0x001e, 0xebc9a1f3, 0x5689a1f3}, + {0x067e, 0x0066, 0xb1067f81, 0xd9952571}, + {0x09ea, 0x000e, 0x46fd7247, 0x42b57245}, + {0x036b, 0x008c, 0x1a39acdf, 0x58bf1586}, + {0x078f, 0x0030, 0x964042ab, 0xb04218b9}, + {0x085c, 0x008f, 0x1829edab, 0x9ceca89c}, + {0x02ec, 0x009f, 0x6aefa72d, 0x634cc2a7}, + {0x043b, 0x00ce, 0x65642ff5, 0x6c8a584e}, + {0x0a32, 0x00b8, 0xbd82759e, 0x0f96a34f}, + {0x0d3c, 0x0087, 0xf4d66d54, 0xb71ba5f4}, + {0x09ec, 0x008a, 0x06bfa1ff, 0x576ca80f}, + {0x0902, 0x0015, 0x755025d2, 0x517225c2}, + {0x08fe, 0x000e, 0xf690ce2d, 0xf690cf3d}, + {0x00fb, 0x00dc, 0xe55f1528, 0x707d7d92}, + {0x0eaa, 0x003a, 0x0fe0a8d7, 0x87638cc5}, + {0x05fb, 0x0006, 0x86281cfb, 0x86281cf9}, + {0x0dd1, 0x00a7, 0x60ab51b4, 0xe28ef00c}, + {0x0005, 0x001b, 0xf51d969b, 0xe71dd6d3}, + {0x077c, 0x00dd, 0xc2fed268, 0xdc30c555}, + {0x0575, 0x00f5, 0x432c0b1a, 0x81dd7d16}, + {0x05be, 0x0088, 0x78baa04b, 0xd69b433e}, + {0x0c89, 0x0068, 0xeda9e428, 0xe9b4fa0a}, + {0x0f5c, 0x0068, 0xec143c76, 0x9947067a}, + {0x06a8, 0x0009, 0xd72651ce, 0xd72651ee}, + {0x060f, 0x008e, 0x765426cd, 0x2099626f}, + {0x07b1, 0x0047, 0x2cfcfa0c, 0x1a4baa07}, + {0x04f1, 0x0041, 0x55b172f9, 0x15331a79}, + {0x0e05, 0x00ac, 0x61efde93, 0x320568cc}, + {0x0bf7, 0x0097, 0x05b83eee, 0xc72fb7a3}, + {0x04e9, 0x00f3, 0x9928223a, 0xe8c77de2}, + {0x023a, 0x0005, 0xdfada9bc, 0xdfadb9be}, + {0x0acb, 0x000e, 0x2217cecd, 0x0017d6cd}, + {0x0148, 0x0060, 0xbc3f7405, 0xf5fd6615}, + {0x0764, 0x0059, 0xcbc201b1, 0xbb089bf4}, + {0x021f, 0x0059, 0x5d6b2256, 0xa16a0a59}, + {0x0f1e, 0x006c, 0xdefeeb45, 0xfc34f9d6}, + {0x071c, 0x00b9, 0xb9b59309, 0xb645eae2}, + {0x0564, 0x0063, 0xae064271, 0x954dc6d1}, + {0x0b14, 0x0044, 0xdb867d9b, 0xdf432309}, + {0x0e5a, 0x0055, 0xff06b685, 0xa65ff257}, + {0x015e, 0x00ba, 0x1115ccbc, 0x11c365f4}, + {0x0379, 0x00e6, 0x5f4e58dd, 0x2d176d31}, + {0x013b, 0x0067, 0x4897427e, 0xc40532fe}, + {0x0e64, 0x0071, 0x7af2b7a4, 0x1fb7bf43}, + {0x0a11, 0x0050, 0x92105726, 0xb1185e51}, + {0x0109, 0x0055, 0xd0d000f9, 0x60a60bfd}, + {0x00aa, 0x0022, 0x815d229d, 0x215d379c}, + {0x09ac, 0x004f, 0x02f9d985, 0x10b90b20}, + {0x0e1b, 0x00ce, 0x5cf92ab4, 0x6a477573}, + {0x08af, 0x00d8, 0x17ca72d1, 0x385af156}, + {0x0e33, 0x000a, 0xda2dba6b, 0xda2dbb69}, + {0x0ee3, 0x006a, 0xb00048e5, 0xa9a2decc}, + {0x0648, 0x001a, 0x2364b8cb, 0x3364b1cb}, + {0x0315, 0x0085, 0x0596fd0d, 0xa651740f}, + {0x0fbb, 0x003e, 0x298230ca, 0x7fc617c7}, + {0x0422, 0x006a, 0x78ada4ab, 0xc576ae2a}, + {0x04ba, 0x0073, 0xced1fbc2, 0xaac8455b}, + {0x007d, 0x0061, 0x4b7ff236, 0x347d5739}, + {0x070b, 0x00d0, 0x261cf0ae, 0xc7fb1c10}, + {0x0c1a, 0x0035, 0x8be92ee2, 0x8be9b4e1}, + {0x0af8, 0x0063, 0x824dcf03, 0x53010388}, + {0x08f8, 0x006d, 0xd289710c, 0x30418edd}, + {0x021b, 0x00ee, 0x6ac1c41d, 0x2557e9a3}, + {0x05b5, 0x00da, 0x8e52f0e2, 0x98531012}, }; int __init @@ -644,12 +648,19 @@ xfs_dahash_test(void) unsigned int errors = 0; for (i = 0; i < ARRAY_SIZE(test); i++) { + struct xfs_name xname = { }; xfs_dahash_t hash; hash = xfs_da_hashname(test_buf + test[i].start, test[i].length); if (hash != test[i].dahash) errors++; + + xname.name = test_buf + test[i].start; + xname.len = test[i].length; + hash = xfs_ascii_ci_hashname(&xname); + if (hash != test[i].ascii_ci_dahash) + errors++; } if (errors) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8fb90da89787..7f071757f278 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert( error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ - WARN_ON(error != -EEXIST); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); return error; diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c new file mode 100644 index 000000000000..005a66be44a2 --- /dev/null +++ b/fs/xfs/xfs_drain.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* + * Use a static key here to reduce the overhead of xfs_drain_rele. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no xfs_drain_wait callers. Online fsck is currently + * the only caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); + +void +xfs_drain_wait_disable(void) +{ + static_branch_dec(&xfs_drain_waiter_gate); +} + +void +xfs_drain_wait_enable(void) +{ + static_branch_inc(&xfs_drain_waiter_gate); +} + +void +xfs_defer_drain_init( + struct xfs_defer_drain *dr) +{ + atomic_set(&dr->dr_count, 0); + init_waitqueue_head(&dr->dr_waiters); +} + +void +xfs_defer_drain_free(struct xfs_defer_drain *dr) +{ + ASSERT(atomic_read(&dr->dr_count) == 0); +} + +/* Increase the pending intent count. */ +static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr) +{ + atomic_inc(&dr->dr_count); +} + +static inline bool has_waiters(struct wait_queue_head *wq_head) +{ + /* + * This memory barrier is paired with the one in set_current_state on + * the waiting side. + */ + smp_mb__after_atomic(); + return waitqueue_active(wq_head); +} + +/* Decrease the pending intent count, and wake any waiters, if appropriate. */ +static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) +{ + if (atomic_dec_and_test(&dr->dr_count) && + static_branch_unlikely(&xfs_drain_waiter_gate) && + has_waiters(&dr->dr_waiters)) + wake_up(&dr->dr_waiters); +} + +/* Are there intents pending? */ +static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr) +{ + return atomic_read(&dr->dr_count) > 0; +} + +/* + * Wait for the pending intent count for a drain to hit zero. + * + * Callers must not hold any locks that would prevent intents from being + * finished. + */ +static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) +{ + return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr)); +} + +/* + * Get a passive reference to an AG and declare an intent to update its + * metadata. + */ +struct xfs_perag * +xfs_perag_intent_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + if (!pag) + return NULL; + + xfs_perag_intent_hold(pag); + return pag; +} + +/* + * Release our intent to update this AG's metadata, and then release our + * passive ref to the AG. + */ +void +xfs_perag_intent_put( + struct xfs_perag *pag) +{ + xfs_perag_intent_rele(pag); + xfs_perag_put(pag); +} + +/* + * Declare an intent to update AG metadata. Other threads that need exclusive + * access can decide to back off if they see declared intentions. + */ +void +xfs_perag_intent_hold( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_hold(pag, __return_address); + xfs_defer_drain_grab(&pag->pag_intents_drain); +} + +/* Release our intent to update this AG's metadata. */ +void +xfs_perag_intent_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_rele(pag, __return_address); + xfs_defer_drain_rele(&pag->pag_intents_drain); +} + +/* + * Wait for the intent update count for this AG to hit zero. + * Callers must not hold any AG header buffers. + */ +int +xfs_perag_intent_drain( + struct xfs_perag *pag) +{ + trace_xfs_perag_wait_intents(pag, __return_address); + return xfs_defer_drain_wait(&pag->pag_intents_drain); +} + +/* Has anyone declared an intent to update this AG? */ +bool +xfs_perag_intent_busy( + struct xfs_perag *pag) +{ + return xfs_defer_drain_busy(&pag->pag_intents_drain); +} diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h new file mode 100644 index 000000000000..50a5772a8296 --- /dev/null +++ b/fs/xfs/xfs_drain.h @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef XFS_DRAIN_H_ +#define XFS_DRAIN_H_ + +struct xfs_perag; + +#ifdef CONFIG_XFS_DRAIN_INTENTS +/* + * Passive drain mechanism. This data structure tracks a count of some items + * and contains a waitqueue for callers who would like to wake up when the + * count hits zero. + */ +struct xfs_defer_drain { + /* Number of items pending in some part of the filesystem. */ + atomic_t dr_count; + + /* Queue to wait for dri_count to go to zero */ + struct wait_queue_head dr_waiters; +}; + +void xfs_defer_drain_init(struct xfs_defer_drain *dr); +void xfs_defer_drain_free(struct xfs_defer_drain *dr); + +void xfs_drain_wait_disable(void); +void xfs_drain_wait_enable(void); + +/* + * Deferred Work Intent Drains + * =========================== + * + * When a writer thread executes a chain of log intent items, the AG header + * buffer locks will cycle during a transaction roll to get from one intent + * item to the next in a chain. Although scrub takes all AG header buffer + * locks, this isn't sufficient to guard against scrub checking an AG while + * that writer thread is in the middle of finishing a chain because there's no + * higher level locking primitive guarding allocation groups. + * + * When there's a collision, cross-referencing between data structures (e.g. + * rmapbt and refcountbt) yields false corruption events; if repair is running, + * this results in incorrect repairs, which is catastrophic. + * + * The solution is to the perag structure the count of active intents and make + * scrub wait until it has both AG header buffer locks and the intent counter + * reaches zero. It is therefore critical that deferred work threads hold the + * AGI or AGF buffers when decrementing the intent counter. + * + * Given a list of deferred work items, the deferred work manager will complete + * a work item and all the sub-items that the parent item creates before moving + * on to the next work item in the list. This is also true for all levels of + * sub-items. Writer threads are permitted to queue multiple work items + * targetting the same AG, so a deferred work item (such as a BUI) that creates + * sub-items (such as RUIs) must bump the intent counter and maintain it until + * the sub-items can themselves bump the intent counter. + * + * Therefore, the intent count tracks entire lifetimes of deferred work items. + * All functions that create work items must increment the intent counter as + * soon as the item is added to the transaction and cannot drop the counter + * until the item is finished or cancelled. + */ +struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, + xfs_agnumber_t agno); +void xfs_perag_intent_put(struct xfs_perag *pag); + +void xfs_perag_intent_hold(struct xfs_perag *pag); +void xfs_perag_intent_rele(struct xfs_perag *pag); + +int xfs_perag_intent_drain(struct xfs_perag *pag); +bool xfs_perag_intent_busy(struct xfs_perag *pag); +#else +struct xfs_defer_drain { /* empty */ }; + +#define xfs_defer_drain_free(dr) ((void)0) +#define xfs_defer_drain_init(dr) ((void)0) + +#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno)) +#define xfs_perag_intent_put(pag) xfs_perag_put(pag) + +static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } +static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + +#endif /* XFS_DRAIN_H_ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 011b50469301..f9e36b810663 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -351,8 +351,6 @@ xfs_trans_free_extent( struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, - xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); int error; @@ -363,12 +361,13 @@ xfs_trans_free_extent( if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, - xefi->xefi_blockcount); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); - error = __xfs_free_extent(tp, xefi->xefi_startblock, + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -396,14 +395,13 @@ xfs_extent_free_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_extent_free_item *ra; struct xfs_extent_free_item *rb; ra = container_of(a, struct xfs_extent_free_item, xefi_list); rb = container_of(b, struct xfs_extent_free_item, xefi_list); - return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - - XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); + + return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; } /* Log a free extent to the intent item. */ @@ -462,6 +460,26 @@ xfs_extent_free_create_done( return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } +/* Take a passive ref to the AG containing the space we're freeing. */ +void +xfs_extent_free_get_group( + struct xfs_mount *mp, + struct xfs_extent_free_item *xefi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + xefi->xefi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after some freeing work. */ +static inline void +xfs_extent_free_put_group( + struct xfs_extent_free_item *xefi) +{ + xfs_perag_intent_put(xefi->xefi_pag); +} + /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( @@ -476,6 +494,8 @@ xfs_extent_free_finish_item( xefi = container_of(item, struct xfs_extent_free_item, xefi_list); error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -496,6 +516,8 @@ xfs_extent_free_cancel_item( struct xfs_extent_free_item *xefi; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -526,24 +548,21 @@ xfs_agfl_free_finish_item( struct xfs_extent *extp; struct xfs_buf *agbp; int error; - xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; - struct xfs_perag *pag; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(xefi->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno, + xefi->xefi_blockcount); - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); + error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); if (!error) - error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); - xfs_perag_put(pag); + error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, + agbno, agbp, &oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -562,6 +581,7 @@ xfs_agfl_free_finish_item( extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -632,7 +652,9 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; + xfs_extent_free_get_group(mp, &fake); error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 863289aaa441..aede746541f8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1389,25 +1389,10 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } -static vm_fault_t -xfs_filemap_map_pages( - struct vm_fault *vmf, - pgoff_t start_pgoff, - pgoff_t end_pgoff) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return ret; -} - static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = xfs_filemap_map_pages, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c9a7e270a428..351849fc18ff 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -767,7 +767,8 @@ again: return 0; out_error_or_again: - if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { + if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && + error == -EAGAIN) { delay(1); goto again; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 6cd180721659..87910191a9dd 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -34,10 +34,13 @@ struct xfs_icwalk { /* * Flags for xfs_iget() */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_UNTRUSTED 0x2 -#define XFS_IGET_DONTCACHE 0x4 -#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ +#define XFS_IGET_CREATE (1U << 0) +#define XFS_IGET_UNTRUSTED (1U << 1) +#define XFS_IGET_DONTCACHE (1U << 2) +/* don't read from disk or reinit */ +#define XFS_IGET_INCORE (1U << 3) +/* Return -EAGAIN immediately if the inode is unavailable. */ +#define XFS_IGET_NORETRY (1U << 4) int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c index 43005ce8bd48..2ddccb172fa0 100644 --- a/fs/xfs/xfs_iunlink_item.c +++ b/fs/xfs/xfs_iunlink_item.c @@ -168,9 +168,7 @@ xfs_iunlink_log_inode( iup->ip = ip; iup->next_agino = next_agino; iup->old_agino = ip->i_next_unlinked; - - atomic_inc(&pag->pag_ref); - iup->pag = pag; + iup->pag = xfs_perag_hold(pag); xfs_trans_add_item(tp, &iup->item); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 21be93bf006d..b3275e8d47b6 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -667,11 +667,10 @@ xfs_iwalk_threaded( iwag->mp = mp; /* - * perag is being handed off to async work, so take another + * perag is being handed off to async work, so take a passive * reference for the async work to release. */ - atomic_inc(&pag->pag_ref); - iwag->pag = pag; + iwag->pag = xfs_perag_hold(pag); iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e88f18f85e4b..74dcb05069e8 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -80,6 +80,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" +#include "xfs_drain.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 48d771a76add..edd8587658d5 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -279,14 +280,13 @@ xfs_refcount_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_refcount_intent *ra; struct xfs_refcount_intent *rb; ra = container_of(a, struct xfs_refcount_intent, ri_list); rb = container_of(b, struct xfs_refcount_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Set the phys extent flags for this reverse mapping. */ @@ -365,6 +365,26 @@ xfs_refcount_update_create_done( return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } +/* Take a passive ref to the AG containing the space we're refcounting. */ +void +xfs_refcount_update_get_group( + struct xfs_mount *mp, + struct xfs_refcount_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing refcounting work. */ +static inline void +xfs_refcount_update_put_group( + struct xfs_refcount_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( @@ -386,6 +406,8 @@ xfs_refcount_update_finish_item( ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -406,6 +428,8 @@ xfs_refcount_update_cancel_item( struct xfs_refcount_intent *ri; ri = container_of(item, struct xfs_refcount_intent, ri_list); + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -520,9 +544,13 @@ xfs_cui_item_recover( fake.ri_startblock = pmap->pe_startblock; fake.ri_blockcount = pmap->pe_len; - if (!requeue_only) + + if (!requeue_only) { + xfs_refcount_update_get_group(mp, &fake); error = xfs_trans_log_finish_refcount_update(tp, cudp, &fake, &rcur); + xfs_refcount_update_put_group(&fake); + } if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a1619d67015f..520c7ebdfed8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -320,14 +321,13 @@ xfs_rmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_rmap_intent *ra; struct xfs_rmap_intent *rb; ra = container_of(a, struct xfs_rmap_intent, ri_list); rb = container_of(b, struct xfs_rmap_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Log rmap updates in the intent item. */ @@ -390,6 +390,26 @@ xfs_rmap_update_create_done( return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } +/* Take a passive ref to the AG containing the space we're rmapping. */ +void +xfs_rmap_update_get_group( + struct xfs_mount *mp, + struct xfs_rmap_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing rmapping work. */ +static inline void +xfs_rmap_update_put_group( + struct xfs_rmap_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( @@ -405,6 +425,8 @@ xfs_rmap_update_finish_item( error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -425,6 +447,8 @@ xfs_rmap_update_cancel_item( struct xfs_rmap_intent *ri; ri = container_of(item, struct xfs_rmap_intent, ri_list); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -559,11 +583,13 @@ xfs_rui_item_recover( fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_rmap_update_put_group(&fake); if (error) goto abort_error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4f814f9e12ab..4d2e87462ac4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1548,6 +1548,19 @@ xfs_fs_fill_super( #endif } + /* ASCII case insensitivity is undergoing deprecation. */ + if (xfs_has_asciici(mp)) { +#ifdef CONFIG_XFS_SUPPORT_ASCII_CI + xfs_warn_once(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* Filesystem claims it needs repair, so refuse the mount. */ if (xfs_has_needsrepair(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 546a6cd96729..fade33735393 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -210,28 +210,10 @@ static struct ctl_table xfs_table[] = { {} }; -static struct ctl_table xfs_dir_table[] = { - { - .procname = "xfs", - .mode = 0555, - .child = xfs_table - }, - {} -}; - -static struct ctl_table xfs_root_table[] = { - { - .procname = "fs", - .mode = 0555, - .child = xfs_dir_table - }, - {} -}; - int xfs_sysctl_register(void) { - xfs_table_header = register_sysctl_table(xfs_root_table); + xfs_table_header = register_sysctl("fs/xfs", xfs_table); if (!xfs_table_header) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9c0006c55fec..cd4ca5b1fcb0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -190,6 +190,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_hold); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_grab); DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); @@ -2686,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, + void *item), + TP_ARGS(mp, dfp, item), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(void *, item) + __field(char, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type; + __entry->intent = dfp->dfp_intent; + __entry->item = item; + __entry->committed = dfp->dfp_done != NULL; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->item, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_item_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \ + void *item), \ + TP_ARGS(mp, dfp, item)) + +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); + /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -4325,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown, __entry->line_num) ); +#ifdef CONFIG_XFS_DRAIN_INTENTS +DECLARE_EVENT_CLASS(xfs_perag_intents_class, + TP_PROTO(struct xfs_perag *pag, void *caller_ip), + TP_ARGS(pag, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(long, nr_intents) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->nr_intents, + __entry->caller_ip) +); + +#define DEFINE_PERAG_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_perag_intents_class, name, \ + TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ + TP_ARGS(pag, caller_ip)) +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |