Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.h | 3
-rw-r--r--  fs/9p/fid.c | 48
-rw-r--r--  fs/9p/fid.h | 31
-rw-r--r--  fs/9p/v9fs.c | 59
-rw-r--r--  fs/9p/v9fs.h | 62
-rw-r--r--  fs/9p/v9fs_vfs.h | 4
-rw-r--r--  fs/9p/vfs_addr.c | 50
-rw-r--r--  fs/9p/vfs_dir.c | 10
-rw-r--r--  fs/9p/vfs_file.c | 206
-rw-r--r--  fs/9p/vfs_inode.c | 111
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 90
-rw-r--r--  fs/9p/vfs_super.c | 44
-rw-r--r--  fs/Kconfig | 9
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/afs/dir.c | 14
-rw-r--r--  fs/afs/dir_edit.c | 2
-rw-r--r--  fs/afs/file.c | 14
-rw-r--r--  fs/afs/inode.c | 37
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/write.c | 4
-rw-r--r--  fs/binfmt_elf.c | 3
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/buffer.c | 89
-rw-r--r--  fs/cachefiles/error_inject.c | 11
-rw-r--r--  fs/ceph/addr.c | 11
-rw-r--r--  fs/ceph/caps.c | 2
-rw-r--r--  fs/ceph/debugfs.c | 18
-rw-r--r--  fs/ceph/dir.c | 13
-rw-r--r--  fs/ceph/mds_client.c | 78
-rw-r--r--  fs/ceph/mds_client.h | 5
-rw-r--r--  fs/ceph/super.h | 2
-rw-r--r--  fs/ceph/xattr.c | 20
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/connect.c | 8
-rw-r--r--  fs/cifs/file.c | 16
-rw-r--r--  fs/cifs/misc.c | 8
-rw-r--r--  fs/cifs/smb2pdu.c | 115
-rw-r--r--  fs/cifs/smb2pdu.h | 20
-rw-r--r--  fs/coda/psdev.c | 2
-rw-r--r--  fs/coda/sysctl.c | 11
-rw-r--r--  fs/coredump.c | 1
-rw-r--r--  fs/eventpoll.c | 215
-rw-r--r--  fs/exec.c | 4
-rw-r--r--  fs/ext4/extents.c | 3
-rw-r--r--  fs/ext4/inline.c | 19
-rw-r--r--  fs/ext4/inode.c | 17
-rw-r--r--  fs/ext4/mmp.c | 9
-rw-r--r--  fs/ext4/move_extent.c | 8
-rw-r--r--  fs/ext4/super.c | 77
-rw-r--r--  fs/ext4/verity.c | 6
-rw-r--r--  fs/fuse/cuse.c | 2
-rw-r--r--  fs/hostfs/Makefile | 6
-rw-r--r--  fs/hostfs/hostfs_user_exp.c | 28
-rw-r--r--  fs/hugetlbfs/inode.c | 4
-rw-r--r--  fs/inode.c | 3
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/iomap/buffered-io.c | 11
-rw-r--r--  fs/iomap/direct-io.c | 9
-rw-r--r--  fs/iomap/trace.c | 1
-rw-r--r--  fs/iomap/trace.h | 78
-rw-r--r--  fs/kernfs/dir.c | 26
-rw-r--r--  fs/kernfs/file.c | 4
-rw-r--r--  fs/kernfs/inode.c | 16
-rw-r--r--  fs/kernfs/kernfs-internal.h | 2
-rw-r--r--  fs/kernfs/mount.c | 8
-rw-r--r--  fs/ksmbd/server.c | 11
-rw-r--r--  fs/ksmbd/smb2pdu.c | 203
-rw-r--r--  fs/ksmbd/smb2pdu.h | 33
-rw-r--r--  fs/ksmbd/vfs.c | 437
-rw-r--r--  fs/ksmbd/vfs.h | 19
-rw-r--r--  fs/ksmbd/vfs_cache.c | 5
-rw-r--r--  fs/lockd/Makefile | 6
-rw-r--r--  fs/lockd/clntlock.c | 58
-rw-r--r--  fs/lockd/clntproc.c | 42
-rw-r--r--  fs/lockd/host.c | 1
-rw-r--r--  fs/lockd/svc.c | 20
-rw-r--r--  fs/lockd/svclock.c | 21
-rw-r--r--  fs/lockd/trace.c | 3
-rw-r--r--  fs/lockd/trace.h | 106
-rw-r--r--  fs/mpage.c | 66
-rw-r--r--  fs/namei.c | 125
-rw-r--r--  fs/netfs/buffered_read.c | 4
-rw-r--r--  fs/nfs/Kconfig | 1
-rw-r--r--  fs/nfs/dir.c | 300
-rw-r--r--  fs/nfs/export.c | 9
-rw-r--r--  fs/nfs/file.c | 4
-rw-r--r--  fs/nfs/fscache.c | 238
-rw-r--r--  fs/nfs/fscache.h | 131
-rw-r--r--  fs/nfs/inode.c | 114
-rw-r--r--  fs/nfs/internal.h | 24
-rw-r--r--  fs/nfs/iostat.h | 17
-rw-r--r--  fs/nfs/nfs3acl.c | 5
-rw-r--r--  fs/nfs/nfs42xdr.c | 4
-rw-r--r--  fs/nfs/nfs4proc.c | 17
-rw-r--r--  fs/nfs/nfs4state.c | 8
-rw-r--r--  fs/nfs/nfs4sysctl.c | 21
-rw-r--r--  fs/nfs/nfstrace.h | 91
-rw-r--r--  fs/nfs/pagelist.c | 4
-rw-r--r--  fs/nfs/read.c | 105
-rw-r--r--  fs/nfs/super.c | 11
-rw-r--r--  fs/nfs/sysctl.c | 20
-rw-r--r--  fs/nfs_common/nfs_ssc.c | 1
-rw-r--r--  fs/nfsd/export.c | 64
-rw-r--r--  fs/nfsd/export.h | 1
-rw-r--r--  fs/nfsd/filecache.c | 430
-rw-r--r--  fs/nfsd/filecache.h | 9
-rw-r--r--  fs/nfsd/nfs4idmap.c | 8
-rw-r--r--  fs/nfsd/vfs.c | 13
-rw-r--r--  fs/nilfs2/page.c | 6
-rw-r--r--  fs/ntfs/sysctl.c | 12
-rw-r--r--  fs/ntfs3/attrib.c | 17
-rw-r--r--  fs/ntfs3/bitmap.c | 25
-rw-r--r--  fs/ntfs3/file.c | 50
-rw-r--r--  fs/ntfs3/frecord.c | 46
-rw-r--r--  fs/ntfs3/fslog.c | 83
-rw-r--r--  fs/ntfs3/fsntfs.c | 84
-rw-r--r--  fs/ntfs3/index.c | 81
-rw-r--r--  fs/ntfs3/inode.c | 134
-rw-r--r--  fs/ntfs3/lznt.c | 10
-rw-r--r--  fs/ntfs3/namei.c | 19
-rw-r--r--  fs/ntfs3/ntfs.h | 3
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 19
-rw-r--r--  fs/ntfs3/record.c | 15
-rw-r--r--  fs/ntfs3/run.c | 6
-rw-r--r--  fs/ntfs3/super.c | 312
-rw-r--r--  fs/ntfs3/xattr.c | 70
-rw-r--r--  fs/ocfs2/ioctl.c | 37
-rw-r--r--  fs/orangefs/inode.c | 9
-rw-r--r--  fs/proc/array.c | 9
-rw-r--r--  fs/proc/base.c | 4
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/kcore.c | 85
-rw-r--r--  fs/proc/meminfo.c | 13
-rw-r--r--  fs/proc/proc_sysctl.c | 142
-rw-r--r--  fs/proc/stat.c | 26
-rw-r--r--  fs/proc/task_mmu.c | 12
-rw-r--r--  fs/proc/vmcore.c | 22
-rw-r--r--  fs/pstore/pmsg.c | 9
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/smbfs_common/smb2pdu.h | 56
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/ubifs/compress.c | 1
-rw-r--r--  fs/ubifs/dir.c | 7
-rw-r--r--  fs/ubifs/tnc.c | 142
-rw-r--r--  fs/unicode/utf8-core.c | 1
-rw-r--r--  fs/userfaultfd.c | 65
-rw-r--r--  fs/xfs/Kconfig | 32
-rw-r--r--  fs/xfs/Makefile | 5
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 23
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 9
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 115
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 22
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 32
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 39
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 19
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 204
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 141
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 31
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 165
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 35
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 19
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 117
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 10
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 31
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 358
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 38
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 102
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 11
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 12
-rw-r--r--  fs/xfs/scrub/agheader.c | 30
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 105
-rw-r--r--  fs/xfs/scrub/alloc.c | 69
-rw-r--r--  fs/xfs/scrub/attr.c | 312
-rw-r--r--  fs/xfs/scrub/attr.h | 64
-rw-r--r--  fs/xfs/scrub/bitmap.c | 428
-rw-r--r--  fs/xfs/scrub/bitmap.h | 111
-rw-r--r--  fs/xfs/scrub/bmap.c | 420
-rw-r--r--  fs/xfs/scrub/btree.c | 102
-rw-r--r--  fs/xfs/scrub/btree.h | 16
-rw-r--r--  fs/xfs/scrub/common.c | 465
-rw-r--r--  fs/xfs/scrub/common.h | 32
-rw-r--r--  fs/xfs/scrub/dabtree.c | 7
-rw-r--r--  fs/xfs/scrub/dabtree.h | 6
-rw-r--r--  fs/xfs/scrub/dir.c | 246
-rw-r--r--  fs/xfs/scrub/fscounters.c | 11
-rw-r--r--  fs/xfs/scrub/health.c | 8
-rw-r--r--  fs/xfs/scrub/health.h | 6
-rw-r--r--  fs/xfs/scrub/ialloc.c | 304
-rw-r--r--  fs/xfs/scrub/inode.c | 189
-rw-r--r--  fs/xfs/scrub/parent.c | 300
-rw-r--r--  fs/xfs/scrub/quota.c | 9
-rw-r--r--  fs/xfs/scrub/readdir.c | 375
-rw-r--r--  fs/xfs/scrub/readdir.h | 19
-rw-r--r--  fs/xfs/scrub/refcount.c | 197
-rw-r--r--  fs/xfs/scrub/repair.c | 112
-rw-r--r--  fs/xfs/scrub/repair.h | 7
-rw-r--r--  fs/xfs/scrub/rmap.c | 570
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 6
-rw-r--r--  fs/xfs/scrub/scrub.c | 74
-rw-r--r--  fs/xfs/scrub/scrub.h | 32
-rw-r--r--  fs/xfs/scrub/symlink.c | 6
-rw-r--r--  fs/xfs/scrub/trace.c | 6
-rw-r--r--  fs/xfs/scrub/trace.h | 75
-rw-r--r--  fs/xfs/scrub/xfs_scrub.h | 6
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 37
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 14
-rw-r--r--  fs/xfs/xfs_buf.c | 3
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 10
-rw-r--r--  fs/xfs/xfs_dahash_test.c | 211
-rw-r--r--  fs/xfs/xfs_dquot.c | 1
-rw-r--r--  fs/xfs/xfs_drain.c | 166
-rw-r--r--  fs/xfs/xfs_drain.h | 87
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 54
-rw-r--r--  fs/xfs/xfs_file.c | 17
-rw-r--r--  fs/xfs/xfs_icache.c | 3
-rw-r--r--  fs/xfs/xfs_icache.h | 11
-rw-r--r--  fs/xfs/xfs_iunlink_item.c | 4
-rw-r--r--  fs/xfs/xfs_iwalk.c | 5
-rw-r--r--  fs/xfs/xfs_linux.h | 1
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 36
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 32
-rw-r--r--  fs/xfs/xfs_super.c | 13
-rw-r--r--  fs/xfs/xfs_sysctl.c | 20
-rw-r--r--  fs/xfs/xfs_trace.h | 72
230 files changed, 8496 insertions, 4878 deletions
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 1923affcdc62..ee1b6b06a2fd 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -8,9 +8,8 @@
#ifndef _9P_CACHE_H
#define _9P_CACHE_H
-#include <linux/fscache.h>
-
#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses,
const char *dev_name);
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 805151114e96..de009a33e0e2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -41,14 +41,24 @@ void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid)
*pfid = NULL;
}
+static bool v9fs_is_writeable(int mode)
+{
+ if (mode & (P9_OWRITE|P9_ORDWR))
+ return true;
+ else
+ return false;
+}
+
/**
* v9fs_fid_find_inode - search for an open fid off of the inode list
* @inode: return a fid pointing to a specific inode
+ * @want_writeable: only consider fids which are writeable
* @uid: return a fid belonging to the specified user
+ * @any: ignore uid as a selection criteria
*
*/
-
-static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
+struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable,
+ kuid_t uid, bool any)
{
struct hlist_head *h;
struct p9_fid *fid, *ret = NULL;
@@ -58,7 +68,12 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
spin_lock(&inode->i_lock);
h = (struct hlist_head *)&inode->i_private;
hlist_for_each_entry(fid, h, ilist) {
- if (uid_eq(fid->uid, uid)) {
+ if (any || uid_eq(fid->uid, uid)) {
+ if (want_writeable && !v9fs_is_writeable(fid->mode)) {
+ p9_debug(P9_DEBUG_VFS, " mode: %x not writeable?\n",
+ fid->mode);
+ continue;
+ }
p9_fid_get(fid);
ret = fid;
break;
@@ -118,7 +133,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
spin_unlock(&dentry->d_lock);
} else {
if (dentry->d_inode)
- ret = v9fs_fid_find_inode(dentry->d_inode, uid);
+ ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
}
return ret;
@@ -299,28 +314,3 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
return v9fs_fid_lookup_with_uid(dentry, uid, any);
}
-struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
-{
- int err;
- struct p9_fid *fid, *ofid;
-
- ofid = v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0);
- fid = clone_fid(ofid);
- if (IS_ERR(fid))
- goto error_out;
- p9_fid_put(ofid);
- /*
- * writeback fid will only be used to write back the
- * dirty pages. We always request for the open fid in read-write
- * mode so that a partial page write which result in page
- * read can work.
- */
- err = p9_client_open(fid, O_RDWR);
- if (err < 0) {
- p9_fid_put(fid);
- fid = ERR_PTR(err);
- goto error_out;
- }
-error_out:
- return fid;
-}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 8a4e8cd12ca2..0c51889a60b3 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -7,14 +7,16 @@
#ifndef FS_9P_FID_H
#define FS_9P_FID_H
#include <linux/list.h>
+#include "v9fs.h"
+struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable,
+ kuid_t uid, bool any);
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
{
return v9fs_fid_lookup(dentry->d_parent);
}
void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid);
-struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid);
static inline struct p9_fid *clone_fid(struct p9_fid *fid)
{
@@ -32,4 +34,31 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
p9_fid_put(fid);
return nfid;
}
+/**
+ * v9fs_fid_addmodes - add cache flags to fid mode (for client use only)
+ * @fid: fid to augment
+ * @s_flags: session info mount flags
+ * @s_cache: session info cache flags
+ * @f_flags: unix open flags
+ *
+ * make sure mode reflects flags of underlying mounts
+ * also qid.version == 0 reflects a synthetic or legacy file system
+ * NOTE: these are set after open so only reflect 9p client not
+ * underlying file system on server.
+ */
+static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags,
+ int s_cache, unsigned int f_flags)
+{
+ if (fid->qid.type != P9_QTFILE)
+ return;
+
+ if ((!s_cache) ||
+ ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) ||
+ (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) {
+ fid->mode |= P9L_DIRECT; /* no read or write cache */
+ } else if ((!(s_cache & CACHE_WRITEBACK)) ||
+ (f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) {
+ fid->mode |= P9L_NOWRITECACHE;
+ }
+}
#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 61a51b90600d..c7f774fe398f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -38,9 +38,7 @@ enum {
/* String options */
Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
- Opt_nodevmap,
- /* Cache options */
- Opt_cache_loose, Opt_fscache, Opt_mmap,
+ Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv,
/* Access options */
Opt_access, Opt_posixacl,
/* Lock timeout option */
@@ -57,10 +55,10 @@ static const match_table_t tokens = {
{Opt_uname, "uname=%s"},
{Opt_remotename, "aname=%s"},
{Opt_nodevmap, "nodevmap"},
+ {Opt_noxattr, "noxattr"},
+ {Opt_directio, "directio"},
+ {Opt_ignoreqv, "ignoreqv"},
{Opt_cache, "cache=%s"},
- {Opt_cache_loose, "loose"},
- {Opt_fscache, "fscache"},
- {Opt_mmap, "mmap"},
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
@@ -68,32 +66,30 @@ static const match_table_t tokens = {
{Opt_err, NULL}
};
-static const char *const v9fs_cache_modes[nr__p9_cache_modes] = {
- [CACHE_NONE] = "none",
- [CACHE_MMAP] = "mmap",
- [CACHE_LOOSE] = "loose",
- [CACHE_FSCACHE] = "fscache",
-};
-
/* Interpret mount options for cache mode */
static int get_cache_mode(char *s)
{
int version = -EINVAL;
if (!strcmp(s, "loose")) {
- version = CACHE_LOOSE;
+ version = CACHE_SC_LOOSE;
p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
} else if (!strcmp(s, "fscache")) {
- version = CACHE_FSCACHE;
+ version = CACHE_SC_FSCACHE;
p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
} else if (!strcmp(s, "mmap")) {
- version = CACHE_MMAP;
+ version = CACHE_SC_MMAP;
p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
+ } else if (!strcmp(s, "readahead")) {
+ version = CACHE_SC_READAHEAD;
+ p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n");
} else if (!strcmp(s, "none")) {
- version = CACHE_NONE;
+ version = CACHE_SC_NONE;
p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
- } else
- pr_info("Unknown Cache mode %s\n", s);
+ } else if (kstrtoint(s, 0, &version) != 0) {
+ version = -EINVAL;
+ pr_info("Unknown Cache mode or invalid value %s\n", s);
+ }
return version;
}
@@ -121,9 +117,9 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
if (v9ses->nodev)
seq_puts(m, ",nodevmap");
if (v9ses->cache)
- seq_printf(m, ",%s", v9fs_cache_modes[v9ses->cache]);
+ seq_printf(m, ",cache=%x", v9ses->cache);
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cachetag && v9ses->cache == CACHE_FSCACHE)
+ if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE))
seq_printf(m, ",cachetag=%s", v9ses->cachetag);
#endif
@@ -143,9 +139,16 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root)
break;
}
+ if (v9ses->flags & V9FS_IGNORE_QV)
+ seq_puts(m, ",ignoreqv");
+ if (v9ses->flags & V9FS_DIRECT_IO)
+ seq_puts(m, ",directio");
if (v9ses->flags & V9FS_POSIX_ACL)
seq_puts(m, ",posixacl");
+ if (v9ses->flags & V9FS_NO_XATTR)
+ seq_puts(m, ",noxattr");
+
return p9_show_client_options(m, v9ses->clnt);
}
@@ -266,14 +269,14 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_nodevmap:
v9ses->nodev = 1;
break;
- case Opt_cache_loose:
- v9ses->cache = CACHE_LOOSE;
+ case Opt_noxattr:
+ v9ses->flags |= V9FS_NO_XATTR;
break;
- case Opt_fscache:
- v9ses->cache = CACHE_FSCACHE;
+ case Opt_directio:
+ v9ses->flags |= V9FS_DIRECT_IO;
break;
- case Opt_mmap:
- v9ses->cache = CACHE_MMAP;
+ case Opt_ignoreqv:
+ v9ses->flags |= V9FS_IGNORE_QV;
break;
case Opt_cachetag:
#ifdef CONFIG_9P_FSCACHE
@@ -468,7 +471,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
- if (v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache & CACHE_FSCACHE) {
rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
if (rc < 0)
goto err_clnt;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index f3f74d197b5d..06a2514f0d88 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -31,29 +31,54 @@
#define V9FS_ACL_MASK V9FS_POSIX_ACL
enum p9_session_flags {
- V9FS_PROTO_2000U = 0x01,
- V9FS_PROTO_2000L = 0x02,
- V9FS_ACCESS_SINGLE = 0x04,
- V9FS_ACCESS_USER = 0x08,
- V9FS_ACCESS_CLIENT = 0x10,
- V9FS_POSIX_ACL = 0x20
+ V9FS_PROTO_2000U = 0x01,
+ V9FS_PROTO_2000L = 0x02,
+ V9FS_ACCESS_SINGLE = 0x04,
+ V9FS_ACCESS_USER = 0x08,
+ V9FS_ACCESS_CLIENT = 0x10,
+ V9FS_POSIX_ACL = 0x20,
+ V9FS_NO_XATTR = 0x40,
+ V9FS_IGNORE_QV = 0x80, /* ignore qid.version for cache hints */
+ V9FS_DIRECT_IO = 0x100,
+ V9FS_SYNC = 0x200
};
-/* possible values of ->cache */
/**
- * enum p9_cache_modes - user specified cache preferences
- * @CACHE_NONE: do not cache data, dentries, or directory contents (default)
- * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency
+ * enum p9_cache_shortcuts - human readable cache preferences
+ * @CACHE_SC_NONE: disable all caches
+ * @CACHE_SC_READAHEAD: only provide caching for readahead
+ * @CACHE_SC_MMAP: provide caching to enable mmap
+ * @CACHE_SC_LOOSE: non-coherent caching for files and meta data
+ * @CACHE_SC_FSCACHE: persistent non-coherent caching for files and meta-data
*
- * eventually support loose, tight, time, session, default always none
*/
-enum p9_cache_modes {
- CACHE_NONE,
- CACHE_MMAP,
- CACHE_LOOSE,
- CACHE_FSCACHE,
- nr__p9_cache_modes
+enum p9_cache_shortcuts {
+ CACHE_SC_NONE = 0b00000000,
+ CACHE_SC_READAHEAD = 0b00000001,
+ CACHE_SC_MMAP = 0b00000101,
+ CACHE_SC_LOOSE = 0b00001111,
+ CACHE_SC_FSCACHE = 0b10001111,
+};
+
+/**
+ * enum p9_cache_bits - possible values of ->cache
+ * @CACHE_NONE: caches disabled
+ * @CACHE_FILE: file caching (open to close)
+ * @CACHE_META: meta-data and directory caching
+ * @CACHE_WRITEBACK: write-back caching for files
+ * @CACHE_LOOSE: don't check cache consistency
+ * @CACHE_FSCACHE: local persistent caches
+ *
+ */
+
+enum p9_cache_bits {
+ CACHE_NONE = 0b00000000,
+ CACHE_FILE = 0b00000001,
+ CACHE_META = 0b00000010,
+ CACHE_WRITEBACK = 0b00000100,
+ CACHE_LOOSE = 0b00001000,
+ CACHE_FSCACHE = 0b10000000,
};
/**
@@ -62,7 +87,7 @@ enum p9_cache_modes {
* @nodev: set to 1 to disable device mapping
* @debug: debug level
* @afid: authentication handle
- * @cache: cache mode of type &p9_cache_modes
+ * @cache: cache mode of type &p9_cache_bits
* @cachetag: the tag of the cache associated with this session
* @fscache: session cookie associated with FS-Cache
* @uname: string user name to mount hierarchy as
@@ -112,7 +137,6 @@ struct v9fs_inode {
struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
unsigned int cache_validity;
- struct p9_fid *writeback_fid;
struct mutex v_mutex;
};
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 75106b9f293d..cdf441f22e07 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -36,10 +36,6 @@ extern const struct file_operations v9fs_dir_operations;
extern const struct file_operations v9fs_dir_operations_dotl;
extern const struct dentry_operations v9fs_dentry_operations;
extern const struct dentry_operations v9fs_cached_dentry_operations;
-extern const struct file_operations v9fs_cached_file_operations;
-extern const struct file_operations v9fs_cached_file_operations_dotl;
-extern const struct file_operations v9fs_mmap_file_operations;
-extern const struct file_operations v9fs_mmap_file_operations_dotl;
extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 425956eb9fde..8a635999a7d6 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,8 +56,6 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct inode *inode = file_inode(file);
- struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = file->private_data;
BUG_ON(!fid);
@@ -65,11 +63,8 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- if (rreq->origin == NETFS_READ_FOR_WRITE &&
- (fid->mode & O_ACCMODE) == O_WRONLY) {
- fid = v9inode->writeback_fid;
- BUG_ON(!fid);
- }
+ WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
+ !(fid->mode & P9_ORDWR));
p9_fid_get(fid);
rreq->netfs_priv = fid;
@@ -119,8 +114,6 @@ const struct netfs_request_ops v9fs_req_ops = {
static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct inode *inode = folio_inode(folio);
-
if (folio_test_private(folio))
return false;
#ifdef CONFIG_9P_FSCACHE
@@ -129,8 +122,8 @@ static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
return false;
folio_wait_fscache(folio);
}
+ fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
#endif
- fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
return true;
}
@@ -140,6 +133,7 @@ static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
folio_wait_fscache(folio);
}
+#ifdef CONFIG_9P_FSCACHE
static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
bool was_async)
{
@@ -153,17 +147,19 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
i_size_read(&v9inode->netfs.inode), 0);
}
}
+#endif
static int v9fs_vfs_write_folio_locked(struct folio *folio)
{
struct inode *inode = folio_inode(folio);
- struct v9fs_inode *v9inode = V9FS_I(inode);
- struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode);
loff_t start = folio_pos(folio);
loff_t i_size = i_size_read(inode);
struct iov_iter from;
size_t len = folio_size(folio);
+ struct p9_fid *writeback_fid;
int err;
+ struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
+ struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
if (start >= i_size)
return 0; /* Simultaneous truncation occurred */
@@ -172,25 +168,33 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
- /* We should have writeback_fid always set */
- BUG_ON(!v9inode->writeback_fid);
+ writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
+ if (!writeback_fid) {
+ WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
+ inode->i_private);
+ return -EINVAL;
+ }
folio_wait_fscache(folio);
folio_start_writeback(folio);
- p9_client_write(v9inode->writeback_fid, start, &from, &err);
+ p9_client_write(writeback_fid, start, &from, &err);
+#ifdef CONFIG_9P_FSCACHE
if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
+ fscache_cookie_enabled(cookie) &&
+ test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
folio_start_fscache(folio);
fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
+ folio_mapping(folio), start, len, i_size,
+ v9fs_write_to_cache_done, v9inode,
+ true);
}
+#endif
folio_end_writeback(folio);
+ p9_fid_put(writeback_fid);
+
return err;
}
@@ -297,7 +301,6 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
loff_t last_pos = pos + copied;
struct folio *folio = page_folio(subpage);
struct inode *inode = mapping->host;
- struct v9fs_inode *v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
@@ -317,7 +320,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
if (last_pos > inode->i_size) {
inode_add_bytes(inode, last_pos - inode->i_size);
i_size_write(inode, last_pos);
- fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos);
+#ifdef CONFIG_9P_FSCACHE
+ fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
+ &last_pos);
+#endif
}
folio_mark_dirty(folio);
out:
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 52bf87934650..45b684b7d8d7 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -196,9 +196,9 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
/**
- * v9fs_dir_release - called on a close of a file or directory
- * @inode: inode of the directory
- * @filp: file pointer to a directory
+ * v9fs_dir_release - close a directory or a file
+ * @inode: inode of the directory or file
+ * @filp: file pointer to a directory or file
*
*/
@@ -213,7 +213,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
fid = filp->private_data;
p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
inode, filp, fid ? fid->fid : -1);
+
if (fid) {
+ if ((S_ISREG(inode->i_mode)) && (filp->f_mode & FMODE_WRITE))
+ retval = filemap_fdatawrite(inode->i_mapping);
+
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 367a851eaa82..6c31b8c8112d 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -28,7 +28,6 @@
#include "fid.h"
#include "cache.h"
-static const struct vm_operations_struct v9fs_file_vm_ops;
static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
/**
@@ -41,13 +40,11 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
int v9fs_file_open(struct inode *inode, struct file *file)
{
int err;
- struct v9fs_inode *v9inode;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *writeback_fid;
+ struct p9_fid *fid;
int omode;
p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
- v9inode = V9FS_I(inode);
v9ses = v9fs_inode2v9ses(inode);
if (v9fs_proto_dotl(v9ses))
omode = v9fs_open_to_dotl_flags(file->f_flags);
@@ -60,7 +57,19 @@ int v9fs_file_open(struct inode *inode, struct file *file)
if (IS_ERR(fid))
return PTR_ERR(fid);
- err = p9_client_open(fid, omode);
+ if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) {
+ int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR;
+
+ p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n");
+ err = p9_client_open(fid, writeback_omode);
+ if (err < 0) {
+ p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n");
+ err = p9_client_open(fid, omode);
+ fid->mode |= P9L_DIRECT;
+ }
+ } else {
+ err = p9_client_open(fid, omode);
+ }
if (err < 0) {
p9_fid_put(fid);
return err;
@@ -72,36 +81,14 @@ int v9fs_file_open(struct inode *inode, struct file *file)
file->private_data = fid;
}
- mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache) && !v9inode->writeback_fid &&
- ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during open time instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- writeback_fid = v9fs_writeback_fid(file_dentry(file));
- if (IS_ERR(writeback_fid)) {
- err = PTR_ERR(writeback_fid);
- mutex_unlock(&v9inode->v_mutex);
- goto out_error;
- }
- v9inode->writeback_fid = (void *) writeback_fid;
- }
- mutex_unlock(&v9inode->v_mutex);
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache == CACHE_FSCACHE)
- fscache_use_cookie(v9fs_inode_cookie(v9inode),
+ if (v9ses->cache & CACHE_FSCACHE)
+ fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)),
file->f_mode & FMODE_WRITE);
#endif
+ v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags);
v9fs_open_fid_add(inode, &fid);
return 0;
-out_error:
- p9_fid_put(file->private_data);
- file->private_data = NULL;
- return err;
}
/**
@@ -368,8 +355,13 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct p9_fid *fid = iocb->ki_filp->private_data;
int ret, err = 0;
- p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
- iov_iter_count(to), iocb->ki_pos);
+ p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
+ fid->fid, iov_iter_count(to), iocb->ki_pos);
+
+ if (!(fid->mode & P9L_DIRECT)) {
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return generic_file_read_iter(iocb, to);
+ }
if (iocb->ki_filp->f_flags & O_NONBLOCK)
ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
@@ -392,10 +384,18 @@ static ssize_t
v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct p9_fid *fid = file->private_data;
ssize_t retval;
loff_t origin;
int err = 0;
+ p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
+
+ if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return generic_file_write_iter(iocb, from);
+ }
+
retval = generic_write_checks(iocb, from);
if (retval <= 0)
return retval;
@@ -477,45 +477,18 @@ static int
v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
int retval;
+ struct inode *inode = file_inode(filp);
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+ p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp);
- retval = generic_file_mmap(filp, vma);
- if (!retval)
- vma->vm_ops = &v9fs_file_vm_ops;
-
- return retval;
-}
-
-static int
-v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- int retval;
- struct inode *inode;
- struct v9fs_inode *v9inode;
- struct p9_fid *fid;
-
- inode = file_inode(filp);
- v9inode = V9FS_I(inode);
- mutex_lock(&v9inode->v_mutex);
- if (!v9inode->writeback_fid &&
- (vma->vm_flags & VM_SHARED) &&
- (vma->vm_flags & VM_WRITE)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during mmap instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- fid = v9fs_writeback_fid(file_dentry(filp));
- if (IS_ERR(fid)) {
- retval = PTR_ERR(fid);
- mutex_unlock(&v9inode->v_mutex);
- return retval;
- }
- v9inode->writeback_fid = (void *) fid;
+ if (!(v9ses->cache & CACHE_WRITEBACK)) {
+ p9_debug(P9_DEBUG_CACHE, "(no mmap mode)");
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+ invalidate_inode_pages2(filp->f_mapping);
+ return generic_file_readonly_mmap(filp, vma);
}
- mutex_unlock(&v9inode->v_mutex);
retval = generic_file_mmap(filp, vma);
if (!retval)
@@ -527,7 +500,6 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct v9fs_inode *v9inode;
struct folio *folio = page_folio(vmf->page);
struct file *filp = vmf->vma->vm_file;
struct inode *inode = file_inode(filp);
@@ -536,8 +508,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
folio, (unsigned long)filp->private_data);
- v9inode = V9FS_I(inode);
-
/* Wait for the page to be written to the cache before we allow it to
* be modified. We then assume the entire page will need writing back.
*/
@@ -550,7 +520,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(filp);
- BUG_ON(!v9inode->writeback_fid);
if (folio_lock_killable(folio) < 0)
return VM_FAULT_RETRY;
if (folio_mapping(folio) != inode->i_mapping)
@@ -563,35 +532,6 @@ out_unlock:
return VM_FAULT_NOPAGE;
}
-/**
- * v9fs_mmap_file_read_iter - read from a file
- * @iocb: The operation parameters
- * @to: The buffer to read into
- *
- */
-static ssize_t
-v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
- /* TODO: Check if there are dirty pages */
- return v9fs_file_read_iter(iocb, to);
-}
-
-/**
- * v9fs_mmap_file_write_iter - write to a file
- * @iocb: The operation parameters
- * @from: The data to write
- *
- */
-static ssize_t
-v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- /*
- * TODO: invalidate mmaps on filp's inode between
- * offset and offset+count
- */
- return v9fs_file_write_iter(iocb, from);
-}
-
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
{
struct inode *inode;
@@ -614,13 +554,6 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
}
-
-static const struct vm_operations_struct v9fs_file_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = v9fs_vm_page_mkwrite,
-};
-
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
.close = v9fs_mmap_vm_close,
.fault = filemap_fault,
@@ -628,34 +561,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
.page_mkwrite = v9fs_vm_page_mkwrite,
};
-
-const struct file_operations v9fs_cached_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .open = v9fs_file_open,
- .release = v9fs_dir_release,
- .lock = v9fs_file_lock,
- .mmap = v9fs_file_mmap,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fsync = v9fs_file_fsync,
-};
-
-const struct file_operations v9fs_cached_file_operations_dotl = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .open = v9fs_file_open,
- .release = v9fs_dir_release,
- .lock = v9fs_file_lock_dotl,
- .flock = v9fs_file_flock_dotl,
- .mmap = v9fs_file_mmap,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fsync = v9fs_file_fsync_dotl,
-};
-
const struct file_operations v9fs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = v9fs_file_read_iter,
@@ -677,34 +582,7 @@ const struct file_operations v9fs_file_operations_dotl = {
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
.flock = v9fs_file_flock_dotl,
- .mmap = generic_file_readonly_mmap,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fsync = v9fs_file_fsync_dotl,
-};
-
-const struct file_operations v9fs_mmap_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = v9fs_mmap_file_read_iter,
- .write_iter = v9fs_mmap_file_write_iter,
- .open = v9fs_file_open,
- .release = v9fs_dir_release,
- .lock = v9fs_file_lock,
- .mmap = v9fs_mmap_file_mmap,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .fsync = v9fs_file_fsync,
-};
-
-const struct file_operations v9fs_mmap_file_operations_dotl = {
- .llseek = generic_file_llseek,
- .read_iter = v9fs_mmap_file_read_iter,
- .write_iter = v9fs_mmap_file_write_iter,
- .open = v9fs_file_open,
- .release = v9fs_dir_release,
- .lock = v9fs_file_lock_dotl,
- .flock = v9fs_file_flock_dotl,
- .mmap = v9fs_mmap_file_mmap,
+ .mmap = v9fs_file_mmap,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 502ac74e4959..36b466e35887 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -229,7 +229,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
- v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
return &v9inode->netfs.inode;
@@ -286,24 +285,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFREG:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- if (v9ses->cache == CACHE_LOOSE ||
- v9ses->cache == CACHE_FSCACHE)
- inode->i_fop =
- &v9fs_cached_file_operations_dotl;
- else if (v9ses->cache == CACHE_MMAP)
- inode->i_fop = &v9fs_mmap_file_operations_dotl;
- else
- inode->i_fop = &v9fs_file_operations_dotl;
+ inode->i_fop = &v9fs_file_operations_dotl;
} else {
inode->i_op = &v9fs_file_inode_operations;
- if (v9ses->cache == CACHE_LOOSE ||
- v9ses->cache == CACHE_FSCACHE)
- inode->i_fop =
- &v9fs_cached_file_operations;
- else if (v9ses->cache == CACHE_MMAP)
- inode->i_fop = &v9fs_mmap_file_operations;
- else
- inode->i_fop = &v9fs_file_operations;
+ inode->i_fop = &v9fs_file_operations;
}
break;
@@ -385,20 +370,23 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
*/
void v9fs_evict_inode(struct inode *inode)
{
- struct v9fs_inode *v9inode = V9FS_I(inode);
- __le32 version;
+ struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
+ __le32 __maybe_unused version;
truncate_inode_pages_final(&inode->i_data);
+
+#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
&version);
+#endif
+
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
+#ifdef CONFIG_9P_FSCACHE
fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
- /* clunk the fid stashed in writeback_fid */
- p9_fid_put(v9inode->writeback_fid);
- v9inode->writeback_fid = NULL;
+#endif
}
static int v9fs_test_inode(struct inode *inode, void *data)
@@ -778,7 +766,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
inode = NULL;
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
- else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -807,11 +795,12 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
{
int err;
u32 perm;
- struct v9fs_inode *v9inode;
+ struct v9fs_inode __maybe_unused *v9inode;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *inode_fid;
+ struct p9_fid *fid;
struct dentry *res = NULL;
struct inode *inode;
+ int p9_omode;
if (d_in_lookup(dentry)) {
res = v9fs_vfs_lookup(dir, dentry, 0);
@@ -830,9 +819,14 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
- fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags,
- v9fs_proto_dotu(v9ses)));
+ p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses));
+
+ if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
+ p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+ p9_debug(P9_DEBUG_CACHE,
+ "write-only file with writeback enabled, creating w/ O_RDWR\n");
+ }
+ fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
goto error;
@@ -841,33 +835,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_invalidate_inode_attr(dir);
inode = d_inode(dentry);
v9inode = V9FS_I(inode);
- mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache) && !v9inode->writeback_fid &&
- ((flags & O_ACCMODE) != O_RDONLY)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during open time instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- inode_fid = v9fs_writeback_fid(dentry);
- if (IS_ERR(inode_fid)) {
- err = PTR_ERR(inode_fid);
- mutex_unlock(&v9inode->v_mutex);
- goto error;
- }
- v9inode->writeback_fid = (void *) inode_fid;
- }
- mutex_unlock(&v9inode->v_mutex);
err = finish_open(file, dentry, generic_file_open);
if (err)
goto error;
file->private_data = fid;
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache & CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
+#endif
+
+ v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags);
v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
@@ -1029,15 +1008,24 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_wstat *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+ generic_fillattr(&nop_mnt_idmap, inode, stat);
return 0;
+ } else if (v9ses->cache & CACHE_WRITEBACK) {
+ if (S_ISREG(inode->i_mode)) {
+ int retval = filemap_fdatawrite(inode->i_mapping);
+
+ if (retval)
+ p9_debug(P9_DEBUG_ERROR,
+ "flushing writeback during getattr returned %d\n", retval);
+ }
}
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -1069,7 +1057,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
{
int retval, use_dentry = 0;
struct inode *inode = d_inode(dentry);
- struct v9fs_inode *v9inode = V9FS_I(inode);
struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL;
struct p9_wstat wstat;
@@ -1114,8 +1101,12 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
}
/* Write all dirty data */
- if (d_is_reg(dentry))
- filemap_write_and_wait(inode->i_mapping);
+ if (d_is_reg(dentry)) {
+ retval = filemap_fdatawrite(inode->i_mapping);
+ if (retval)
+ p9_debug(P9_DEBUG_ERROR,
+ "flushing writeback during setattr returned %d\n", retval);
+ }
retval = p9_client_wstat(fid, &wstat);
@@ -1126,9 +1117,17 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
+ iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+ truncate_pagecache(inode, iattr->ia_size);
+
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache & CACHE_FSCACHE) {
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+
+ fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size);
+ }
+#endif
}
v9fs_invalidate_inode_attr(inode);
@@ -1412,7 +1411,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ flags = (v9ses->cache & CACHE_LOOSE) ?
V9FS_STAT2INODE_KEEP_ISIZE : 0;
v9fs_stat2inode(st, inode, inode->i_sb, flags);
out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a7da49906d99..5361cd2d7996 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -231,12 +231,12 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
int err = 0;
kgid_t gid;
umode_t mode;
+ int p9_omode = v9fs_open_to_dotl_flags(flags);
const unsigned char *name = NULL;
struct p9_qid qid;
struct inode *inode;
struct p9_fid *fid = NULL;
- struct v9fs_inode *v9inode;
- struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL;
+ struct p9_fid *dfid = NULL, *ofid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
struct dentry *res = NULL;
@@ -281,14 +281,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
/* Update mode based on ACL value */
err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
if (err) {
- p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
+ p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n",
err);
goto out;
}
- err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
- mode, gid, &qid);
+
+ if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) {
+ p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR;
+ p9_debug(P9_DEBUG_CACHE,
+ "write-only file with writeback enabled, creating w/ O_RDWR\n");
+ }
+ err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid);
if (err < 0) {
- p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
+ p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n",
err);
goto out;
}
@@ -313,36 +318,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- v9inode = V9FS_I(inode);
- mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache) && !v9inode->writeback_fid &&
- ((flags & O_ACCMODE) != O_RDONLY)) {
- /*
- * clone a fid and add it to writeback_fid
- * we do it during open time instead of
- * page dirty time via write_begin/page_mkwrite
- * because we want write after unlink usecase
- * to work.
- */
- inode_fid = v9fs_writeback_fid(dentry);
- if (IS_ERR(inode_fid)) {
- err = PTR_ERR(inode_fid);
- mutex_unlock(&v9inode->v_mutex);
- goto out;
- }
- v9inode->writeback_fid = (void *) inode_fid;
- }
- mutex_unlock(&v9inode->v_mutex);
/* Since we are opening a file, assign the open fid to the file */
err = finish_open(file, dentry, generic_file_open);
if (err)
goto out;
file->private_data = ofid;
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->cache == CACHE_FSCACHE)
+ if (v9ses->cache & CACHE_FSCACHE) {
+ struct v9fs_inode *v9inode = V9FS_I(inode);
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
+ }
#endif
+ v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags);
v9fs_open_fid_add(inode, &ofid);
file->f_mode |= FMODE_CREATED;
out:
@@ -414,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -457,13 +445,22 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
struct dentry *dentry = path->dentry;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
+ struct inode *inode = d_inode(dentry);
struct p9_stat_dotl *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+ generic_fillattr(&nop_mnt_idmap, inode, stat);
return 0;
+ } else if (v9ses->cache) {
+ if (S_ISREG(inode->i_mode)) {
+ int retval = filemap_fdatawrite(inode->i_mapping);
+
+ if (retval)
+ p9_debug(P9_DEBUG_ERROR,
+ "flushing writeback during getattr returned %d\n", retval);
+ }
}
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -539,12 +536,13 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *iattr)
{
int retval, use_dentry = 0;
+ struct inode *inode = d_inode(dentry);
+ struct v9fs_session_info __maybe_unused *v9ses;
struct p9_fid *fid = NULL;
struct p9_iattr_dotl p9attr = {
.uid = INVALID_UID,
.gid = INVALID_GID,
};
- struct inode *inode = d_inode(dentry);
p9_debug(P9_DEBUG_VFS, "\n");
@@ -552,6 +550,8 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if (retval)
return retval;
+ v9ses = v9fs_dentry2v9ses(dentry);
+
p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
if (iattr->ia_valid & ATTR_MODE)
p9attr.mode = iattr->ia_mode;
@@ -582,8 +582,12 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
return PTR_ERR(fid);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode))
- filemap_write_and_wait(inode->i_mapping);
+ if (S_ISREG(inode->i_mode)) {
+ retval = filemap_fdatawrite(inode->i_mapping);
+ if (retval < 0)
+ p9_debug(P9_DEBUG_ERROR,
+ "Flushing file prior to setattr failed: %d\n", retval);
+ }
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0) {
@@ -592,9 +596,17 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
return retval;
}
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode))
+ if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
+ i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
+ truncate_pagecache(inode, iattr->ia_size);
+
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache & CACHE_FSCACHE)
+ fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)),
+ iattr->ia_size);
+#endif
+ }
v9fs_invalidate_inode_attr(inode);
setattr_copy(&nop_mnt_idmap, inode, iattr);
@@ -721,7 +733,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
/* Now walk from the parent so we can get an unopened fid. */
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
@@ -798,7 +810,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
/* Get the latest stat info from server. */
struct p9_fid *fid;
@@ -875,7 +887,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
}
/* instantiate inode and assign the unopened fid to the dentry */
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -960,7 +972,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
* We don't want to refresh inode->i_size,
* because we may have cached data
*/
- flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+ flags = (v9ses->cache & CACHE_LOOSE) ?
V9FS_STAT2INODE_KEEP_ISIZE : 0;
v9fs_stat2inode_dotl(st, inode, flags);
out:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 10449994a972..73db55c050bf 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -63,7 +63,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_magic = V9FS_MAGIC;
if (v9fs_proto_dotl(v9ses)) {
sb->s_op = &v9fs_super_ops_dotl;
- sb->s_xattr = v9fs_xattr_handlers;
+ if (!(v9ses->flags & V9FS_NO_XATTR))
+ sb->s_xattr = v9fs_xattr_handlers;
} else {
sb->s_op = &v9fs_super_ops;
sb->s_time_max = U32_MAX;
@@ -83,9 +84,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT;
}
- sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
- if (!v9ses->cache)
- sb->s_flags |= SB_SYNCHRONOUS;
+ sb->s_flags |= SB_ACTIVE;
#ifdef CONFIG_9P_FS_POSIX_ACL
if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
@@ -136,7 +135,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
if (retval)
goto release_sb;
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
sb->s_d_op = &v9fs_cached_dentry_operations;
else
sb->s_d_op = &v9fs_dentry_operations;
@@ -277,7 +276,7 @@ static int v9fs_drop_inode(struct inode *inode)
struct v9fs_session_info *v9ses;
v9ses = v9fs_inode2v9ses(inode);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
@@ -290,49 +289,30 @@ static int v9fs_drop_inode(struct inode *inode)
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
- int ret;
- struct p9_wstat wstat;
struct v9fs_inode *v9inode;
+
/*
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
- v9inode = V9FS_I(inode);
- if (!v9inode->writeback_fid)
- return 0;
- v9fs_blank_wstat(&wstat);
- ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
- if (ret < 0) {
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- return ret;
- }
+ v9inode = V9FS_I(inode);
fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
+
return 0;
}
static int v9fs_write_inode_dotl(struct inode *inode,
struct writeback_control *wbc)
{
- int ret;
struct v9fs_inode *v9inode;
- /*
- * send an fsync request to server irrespective of
- * wbc->sync_mode.
- */
+
v9inode = V9FS_I(inode);
- p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
- __func__, inode, v9inode->writeback_fid);
- if (!v9inode->writeback_fid)
- return 0;
-
- ret = p9_client_fsync(v9inode->writeback_fid, 0);
- if (ret < 0) {
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- return ret;
- }
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
+
fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
+
return 0;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index e99830c65033..cc07a0cd3172 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -250,16 +250,9 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-#
-# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
-#
-config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
- bool
-
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ depends on ARCH_WANT_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/fs/Makefile b/fs/Makefile
index 05f89b5c962f..834f1c3dba46 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -6,7 +6,6 @@
# Rewritten to use lists instead of if-statements.
#
-obj-$(CONFIG_SYSCTL) += sysctls.o
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -50,7 +49,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
obj-$(CONFIG_COREDUMP) += coredump.o
-obj-$(CONFIG_SYSCTL) += drop_caches.o
+obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o
obj-$(CONFIG_FHANDLE) += fhandle.o
obj-y += iomap/
@@ -124,7 +123,7 @@ obj-$(CONFIG_9P_FS) += 9p/
obj-$(CONFIG_AFS_FS) += afs/
obj-$(CONFIG_NILFS2_FS) += nilfs2/
obj-$(CONFIG_BEFS_FS) += befs/
-obj-$(CONFIG_HOSTFS) += hostfs/
+obj-y += hostfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_TRACING) += tracefs/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 82690d1dd49a..4dd97afa536c 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -275,6 +275,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
loff_t i_size;
int nr_pages, i;
int ret;
+ loff_t remote_size = 0;
_enter("");
@@ -289,6 +290,8 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
expand:
i_size = i_size_read(&dvnode->netfs.inode);
+ if (i_size < remote_size)
+ i_size = remote_size;
if (i_size < 2048) {
ret = afs_bad(dvnode, afs_file_error_dir_small);
goto error;
@@ -319,16 +322,16 @@ expand:
struct folio *folio;
folio = filemap_get_folio(mapping, i);
- if (!folio) {
+ if (IS_ERR(folio)) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_stat_v(dvnode, n_inval);
-
- ret = -ENOMEM;
folio = __filemap_get_folio(mapping,
i, FGP_LOCK | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto error;
+ }
folio_attach_private(folio, (void *)1);
folio_unlock(folio);
}
@@ -364,6 +367,7 @@ expand:
* buffer.
*/
up_write(&dvnode->validate_lock);
+ remote_size = req->file_size;
goto expand;
}
@@ -524,7 +528,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
*/
folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE,
FGP_ACCESSED, 0);
- if (!folio) {
+ if (IS_ERR(folio)) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index 0ab7752d1b75..f0eddccbdd95 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -115,7 +115,7 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping->gfp_mask);
- if (!folio)
+ if (IS_ERR(folio))
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
else if (folio && !folio_test_private(folio))
folio_attach_private(folio, (void *)1);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 68d6d5dc608d..719b31374879 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma)
static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- struct afs_file *af = vmf->vma->vm_file->private_data;
- switch (afs_validate(vnode, af->key)) {
- case 0:
+ if (afs_pagecache_valid(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
- case -ENOMEM:
- return VM_FAULT_OOM;
- case -EINTR:
- case -ERESTARTSYS:
- return VM_FAULT_RETRY;
- case -ESTALE:
- default:
- return VM_FAULT_SIGBUS;
- }
+ return 0;
}
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0167e96e5198..866bab860a88 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -230,6 +230,7 @@ static void afs_apply_status(struct afs_operation *op,
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
change_size = true;
+ data_changed = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -449,7 +450,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
0 : FSCACHE_ADV_SINGLE_CHUNK,
&key, sizeof(key),
&aux, sizeof(aux),
- vnode->status.size));
+ i_size_read(&vnode->netfs.inode)));
#endif
}
@@ -668,6 +669,24 @@ bool afs_check_validity(struct afs_vnode *vnode)
}
/*
+ * Returns true if the pagecache is still valid. Does not sleep.
+ */
+bool afs_pagecache_valid(struct afs_vnode *vnode)
+{
+ if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
+ return true;
+ }
+
+ if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
+ afs_check_validity(vnode))
+ return true;
+
+ return false;
+}
+
+/*
* validate a vnode/inode
* - there are several things we need to check
* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -684,14 +703,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
vnode->fid.vid, vnode->fid.vnode, vnode->flags,
key_serial(key));
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- goto valid;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
+ if (afs_pagecache_valid(vnode))
goto valid;
down_write(&vnode->validate_lock);
@@ -765,6 +777,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
stat->nlink -= 1;
+
+ /* Lie about the size of directories. We maintain a locally
+ * edited copy and may make different allocation decisions on
+ * it, but we need to give userspace the server's size.
+ */
+ if (S_ISDIR(inode->i_mode))
+ stat->size = vnode->netfs.remote_i_size;
} while (need_seqretry(&vnode->cb_lock, seq));
done_seqretry(&vnode->cb_lock, seq);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 68ae91d21b57..9d3d64921106 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern bool afs_check_validity(struct afs_vnode *);
extern int afs_validate(struct afs_vnode *, struct key *);
+bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 571f3b9a417e..c822d6006033 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping,
_debug("kill %lx (to %lx)", index, last);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
@@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
_debug("redirty %llx @%llx", len, start);
folio = filemap_get_folio(mapping, index);
- if (!folio) {
+ if (IS_ERR(folio)) {
next = index + 1;
continue;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8a884e795f6a..1033fbdfdbec 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2058,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm)
has_dumped = 1;
- offset += sizeof(elf); /* Elf header */
+ offset += sizeof(elf); /* ELF header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
/* Write notes phdr entry */
@@ -2174,7 +2174,6 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
-MODULE_LICENSE("GPL");
#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
#include "binfmt_elf_test.c"
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a05eafcacfb2..05a1471d5283 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1540,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
- offset = sizeof(*elf); /* Elf header */
+ offset = sizeof(*elf); /* ELF header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
/* Write notes phdr entry */
diff --git a/fs/buffer.c b/fs/buffer.c
index b3eb905f87d6..a7fc561758b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -842,7 +842,7 @@ int remove_inode_buffers(struct inode *inode)
}
/*
- * Create the appropriate buffers when given a page for data area and
+ * Create the appropriate buffers when given a folio for data area and
* the size of each buffer.. Use the bh->b_this_page linked list to
* follow the buffers created. Return NULL if unable to create more
* buffers.
@@ -850,8 +850,8 @@ int remove_inode_buffers(struct inode *inode)
* The retry flag is used to differentiate async IO (paging, swapping)
* which may not fail from ordinary buffer allocations.
*/
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- bool retry)
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+ bool retry)
{
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
@@ -861,12 +861,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;
- /* The page lock pins the memcg */
- memcg = page_memcg(page);
+ /* The folio lock pins the memcg */
+ memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
head = NULL;
- offset = PAGE_SIZE;
+ offset = folio_size(folio);
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(gfp);
if (!bh)
@@ -878,8 +878,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bh->b_size = size;
- /* Link the buffer to its page */
- set_bh_page(bh, page, offset);
+ /* Link the buffer to its folio */
+ folio_set_bh(bh, folio, offset);
}
out:
set_active_memcg(old_memcg);
@@ -898,6 +898,13 @@ no_grow:
goto out;
}
+EXPORT_SYMBOL_GPL(folio_alloc_buffers);
+
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
+ bool retry)
+{
+ return folio_alloc_buffers(page_folio(page), size, retry);
+}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
static inline void
@@ -1484,6 +1491,21 @@ void set_bh_page(struct buffer_head *bh,
}
EXPORT_SYMBOL(set_bh_page);
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+ unsigned long offset)
+{
+ bh->b_folio = folio;
+ BUG_ON(offset >= folio_size(folio));
+ if (folio_test_highmem(folio))
+ /*
+ * This catches illegal uses and preserves the offset:
+ */
+ bh->b_data = (char *)(0 + offset);
+ else
+ bh->b_data = folio_address(folio) + offset;
+}
+EXPORT_SYMBOL(folio_set_bh);
+
/*
* Called when truncating a buffer on a page completely.
*/
@@ -1571,18 +1593,17 @@ out:
}
EXPORT_SYMBOL(block_invalidate_folio);
-
/*
* We attach and possibly dirty the buffers atomically wrt
* block_dirty_folio() via private_lock. try_to_free_buffers
- * is already excluded via the page lock.
+ * is already excluded via the folio lock.
*/
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+ unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
- head = alloc_page_buffers(page, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
@@ -1591,19 +1612,26 @@ void create_empty_buffers(struct page *page,
} while (bh);
tail->b_this_page = head;
- spin_lock(&page->mapping->private_lock);
- if (PageUptodate(page) || PageDirty(page)) {
+ spin_lock(&folio->mapping->private_lock);
+ if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
bh = head;
do {
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
set_buffer_dirty(bh);
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
bh = bh->b_this_page;
} while (bh != head);
}
- attach_page_private(page, head);
- spin_unlock(&page->mapping->private_lock);
+ folio_attach_private(folio, head);
+ spin_unlock(&folio->mapping->private_lock);
+}
+EXPORT_SYMBOL(folio_create_empty_buffers);
+
+void create_empty_buffers(struct page *page,
+ unsigned long blocksize, unsigned long b_state)
+{
+ folio_create_empty_buffers(page_folio(page), blocksize, b_state);
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -1694,14 +1722,17 @@ static inline int block_size_bits(unsigned int blocksize)
return ilog2(blocksize);
}
-static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+static struct buffer_head *folio_create_buffers(struct folio *folio,
+ struct inode *inode,
+ unsigned int b_state)
{
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return page_buffers(page);
+ if (!folio_buffers(folio))
+ folio_create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits),
+ b_state);
+ return folio_buffers(folio);
}
/*
@@ -1745,8 +1776,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
int nr_underway = 0;
blk_opf_t write_flags = wbc_to_write_flags(wbc);
- head = create_page_buffers(page, inode,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ head = folio_create_buffers(page_folio(page), inode,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
/*
* Be very careful. We have no exclusion from block_dirty_folio
@@ -2009,7 +2040,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
- head = create_page_buffers(&folio->page, inode, 0);
+ head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
@@ -2295,7 +2326,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- head = create_page_buffers(&folio->page, inode, 0);
+ head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
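The buffer-head changes keep alloc_page_buffers() and create_empty_buffers() as thin wrappers over the new folio-based primitives. A hedged sketch of how a caller might use the folio variant, assuming a locked folio whose size is a multiple of the block size; the helper name is illustrative:

#include <linux/buffer_head.h>
#include <linux/pagemap.h>

/* Illustrative: make sure a locked folio has buffer heads attached. */
static struct buffer_head *example_get_folio_buffers(struct folio *folio,
						     unsigned long blocksize)
{
	struct buffer_head *head = folio_buffers(folio);

	if (!head) {
		/* Allocates one buffer_head per block and attaches them. */
		folio_create_empty_buffers(folio, blocksize, 0);
		head = folio_buffers(folio);
	}
	return head;
}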
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
index 58f8aec964e4..18de8a876b02 100644
--- a/fs/cachefiles/error_inject.c
+++ b/fs/cachefiles/error_inject.c
@@ -22,18 +22,9 @@ static struct ctl_table cachefiles_sysctls[] = {
{}
};
-static struct ctl_table cachefiles_sysctls_root[] = {
- {
- .procname = "cachefiles",
- .mode = 0555,
- .child = cachefiles_sysctls,
- },
- {}
-};
-
int __init cachefiles_register_error_injection(void)
{
- cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root);
+ cachefiles_sysctl = register_sysctl("cachefiles", cachefiles_sysctls);
if (!cachefiles_sysctl)
return -ENOMEM;
return 0;
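Both this hunk and the coda one below drop the hand-rolled single-entry parent table and register the directory by path with register_sysctl(). A minimal sketch of that style for a hypothetical module; the table, path and variable names are illustrative:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static unsigned int example_value;

static struct ctl_table example_sysctls[] = {
	{
		.procname	= "value",
		.data		= &example_value,
		.maxlen		= sizeof(example_value),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{}
};

static struct ctl_table_header *example_sysctl_header;

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/example/value without a separate parent table. */
	example_sysctl_header = register_sysctl("example", example_sysctls);
	return example_sysctl_header ? 0 : -ENOMEM;
}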
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d5335f445233..6bb251a4d613 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -808,6 +808,7 @@ static int ceph_writepages_start(struct address_space *mapping,
bool should_loop, range_whole = false;
bool done = false;
bool caching = ceph_is_cache_enabled(inode);
+ xa_mark_t tag;
if (wbc->sync_mode == WB_SYNC_NONE &&
fsc->write_congested)
@@ -834,6 +835,11 @@ static int ceph_writepages_start(struct address_space *mapping,
start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
index = start_index;
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+ tag = PAGECACHE_TAG_TOWRITE;
+ } else {
+ tag = PAGECACHE_TAG_DIRTY;
+ }
retry:
/* find oldest snap context with dirty data */
snapc = get_oldest_context(inode, &ceph_wbc, NULL);
@@ -872,6 +878,9 @@ retry:
dout(" non-head snapc, range whole\n");
}
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+
ceph_put_snap_context(last_snapc);
last_snapc = snapc;
@@ -888,7 +897,7 @@ retry:
get_more_pages:
nr_folios = filemap_get_folios_tag(mapping, &index,
- end, PAGECACHE_TAG_DIRTY, &fbatch);
+ end, tag, &fbatch);
dout("pagevec_lookup_range_tag got %d\n", nr_folios);
if (!nr_folios && !locked_pages)
break;
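The ceph_writepages_start() change adopts the convention used by write_cache_pages(): for integrity syncs (WB_SYNC_ALL or tagged_writepages) the dirty folios in the range are first re-tagged TOWRITE and the walk then looks up that tag, so pages dirtied while the walk runs cannot livelock it. A condensed sketch of the convention, assuming mapping, wbc and the index range are already set up; the function name and the elided write-out body are illustrative:

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/writeback.h>

static void example_writeback_walk(struct address_space *mapping,
				   struct writeback_control *wbc,
				   pgoff_t index, pgoff_t end)
{
	struct folio_batch fbatch;
	xa_mark_t tag;

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
		tag = PAGECACHE_TAG_TOWRITE;
		tag_pages_for_writeback(mapping, index, end);
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}

	folio_batch_init(&fbatch);
	while (filemap_get_folios_tag(mapping, &index, end, tag, &fbatch)) {
		/* ... lock and write out each folio in fbatch ... */
		folio_batch_release(&fbatch);
		cond_resched();
	}
}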
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7cc20772eac9..789be30d6ee2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -431,7 +431,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
*
* Called with i_ceph_lock held.
*/
-static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
struct ceph_cap *cap;
struct rb_node *n = ci->i_caps.rb_node;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index bec3c4549c07..3904333fa6c3 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -248,14 +248,20 @@ static int metrics_caps_show(struct seq_file *s, void *p)
return 0;
}
-static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+static int caps_show_cb(struct inode *inode, int mds, void *p)
{
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct seq_file *s = p;
-
- seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
- cap->session->s_mds,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->implemented));
+ struct ceph_cap *cap;
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap)
+ seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+ cap->session->s_mds,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented));
+ spin_unlock(&ci->i_ceph_lock);
return 0;
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0ced8b570e42..cb67ac821f0e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1050,6 +1050,9 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct ceph_mds_request *req;
int err;
+ if (dentry->d_flags & DCACHE_DISCONNECTED)
+ return -EINVAL;
+
err = ceph_wait_on_conflict_unlink(dentry);
if (err)
return err;
@@ -1057,8 +1060,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("link in dir %p old_dentry %p dentry %p\n", dir,
- old_dentry, dentry);
+ dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
+ dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
d_drop(dentry);
@@ -1067,6 +1070,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
+ /*
+	 * The old_dentry may be a DCACHE_DISCONNECTED dentry, in which
+	 * case we just pass the ino# to the MDSs.
+ */
+ if (old_dentry->d_flags & DCACHE_DISCONNECTED)
+ req->r_ino2 = ceph_vino(d_inode(old_dentry));
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 27a245d959c0..29cf00220b09 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1632,8 +1632,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
* Caller must hold session s_mutex.
*/
int ceph_iterate_session_caps(struct ceph_mds_session *session,
- int (*cb)(struct inode *, struct ceph_cap *,
- void *), void *arg)
+ int (*cb)(struct inode *, int mds, void *),
+ void *arg)
{
struct list_head *p;
struct ceph_cap *cap;
@@ -1645,6 +1645,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
p = session->s_caps.next;
while (p != &session->s_caps) {
+ int mds;
+
cap = list_entry(p, struct ceph_cap, session_caps);
inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
@@ -1652,6 +1654,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
continue;
}
session->s_cap_iterator = cap;
+ mds = cap->mds;
spin_unlock(&session->s_cap_lock);
if (last_inode) {
@@ -1663,7 +1666,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
old_cap = NULL;
}
- ret = cb(inode, cap, arg);
+ ret = cb(inode, mds, arg);
last_inode = inode;
spin_lock(&session->s_cap_lock);
@@ -1696,20 +1699,25 @@ out:
return ret;
}
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
bool invalidate = false;
- int iputs;
+ struct ceph_cap *cap;
+ int iputs = 0;
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
- iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap) {
+ dout(" removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
+
+ iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
+ }
spin_unlock(&ci->i_ceph_lock);
- wake_up_all(&ci->i_cap_wq);
+ if (cap)
+ wake_up_all(&ci->i_cap_wq);
if (invalidate)
ceph_queue_invalidate(inode);
while (iputs--)
@@ -1780,8 +1788,7 @@ enum {
*
* caller must hold s_mutex.
*/
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned long ev = (unsigned long)arg;
@@ -1792,12 +1799,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
} else if (ev == RENEWCAPS) {
- if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) {
- /* mds did not re-issue stale cap */
- spin_lock(&ci->i_ceph_lock);
+ struct ceph_cap *cap;
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ /* mds did not re-issue stale cap */
+ if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
cap->issued = cap->implemented = CEPH_CAP_PIN;
- spin_unlock(&ci->i_ceph_lock);
- }
+ spin_unlock(&ci->i_ceph_lock);
} else if (ev == FORCE_RO) {
}
wake_up_all(&ci->i_cap_wq);
@@ -1959,16 +1968,22 @@ out:
* Yes, this is a bit sloppy. Our only real goal here is to respond to
* memory pressure from the MDS, though, so it needn't be perfect.
*/
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+static int trim_caps_cb(struct inode *inode, int mds, void *arg)
{
int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
int used, wanted, oissued, mine;
+ struct ceph_cap *cap;
if (*remaining <= 0)
return -1;
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ return 0;
+ }
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
wanted = __ceph_caps_file_wanted(ci);
@@ -2555,6 +2570,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u64 ino1 = 0, ino2 = 0;
int pathlen1 = 0, pathlen2 = 0;
bool freepath1 = false, freepath2 = false;
+ struct dentry *old_dentry = NULL;
int len;
u16 releases;
void *p, *end;
@@ -2572,7 +2588,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
}
/* If r_old_dentry is set, then assume that its parent is locked */
- ret = set_request_path_attr(NULL, req->r_old_dentry,
+ if (req->r_old_dentry &&
+ !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
+ old_dentry = req->r_old_dentry;
+ ret = set_request_path_attr(NULL, old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2, true);
@@ -3911,26 +3930,22 @@ out_unlock:
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
{
union {
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- struct ceph_inode_info *ci = cap->ci;
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
+ struct ceph_cap *cap;
char *path;
- int pathlen = 0, err;
+ int pathlen = 0, err = 0;
u64 pathbase;
u64 snap_follows;
- dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
- inode, ceph_vinop(inode), cap, cap->cap_id,
- ceph_cap_string(cap->issued));
-
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
@@ -3947,6 +3962,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ goto out_err;
+ }
+ dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+ inode, ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
+
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
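The net effect of the mds_client.c changes is that ceph_iterate_session_caps() callbacks no longer receive a ceph_cap pointer that might be freed underneath them; they get the mds id and re-look the cap up under i_ceph_lock. A sketch of the shape such a callback now takes, assuming the fs/ceph internal headers; the callback name and body are illustrative:

/* Illustrative callback in the new (inode, mds, arg) form. */
static int example_caps_cb(struct inode *inode, int mds, void *arg)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;

	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (cap) {
		/* cap is only valid while i_ceph_lock is held */
		/* ... inspect or update cap->issued / cap->implemented ... */
	}
	spin_unlock(&ci->i_ceph_lock);
	return 0;
}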
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0598faa50e2e..724307ff89cd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -355,8 +355,8 @@ struct ceph_snapid_map {
struct rb_node node;
struct list_head lru;
atomic_t ref;
- u64 snap;
dev_t dev;
+ u64 snap;
unsigned long last_used;
};
@@ -541,8 +541,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
- int (*cb)(struct inode *,
- struct ceph_cap *, void *),
+ int (*cb)(struct inode *, int mds, void *),
void *arg);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6ecca2c6d137..d24bf0db5234 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1192,6 +1192,8 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
struct ceph_inode_info *ci);
+extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci,
+ int mds);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1fe1b62abebd..806183959c47 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -535,6 +535,8 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
return NULL;
}
+#define MAX_XATTR_VAL_PRINT_LEN 256
+
static int __set_xattr(struct ceph_inode_info *ci,
const char *name, int name_len,
const char *val, int val_len,
@@ -597,7 +599,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
- dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+ dout("%s count=%d\n", __func__, ci->i_xattrs.count);
} else {
kfree(*newxattr);
*newxattr = NULL;
@@ -625,11 +627,13 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (new) {
rb_link_node(&xattr->node, parent, p);
rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("__set_xattr_val p=%p\n", p);
+ dout("%s p=%p\n", __func__, p);
}
- dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
+ dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__,
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name,
+ min(val_len, MAX_XATTR_VAL_PRINT_LEN), val,
+ val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
return 0;
}
@@ -655,13 +659,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
else if (c > 0)
p = &(*p)->rb_right;
else {
- dout("__get_xattr %s: found %.*s\n", name,
- xattr->val_len, xattr->val);
+ int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
+
+ dout("%s %s: found %.*s%s\n", __func__, name, len,
+ xattr->val, xattr->val_len > len ? "..." : "");
return xattr;
}
}
- dout("__get_xattr %s: not found\n", name);
+ dout("%s %s: not found\n", __func__, name);
return NULL;
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 415176b2cf32..74cd6fafb33e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -162,6 +162,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 41
-#define CIFS_VERSION "2.42"
+#define SMB3_PRODUCT_BUILD 43
+#define CIFS_VERSION "2.43"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1cbb90587995..7bfef741f758 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1916,18 +1916,22 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_GOOD)
ses->ses_status = SES_EXITING;
- cifs_free_ipc(ses);
-
if (ses->ses_status == SES_EXITING && server->ops->logoff) {
+ spin_unlock(&ses->ses_lock);
+ cifs_free_ipc(ses);
xid = get_xid();
rc = server->ops->logoff(xid, ses);
if (rc)
cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
__func__, rc);
_free_xid(xid);
+ } else {
+ spin_unlock(&ses->ses_lock);
+ cifs_free_ipc(ses);
}
spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b33d2e7b0f98..c5fcefdfd797 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4882,6 +4882,8 @@ void cifs_oplock_break(struct work_struct *work)
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
bool purge_cache = false;
+ struct cifs_deferred_close *dclose;
+ bool is_deferred = false;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -4918,6 +4920,20 @@ void cifs_oplock_break(struct work_struct *work)
oplock_break_ack:
/*
+	 * When an oplock break is received and there are no active
+	 * file handles, only cached ones, schedule the deferred close
+	 * immediately so that a new open will not use the cached handle.
+ */
+ spin_lock(&CIFS_I(inode)->deferred_lock);
+ is_deferred = cifs_is_deferred_close(cfile, &dclose);
+ spin_unlock(&CIFS_I(inode)->deferred_lock);
+
+ if (!CIFS_CACHE_HANDLE(cinode) && is_deferred &&
+ cfile->deferred_close_scheduled && delayed_work_pending(&cfile->deferred)) {
+ cifs_close_deferred_file(cinode);
+ }
+
+ /*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
* not bother sending an oplock release if session to server still is
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 7f085ed2d866..cd914be905b2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -749,7 +749,9 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&cifs_inode->deferred_lock);
cifs_del_deferred_close(cfile);
+ spin_unlock(&cifs_inode->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
@@ -762,7 +764,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
spin_unlock(&cifs_inode->open_file_lock);
list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) {
- _cifsFileInfo_put(tmp_list->cfile, true, false);
+ _cifsFileInfo_put(tmp_list->cfile, false, false);
list_del(&tmp_list->list);
kfree(tmp_list);
}
@@ -780,7 +782,9 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
@@ -815,7 +819,9 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
if (strstr(full_path, path)) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
cifs_del_deferred_close(cfile);
+ spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 366f0c3b799b..0521aa1da644 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -821,7 +821,6 @@ create_posix_buf(umode_t mode)
static int
add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
iov[num].iov_base = create_posix_buf(mode);
@@ -830,11 +829,6 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) +
- iov[num - 1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix));
*num_iovec = num + 1;
return 0;
}
@@ -2069,7 +2063,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
static void
parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf)
{
- struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc;
+ struct create_disk_id_rsp *pdisk_id = (struct create_disk_id_rsp *)cc;
cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n",
pdisk_id->DiskFileId, pdisk_id->VolumeId);
@@ -2172,10 +2166,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
}
static int
-add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
+add_lease_context(struct TCP_Server_Info *server,
+ struct smb2_create_req *req,
+ struct kvec *iov,
unsigned int *num_iovec, u8 *lease_key, __u8 *oplock)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock);
@@ -2183,12 +2178,6 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
return -ENOMEM;
iov[num].iov_len = server->vals->create_lease_size;
req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) +
- iov[num - 1].iov_len);
- le32_add_cpu(&req->CreateContextsLength,
- server->vals->create_lease_size);
*num_iovec = num + 1;
return 0;
}
@@ -2267,18 +2256,12 @@ static int
add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
iov[num].iov_base = create_durable_v2_buf(oparms);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_durable_v2);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) +
- iov[1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
*num_iovec = num + 1;
return 0;
}
@@ -2287,7 +2270,6 @@ static int
add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
/* indicate that we don't need to relock the file */
@@ -2297,12 +2279,6 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) +
- iov[1].iov_len);
- le32_add_cpu(&req->CreateContextsLength,
- sizeof(struct create_durable_handle_reconnect_v2));
*num_iovec = num + 1;
return 0;
}
@@ -2311,7 +2287,6 @@ static int
add_durable_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms, bool use_persistent)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
if (use_persistent) {
@@ -2331,11 +2306,6 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_durable);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset =
- cpu_to_le32(sizeof(struct smb2_create_req) +
- iov[1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
*num_iovec = num + 1;
return 0;
}
@@ -2369,18 +2339,12 @@ create_twarp_buf(__u64 timewarp)
static int
add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
iov[num].iov_base = create_twarp_buf(timewarp);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct crt_twarp_ctxt);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) +
- iov[num - 1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_twarp_ctxt));
*num_iovec = num + 1;
return 0;
}
@@ -2503,7 +2467,6 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
static int
add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set_owner)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
unsigned int len = 0;
@@ -2511,11 +2474,6 @@ add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = len;
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) +
- iov[num - 1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, len);
*num_iovec = num + 1;
return 0;
}
@@ -2546,18 +2504,12 @@ create_query_id_buf(void)
static int
add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
{
- struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
iov[num].iov_base = create_query_id_buf();
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct crt_query_id_ctxt);
- if (!req->CreateContextsOffset)
- req->CreateContextsOffset = cpu_to_le32(
- sizeof(struct smb2_create_req) +
- iov[num - 1].iov_len);
- le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt));
*num_iovec = num + 1;
return 0;
}
@@ -2720,6 +2672,9 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
rc = add_posix_context(iov, &n_iov, mode);
if (rc)
goto err_free_req;
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) +
+ iov[1].iov_len);
pc_buf = iov[n_iov-1].iov_base;
}
@@ -2857,21 +2812,13 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
(oparms->create_options & CREATE_NOT_FILE))
req->RequestedOplockLevel = *oplock; /* no srv lease support */
else {
- rc = add_lease_context(server, iov, &n_iov,
+ rc = add_lease_context(server, req, iov, &n_iov,
oparms->fid->lease_key, oplock);
if (rc)
return rc;
}
if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
- /* need to set Next field of lease context if we request it */
- if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next =
- cpu_to_le32(server->vals->create_lease_size);
- }
-
rc = add_durable_context(iov, &n_iov, oparms,
tcon->use_persistent);
if (rc)
@@ -2879,13 +2826,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
}
if (tcon->posix_extensions) {
- if (n_iov > 2) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next =
- cpu_to_le32(iov[n_iov-1].iov_len);
- }
-
rc = add_posix_context(iov, &n_iov, oparms->mode);
if (rc)
return rc;
@@ -2893,13 +2833,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
if (tcon->snapshot_time) {
cifs_dbg(FYI, "adding snapshot context\n");
- if (n_iov > 2) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next =
- cpu_to_le32(iov[n_iov-1].iov_len);
- }
-
rc = add_twarp_context(iov, &n_iov, tcon->snapshot_time);
if (rc)
return rc;
@@ -2923,12 +2856,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
set_owner = false;
if (set_owner | set_mode) {
- if (n_iov > 2) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
- }
-
cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode);
rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner);
if (rc)
@@ -2936,12 +2863,30 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
}
}
+ add_query_id_context(iov, &n_iov);
+
if (n_iov > 2) {
- struct create_context *ccontext =
- (struct create_context *)iov[n_iov-1].iov_base;
- ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len);
+ /*
+ * We have create contexts behind iov[1] (the file
+ * name), point at them from the main create request
+ */
+ req->CreateContextsOffset = cpu_to_le32(
+ sizeof(struct smb2_create_req) +
+ iov[1].iov_len);
+ req->CreateContextsLength = 0;
+
+ for (unsigned int i = 2; i < (n_iov-1); i++) {
+ struct kvec *v = &iov[i];
+ size_t len = v->iov_len;
+ struct create_context *cctx =
+ (struct create_context *)v->iov_base;
+
+ cctx->Next = cpu_to_le32(len);
+ le32_add_cpu(&req->CreateContextsLength, len);
+ }
+ le32_add_cpu(&req->CreateContextsLength,
+ iov[n_iov-1].iov_len);
}
- add_query_id_context(iov, &n_iov);
rqst->rq_nvec = n_iov;
return 0;
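All of the per-context CreateContextsOffset/CreateContextsLength updates removed above are replaced by one pass at the end of SMB2_open_init(): the contexts sit back to back after iov[1] (the file name), each non-final context's Next field is its own length, and the total length is the sum of the context iovecs. A rough sketch of that accounting under those assumptions; the helper name is illustrative and it folds the last context into the same loop:

/* Illustrative: chain the create contexts held in iov[2..n_iov-1]. */
static void example_chain_contexts(struct smb2_create_req *req,
				   struct kvec *iov, unsigned int n_iov)
{
	unsigned int i;

	req->CreateContextsOffset = cpu_to_le32(sizeof(struct smb2_create_req) +
						iov[1].iov_len);
	req->CreateContextsLength = 0;

	for (i = 2; i < n_iov; i++) {
		struct create_context *cctx = iov[i].iov_base;

		/* Next points at the following context; 0 marks the last one. */
		cctx->Next = (i < n_iov - 1) ? cpu_to_le32(iov[i].iov_len) : 0;
		le32_add_cpu(&req->CreateContextsLength, iov[i].iov_len);
	}
}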
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 2114e8a0c63a..220994d0a0f7 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -132,17 +132,6 @@ struct share_redirect_error_context_rsp {
#define SMB2_LEASE_HANDLE_CACHING_HE 0x02
#define SMB2_LEASE_WRITE_CACHING_HE 0x04
-struct create_durable {
- struct create_context ccontext;
- __u8 Name[8];
- union {
- __u8 Reserved[16];
- struct {
- __u64 PersistentFileId;
- __u64 VolatileFileId;
- } Fid;
- } Data;
-} __packed;
/* See MS-SMB2 2.2.13.2.11 */
/* Flags */
@@ -170,15 +159,6 @@ struct durable_reconnect_context_v2 {
__le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
} __packed;
-/* See MS-SMB2 2.2.14.2.9 */
-struct create_on_disk_id {
- struct create_context ccontext;
- __u8 Name[8];
- __le64 DiskFileId;
- __le64 VolumeId;
- __u32 Reserved[4];
-} __packed;
-
/* See MS-SMB2 2.2.14.2.12 */
struct durable_reconnect_context_v2_rsp {
__le32 Timeout;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index b39580ad4ce5..3c3148588491 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -361,7 +361,7 @@ static int __init init_coda_psdev(void)
__func__, CODA_PSDEV_MAJOR);
return -EIO;
}
- coda_psdev_class = class_create(THIS_MODULE, "coda");
+ coda_psdev_class = class_create("coda");
if (IS_ERR(coda_psdev_class)) {
err = PTR_ERR(coda_psdev_class);
goto out_chrdev;
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index fda3b702b1c5..a247c14aaab7 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -39,19 +39,10 @@ static struct ctl_table coda_table[] = {
{}
};
-static struct ctl_table fs_table[] = {
- {
- .procname = "coda",
- .mode = 0555,
- .child = coda_table
- },
- {}
-};
-
void coda_sysctl_init(void)
{
if ( !fs_table_header )
- fs_table_header = register_sysctl_table(fs_table);
+ fs_table_header = register_sysctl("coda", coda_table);
}
void coda_sysctl_clean(void)
diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+ iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4f757a71f99b..980483455cc0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,7 +43,7 @@
* LOCKING:
* There are three level of locking required by epoll :
*
- * 1) epmutex (mutex)
+ * 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
* 3) ep->lock (rwlock)
*
@@ -57,14 +57,8 @@
* we need a lock that will allow us to sleep. This lock is a
* mutex (ep->mtx). It is acquired during the event transfer loop,
* during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
- * Then we also need a global mutex to serialize eventpoll_release_file()
- * and ep_free().
- * This mutex is acquired by ep_free() during the epoll file
- * cleanup path and it is also acquired by eventpoll_release_file()
- * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
- * It is also acquired when inserting an epoll fd onto another epoll
- * fd. We do this so that we walk the epoll tree and ensure that this
+ * The epnested_mutex is acquired when inserting an epoll fd onto another
+ * epoll fd. We do this so that we walk the epoll tree and ensure that this
* insertion does not create a cycle of epoll file descriptors, which
* could lead to deadlock. We need a global mutex to prevent two
* simultaneous inserts (A into B and B into A) from racing and
@@ -80,9 +74,9 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epnested_mutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
- * Events that require holding "epmutex" are very rare, while for
+ * Events that require holding "epnested_mutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
* a better scalability.
*/
@@ -153,6 +147,13 @@ struct epitem {
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
+ /*
+ * Protected by file->f_lock, true for to-be-released epitem already
+ * removed from the "struct file" items list; together with
+ * eventpoll->refcount orchestrates "struct eventpoll" disposal
+ */
+ bool dying;
+
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
@@ -217,6 +218,12 @@ struct eventpoll {
u64 gen;
struct hlist_head refs;
+ /*
+ * usage count, used together with epitem->dying to
+ * orchestrate the disposal of this struct
+ */
+ refcount_t refcount;
+
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
@@ -240,10 +247,8 @@ struct ep_pqueue {
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;
-/*
- * This mutex is used to serialize ep_free() and eventpoll_release_file().
- */
-static DEFINE_MUTEX(epmutex);
+/* Used for cycle detection */
+static DEFINE_MUTEX(epnested_mutex);
static u64 loop_check_gen = 0;
@@ -258,7 +263,7 @@ static struct kmem_cache *pwq_cache __read_mostly;
/*
* List of files with newly added links, where we may need to limit the number
- * of emanating paths. Protected by the epmutex.
+ * of emanating paths. Protected by the epnested_mutex.
*/
struct epitems_head {
struct hlist_head epitems;
@@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
/*
* This function unregisters poll callbacks from the associated file
- * descriptor. Must be called with "mtx" held (or "epmutex" if called from
- * ep_free).
+ * descriptor. Must be called with "mtx" held.
*/
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
@@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head)
kmem_cache_free(epi_cache, epi);
}
+static void ep_get(struct eventpoll *ep)
+{
+ refcount_inc(&ep->refcount);
+}
+
+/*
+ * Returns true if the event poll can be disposed
+ */
+static bool ep_refcount_dec_and_test(struct eventpoll *ep)
+{
+ if (!refcount_dec_and_test(&ep->refcount))
+ return false;
+
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
+ return true;
+}
+
+static void ep_free(struct eventpoll *ep)
+{
+ mutex_destroy(&ep->mtx);
+ free_uid(ep->user);
+ wakeup_source_unregister(ep->ws);
+ kfree(ep);
+}
+
/*
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
+ * If the dying flag is set, do the removal only if force is true.
+ * This prevents ep_clear_and_put() from dropping all the ep references
+ * while running concurrently with eventpoll_release_file().
+ * Returns true if the eventpoll can be disposed.
*/
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
struct file *file = epi->ffd.file;
struct epitems_head *to_free;
@@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
+ if (epi->dying && !force) {
+ spin_unlock(&file->f_lock);
+ return false;
+ }
+
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
@@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
call_rcu(&epi->rcu, epi_rcu_free);
percpu_counter_dec(&ep->user->epoll_watches);
+ return ep_refcount_dec_and_test(ep);
+}
- return 0;
+/*
+ * ep_remove variant for callers owning an additional reference to the ep
+ */
+static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+{
+ WARN_ON_ONCE(__ep_remove(ep, epi, false));
}
-static void ep_free(struct eventpoll *ep)
+static void ep_clear_and_put(struct eventpoll *ep)
{
- struct rb_node *rbp;
+ struct rb_node *rbp, *next;
struct epitem *epi;
+ bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(ep, NULL, 0);
- /*
- * We need to lock this because we could be hit by
- * eventpoll_release_file() while we're freeing the "struct eventpoll".
- * We do not need to hold "ep->mtx" here because the epoll file
- * is on the way to be removed and no one has references to it
- * anymore. The only hit might come from eventpoll_release_file() but
- * holding "epmutex" is sufficient here.
- */
- mutex_lock(&epmutex);
+ mutex_lock(&ep->mtx);
/*
* Walks through the whole tree by unregistering poll callbacks.
@@ -767,26 +805,25 @@ static void ep_free(struct eventpoll *ep)
}
/*
- * Walks through the whole tree by freeing each "struct epitem". At this
- * point we are sure no poll callbacks will be lingering around, and also by
- * holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->lock".
- * We do not need to lock ep->mtx, either, we only do it to prevent
- * a lockdep warning.
+	 * Walks through the whole tree and tries to free each "struct epitem".
+ * Note that ep_remove_safe() will not remove the epitem in case of a
+ * racing eventpoll_release_file(); the latter will do the removal.
+ * At this point we are sure no poll callbacks will be lingering around.
+ * Since we still own a reference to the eventpoll struct, the loop can't
+ * dispose it.
*/
- mutex_lock(&ep->mtx);
- while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
+ next = rb_next(rbp);
epi = rb_entry(rbp, struct epitem, rbn);
- ep_remove(ep, epi);
+ ep_remove_safe(ep, epi);
cond_resched();
}
+
+ dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
- mutex_unlock(&epmutex);
- mutex_destroy(&ep->mtx);
- free_uid(ep->user);
- wakeup_source_unregister(ep->ws);
- kfree(ep);
+ if (dispose)
+ ep_free(ep);
}
static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -794,7 +831,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
struct eventpoll *ep = file->private_data;
if (ep)
- ep_free(ep);
+ ep_clear_and_put(ep);
return 0;
}
@@ -906,33 +943,34 @@ void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
struct epitem *epi;
- struct hlist_node *next;
+ bool dispose;
/*
- * We don't want to get "file->f_lock" because it is not
- * necessary. It is not necessary because we're in the "struct file"
- * cleanup path, and this means that no one is using this file anymore.
- * So, for example, epoll_ctl() cannot hit here since if we reach this
- * point, the file counter already went to zero and fget() would fail.
- * The only hit might come from ep_free() but by holding the mutex
- * will correctly serialize the operation. We do need to acquire
- * "ep->mtx" after "epmutex" because ep_remove() requires it when called
- * from anywhere but ep_free().
- *
- * Besides, ep_remove() acquires the lock, so we can't hold it here.
+ * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
+ * touching the epitems list before eventpoll_release_file() can access
+ * the ep->mtx.
*/
- mutex_lock(&epmutex);
- if (unlikely(!file->f_ep)) {
- mutex_unlock(&epmutex);
- return;
- }
- hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
+again:
+ spin_lock(&file->f_lock);
+ if (file->f_ep && file->f_ep->first) {
+ epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
+ epi->dying = true;
+ spin_unlock(&file->f_lock);
+
+ /*
+ * ep access is safe as we still own a reference to the ep
+ * struct
+ */
ep = epi->ep;
- mutex_lock_nested(&ep->mtx, 0);
- ep_remove(ep, epi);
+ mutex_lock(&ep->mtx);
+ dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
+
+ if (dispose)
+ ep_free(ep);
+ goto again;
}
- mutex_unlock(&epmutex);
+ spin_unlock(&file->f_lock);
}
static int ep_alloc(struct eventpoll **pep)
@@ -955,6 +993,7 @@ static int ep_alloc(struct eventpoll **pep)
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
+ refcount_set(&ep->refcount, 1);
*pep = ep;
@@ -1223,10 +1262,10 @@ out_unlock:
*/
list_del_init(&wait->entry);
/*
- * ->whead != NULL protects us from the race with ep_free()
- * or ep_remove(), ep_remove_wait_queue() takes whead->lock
- * held by the caller. Once we nullify it, nothing protects
- * ep/epi or even wait.
+ * ->whead != NULL protects us from the race with
+ * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
+ * takes whead->lock held by the caller. Once we nullify it,
+ * nothing protects ep/epi or even wait.
*/
smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
}
@@ -1298,7 +1337,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
* is connected to n file sources. In this case each file source has 1 path
* of length 1. Thus, the numbers below should be more than sufficient. These
* path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
- * and delete can't add additional paths. Protected by the epmutex.
+ * and delete can't add additional paths. Protected by the epnested_mutex.
*/
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];
@@ -1496,16 +1535,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (tep)
mutex_unlock(&tep->mtx);
+ /*
+ * ep_remove_safe() calls in the later error paths can't lead to
+ * ep_free() as the ep file itself still holds an ep reference.
+ */
+ ep_get(ep);
+
/* now check if we've created too many backpaths */
if (unlikely(full_check && reverse_path_check())) {
- ep_remove(ep, epi);
+ ep_remove_safe(ep, epi);
return -EINVAL;
}
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error) {
- ep_remove(ep, epi);
+ ep_remove_safe(ep, epi);
return error;
}
}
@@ -1529,7 +1574,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
* high memory pressure.
*/
if (unlikely(!epq.epi)) {
- ep_remove(ep, epi);
+ ep_remove_safe(ep, epi);
return -ENOMEM;
}
@@ -2025,7 +2070,7 @@ static int do_epoll_create(int flags)
out_free_fd:
put_unused_fd(fd);
out_free_ep:
- ep_free(ep);
+ ep_clear_and_put(ep);
return error;
}
@@ -2135,7 +2180,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
- * 'epmutex' on add is to prevent complex toplogies such as loops and
+ * 'epnested_mutex' on add is to prevent complex topologies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
@@ -2146,7 +2191,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
- error = epoll_mutex_lock(&epmutex, 0, nonblock);
+ error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
@@ -2180,10 +2225,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
- if (epi)
- error = ep_remove(ep, epi);
- else
+ if (epi) {
+ /*
+ * The eventpoll itself is still alive: the refcount
+ * can't go to zero here.
+ */
+ ep_remove_safe(ep, epi);
+ error = 0;
+ } else {
error = -ENOENT;
+ }
break;
case EPOLL_CTL_MOD:
if (epi) {
@@ -2201,7 +2252,7 @@ error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
loop_check_gen++;
- mutex_unlock(&epmutex);
+ mutex_unlock(&epnested_mutex);
}
fdput(tf);
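The epoll rework replaces the global epmutex serialization of teardown with plain reference counting: the epoll file holds one reference, every epitem holds one, and whichever of ep_clear_and_put() or eventpoll_release_file() drops the last reference calls ep_free(). A generic sketch of that lifetime idiom with refcount_t, assuming one owner reference plus one per attached item; the structure and function names are illustrative:

#include <linux/refcount.h>
#include <linux/slab.h>

struct example_obj {
	refcount_t refcount;
	/* ... items, locks, wait queues ... */
};

static struct example_obj *example_alloc(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->refcount, 1);	/* owner reference */
	return obj;
}

static void example_get(struct example_obj *obj)
{
	refcount_inc(&obj->refcount);			/* one per item */
}

static void example_put(struct example_obj *obj)
{
	/* Whichever path drops the last reference frees the object. */
	if (refcount_dec_and_test(&obj->refcount))
		kfree(obj);
}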
diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..a466e797c8e2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,6 +65,7 @@
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
#include <linux/time_namespace.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -1034,7 +1035,7 @@ static int exec_mmap(struct mm_struct *mm)
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
return 0;
}
@@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm,
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
+ user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e79c767cc5e0..35703dce23a3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5795,7 +5795,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
* mapped - no physical clusters have been allocated, and the
* file has no extents
*/
- if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
+ ext4_has_inline_data(inode))
return 0;
/* search for the extent closest to the first block in the cluster */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index b9fb1177fff6..859bc4e2c9b0 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -566,9 +566,9 @@ retry:
* started */
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
- goto out;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ goto out_nofolio;
}
ext4_write_lock_xattr(inode, &no_expand);
@@ -633,6 +633,7 @@ out:
folio_unlock(folio);
folio_put(folio);
}
+out_nofolio:
if (sem_held)
ext4_write_unlock_xattr(inode, &no_expand);
if (handle)
@@ -693,8 +694,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto out;
}
@@ -854,8 +855,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
@@ -947,8 +948,8 @@ retry_journal:
*/
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
- if (!folio) {
- ret = -ENOMEM;
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto out_journal;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbd352e3986..0d5ba922e411 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1162,8 +1162,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
retry_grab:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -2906,8 +2906,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/* In case writeback began while the folio was unlocked */
folio_wait_stable(folio);
@@ -2982,6 +2982,9 @@ static int ext4_da_write_end(struct file *file,
ext4_has_inline_data(inode))
return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ if (unlikely(copied < len) && !PageUptodate(page))
+ copied = 0;
+
start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
@@ -3618,8 +3621,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
blocksize = inode->i_sb->s_blocksize;
@@ -5201,7 +5204,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
while (1) {
struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!folio)
+ if (IS_ERR(folio))
return;
ret = __ext4_journalled_invalidate_folio(folio, offset,
folio_size(folio) - offset);
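(Editorial note, not part of the diff.) The ext4 hunks above all follow the same conversion: __filemap_get_folio() and filemap_lock_folio() now hand back an ERR_PTR() instead of NULL, so callers propagate the encoded errno rather than assuming -ENOMEM. A minimal sketch of the resulting calling convention; example_get_folio() is a hypothetical helper, not from the patch:

/*
 * Hypothetical helper, for illustration only: the error-handling pattern
 * these hunks move to.
 */
static int example_get_folio(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* e.g. -ENOMEM, or -EAGAIN with FGP_NOWAIT */

	/* ... use the locked folio ... */

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}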
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 4681fff6665f..4022bc713421 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -282,6 +282,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
mmp_block >= ext4_blocks_count(es)) {
ext4_warning(sb, "Invalid MMP block in superblock");
+ retval = -EINVAL;
goto failed;
}
@@ -307,6 +308,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (seq == EXT4_MMP_SEQ_FSCK) {
dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ retval = -EBUSY;
goto failed;
}
@@ -320,6 +322,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ retval = -ETIMEDOUT;
goto failed;
}
@@ -330,6 +333,7 @@ int ext4_multi_mount_protect(struct super_block *sb,
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
+ retval = -EBUSY;
goto failed;
}
@@ -349,6 +353,7 @@ skip:
*/
if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
ext4_warning(sb, "MMP startup interrupted, failing mount");
+ retval = -ETIMEDOUT;
goto failed;
}
@@ -359,6 +364,7 @@ skip:
if (seq != le32_to_cpu(mmp->mmp_seq)) {
dump_mmp_msg(sb, mmp,
"Device is already active on another node.");
+ retval = -EBUSY;
goto failed;
}
@@ -378,6 +384,7 @@ skip:
EXT4_SB(sb)->s_mmp_tsk = NULL;
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
+ retval = -ENOMEM;
goto failed;
}
@@ -385,5 +392,5 @@ skip:
failed:
brelse(bh);
- return 1;
+ return retval;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e509c22a21ed..b5af2fc03b2f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -140,18 +140,18 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
flags = memalloc_nofs_save();
folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping[0]));
- if (!folio[0]) {
+ if (IS_ERR(folio[0])) {
memalloc_nofs_restore(flags);
- return -ENOMEM;
+ return PTR_ERR(folio[0]);
}
folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping[1]));
memalloc_nofs_restore(flags);
- if (!folio[1]) {
+ if (IS_ERR(folio[1])) {
folio_unlock(folio[0]);
folio_put(folio[0]);
- return -ENOMEM;
+ return PTR_ERR(folio[1]);
}
/*
* __filemap_get_folio() may not wait on folio's writeback if
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d03bf0ecf505..d39f386e9baf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1259,7 +1259,7 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int aborted = 0;
- int i, err;
+ int err;
/*
* Unregister sysfs before destroying jbd2 journal.
@@ -1311,7 +1311,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_flex_groups_free(sbi);
ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ for (int i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
@@ -5196,10 +5196,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t logical_sb_block;
struct inode *root;
- int ret = -ENOMEM;
- unsigned int i;
int needs_recovery;
- int err = 0;
+ int err;
ext4_group_t first_not_zeroed;
struct ext4_fs_context *ctx = fc->fs_private;
int silent = fc->sb_flags & SB_SILENT;
@@ -5212,8 +5210,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_sectors_written_start =
part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
- /* -EINVAL is default */
- ret = -EINVAL;
err = ext4_load_super(sb, &logical_sb_block, silent);
if (err)
goto out_fail;
@@ -5239,7 +5235,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- if (ext4_inode_info_init(sb, es))
+ err = ext4_inode_info_init(sb, es);
+ if (err)
goto failed_mount;
err = parse_apply_sb_mount_options(sb, ctx);
@@ -5255,10 +5252,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_apply_options(fc, sb);
- if (ext4_encoding_init(sb, es))
+ err = ext4_encoding_init(sb, es);
+ if (err)
goto failed_mount;
- if (ext4_check_journal_data_mode(sb))
+ err = ext4_check_journal_data_mode(sb);
+ if (err)
goto failed_mount;
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -5267,18 +5266,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* i_version is always enabled now */
sb->s_flags |= SB_I_VERSION;
- if (ext4_check_feature_compatibility(sb, es, silent))
+ err = ext4_check_feature_compatibility(sb, es, silent);
+ if (err)
goto failed_mount;
- if (ext4_block_group_meta_init(sb, silent))
+ err = ext4_block_group_meta_init(sb, silent);
+ if (err)
goto failed_mount;
ext4_hash_info_init(sb);
- if (ext4_handle_clustersize(sb))
+ err = ext4_handle_clustersize(sb);
+ if (err)
goto failed_mount;
- if (ext4_check_geometry(sb, es))
+ err = ext4_check_geometry(sb, es);
+ if (err)
goto failed_mount;
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
@@ -5289,8 +5292,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err)
goto failed_mount3;
- /* Register extent status tree shrinker */
- if (ext4_es_register_shrinker(sbi))
+ err = ext4_es_register_shrinker(sbi);
+ if (err)
goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi);
@@ -5329,10 +5332,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_has_feature_orphan_present(sb) ||
ext4_has_feature_journal_needs_recovery(sb));
- if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
- if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
+ err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
+ if (err)
goto failed_mount3a;
+ }
+ err = -EINVAL;
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
@@ -5384,6 +5390,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (!sbi->s_ea_block_cache) {
ext4_msg(sb, KERN_ERR,
"Failed to create ea_block_cache");
+ err = -EINVAL;
goto failed_mount_wq;
}
@@ -5392,6 +5399,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (!sbi->s_ea_inode_cache) {
ext4_msg(sb, KERN_ERR,
"Failed to create ea_inode_cache");
+ err = -EINVAL;
goto failed_mount_wq;
}
}
@@ -5426,7 +5434,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!EXT4_SB(sb)->rsv_conversion_wq) {
printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
+ err = -ENOMEM;
goto failed_mount4;
}
@@ -5438,28 +5446,28 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(root)) {
ext4_msg(sb, KERN_ERR, "get root inode failed");
- ret = PTR_ERR(root);
+ err = PTR_ERR(root);
root = NULL;
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
iput(root);
+ err = -EFSCORRUPTED;
goto failed_mount4;
}
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
- ret = -ENOMEM;
+ err = -ENOMEM;
goto failed_mount4;
}
- ret = ext4_setup_super(sb, es, sb_rdonly(sb));
- if (ret == -EROFS) {
+ err = ext4_setup_super(sb, es, sb_rdonly(sb));
+ if (err == -EROFS) {
sb->s_flags |= SB_RDONLY;
- ret = 0;
- } else if (ret)
+ } else if (err)
goto failed_mount4a;
ext4_set_resv_clusters(sb);
@@ -5503,7 +5511,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_journal->j_commit_callback =
ext4_journal_commit_callback;
- if (ext4_percpu_param_init(sbi))
+ err = ext4_percpu_param_init(sbi);
+ if (err)
goto failed_mount6;
if (ext4_has_feature_flex_bg(sb))
@@ -5511,7 +5520,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
- ret = -ENOMEM;
+ err = -ENOMEM;
goto failed_mount6;
}
@@ -5628,7 +5637,7 @@ failed_mount:
#endif
#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
@@ -5637,7 +5646,7 @@ failed_mount:
ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
- return err ? err : ret;
+ return err;
}
static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -6565,12 +6574,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
sb->s_flags &= ~SB_RDONLY;
- if (ext4_has_feature_mmp(sb))
- if (ext4_multi_mount_protect(sb,
- le64_to_cpu(es->s_mmp_block))) {
- err = -EROFS;
+ if (ext4_has_feature_mmp(sb)) {
+ err = ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block));
+ if (err)
goto restore_opts;
- }
+ }
#ifdef CONFIG_QUOTA
enable_quota = 1;
#endif
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 3b01247066dd..2f37e1ea3955 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -366,14 +366,16 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
- if (!folio || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
- if (folio)
+ if (!IS_ERR(folio))
folio_put(folio);
else if (num_ra_pages > 1)
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
folio = read_mapping_folio(inode->i_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
return folio_file_page(folio, index);
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 179a5c5e28fd..91e89e68177e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -623,7 +623,7 @@ static int __init cuse_init(void)
/* CUSE is not prepared for FUSE_DEV_IOC_CLONE */
cuse_channel_fops.unlocked_ioctl = NULL;
- cuse_class = class_create(THIS_MODULE, "cuse");
+ cuse_class = class_create("cuse");
if (IS_ERR(cuse_class))
return PTR_ERR(cuse_class);
diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile
index 587bcd6e50a3..16be592e8085 100644
--- a/fs/hostfs/Makefile
+++ b/fs/hostfs/Makefile
@@ -3,9 +3,11 @@
# Licensed under the GPL
#
-hostfs-objs := hostfs_kern.o hostfs_user.o
+hostfs-objs := hostfs_kern.o
-obj-y :=
+hostfs-builtin-$(CONFIG_HOSTFS) += hostfs_user.o hostfs_user_exp.o
+
+obj-y := $(hostfs-builtin-y) $(hostfs-builtin-m)
obj-$(CONFIG_HOSTFS) += hostfs.o
include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/fs/hostfs/hostfs_user_exp.c b/fs/hostfs/hostfs_user_exp.c
new file mode 100644
index 000000000000..250c91c55c46
--- /dev/null
+++ b/fs/hostfs/hostfs_user_exp.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include "hostfs.h"
+
+EXPORT_SYMBOL_GPL(stat_file);
+EXPORT_SYMBOL_GPL(access_file);
+EXPORT_SYMBOL_GPL(open_file);
+EXPORT_SYMBOL_GPL(open_dir);
+EXPORT_SYMBOL_GPL(seek_dir);
+EXPORT_SYMBOL_GPL(read_dir);
+EXPORT_SYMBOL_GPL(read_file);
+EXPORT_SYMBOL_GPL(write_file);
+EXPORT_SYMBOL_GPL(lseek_file);
+EXPORT_SYMBOL_GPL(fsync_file);
+EXPORT_SYMBOL_GPL(replace_file);
+EXPORT_SYMBOL_GPL(close_file);
+EXPORT_SYMBOL_GPL(close_dir);
+EXPORT_SYMBOL_GPL(file_create);
+EXPORT_SYMBOL_GPL(set_attr);
+EXPORT_SYMBOL_GPL(make_symlink);
+EXPORT_SYMBOL_GPL(unlink_file);
+EXPORT_SYMBOL_GPL(do_mkdir);
+EXPORT_SYMBOL_GPL(hostfs_do_rmdir);
+EXPORT_SYMBOL_GPL(do_mknod);
+EXPORT_SYMBOL_GPL(link_file);
+EXPORT_SYMBOL_GPL(hostfs_do_readlink);
+EXPORT_SYMBOL_GPL(rename_file);
+EXPORT_SYMBOL_GPL(rename2_file);
+EXPORT_SYMBOL_GPL(do_statfs);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9062da6da567..ecfdfb2529a3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
@@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
struct folio *folio;
folio = filemap_lock_folio(mapping, idx);
- if (!folio)
+ if (IS_ERR(folio))
return;
start = start & ~huge_page_mask(h);
diff --git a/fs/inode.c b/fs/inode.c
index 3ec5a8f7b644..577799b7855f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += reap;
+ mm_account_reclaimed_pages(reap);
}
iput(inode);
spin_lock(lru_lock);
diff --git a/fs/internal.h b/fs/internal.h
index ab36ed8fa41c..bd3b2810a36b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
*/
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root);
-extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
- const char *, unsigned int, struct path *);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 10a203515583..063133ec77f4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -468,19 +468,12 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
{
unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
- struct folio *folio;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
- if (folio)
- return folio;
-
- if (iter->flags & IOMAP_NOWAIT)
- return ERR_PTR(-EAGAIN);
- return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
start_byte >> PAGE_SHIFT);
- if (!folio) {
+ if (IS_ERR(folio)) {
start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
PAGE_SIZE;
continue;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f771001574d0..019cc87d0fb3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -130,6 +130,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (ret > 0)
ret += dio->done_before;
+ trace_iomap_dio_complete(iocb, dio->error, ret);
kfree(dio);
return ret;
@@ -493,6 +494,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct blk_plug plug;
struct iomap_dio *dio;
+ trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
+
if (!iomi.len)
return NULL;
@@ -541,7 +544,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
/* for data sync or sync, we need sync completion processing */
- if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
+ if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
@@ -650,8 +653,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
*/
dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
- if (!wait_for_completion)
+ if (!wait_for_completion) {
+ trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
return ERR_PTR(-EIOCBQUEUED);
+ }
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
index da217246b1a9..728d5443daf5 100644
--- a/fs/iomap/trace.c
+++ b/fs/iomap/trace.c
@@ -3,6 +3,7 @@
* Copyright (c) 2019 Christoph Hellwig
*/
#include <linux/iomap.h>
+#include <linux/uio.h>
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index f6ea9540d082..c16fd55f5595 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -83,6 +83,7 @@ DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
+DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
#define IOMAP_TYPE_STRINGS \
{ IOMAP_HOLE, "HOLE" }, \
@@ -107,6 +108,11 @@ DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
{ IOMAP_F_BUFFER_HEAD, "BH" }, \
{ IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }
+#define IOMAP_DIO_STRINGS \
+ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
+ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
+ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }
+
DECLARE_EVENT_CLASS(iomap_class,
TP_PROTO(struct inode *inode, struct iomap *iomap),
TP_ARGS(inode, iomap),
@@ -183,6 +189,78 @@ TRACE_EVENT(iomap_iter,
(void *)__entry->caller)
);
+TRACE_EVENT(iomap_dio_rw_begin,
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter,
+ unsigned int dio_flags, size_t done_before),
+ TP_ARGS(iocb, iter, dio_flags, done_before),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(loff_t, isize)
+ __field(loff_t, pos)
+ __field(size_t, count)
+ __field(size_t, done_before)
+ __field(int, ki_flags)
+ __field(unsigned int, dio_flags)
+ __field(bool, aio)
+ ),
+ TP_fast_assign(
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = file_inode(iocb->ki_filp)->i_ino;
+ __entry->isize = file_inode(iocb->ki_filp)->i_size;
+ __entry->pos = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
+ __entry->done_before = done_before;
+ __entry->ki_flags = iocb->ki_flags;
+ __entry->dio_flags = dio_flags;
+ __entry->aio = !is_sync_kiocb(iocb);
+ ),
+ TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->isize,
+ __entry->pos,
+ __entry->count,
+ __entry->done_before,
+ __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
+ __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS),
+ __entry->aio)
+);
+
+TRACE_EVENT(iomap_dio_complete,
+ TP_PROTO(struct kiocb *iocb, int error, ssize_t ret),
+ TP_ARGS(iocb, error, ret),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(loff_t, isize)
+ __field(loff_t, pos)
+ __field(int, ki_flags)
+ __field(bool, aio)
+ __field(int, error)
+ __field(ssize_t, ret)
+ ),
+ TP_fast_assign(
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = file_inode(iocb->ki_filp)->i_ino;
+ __entry->isize = file_inode(iocb->ki_filp)->i_size;
+ __entry->pos = iocb->ki_pos;
+ __entry->ki_flags = iocb->ki_flags;
+ __entry->aio = !is_sync_kiocb(iocb);
+ __entry->error = error;
+ __entry->ret = ret;
+ ),
+ TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->isize,
+ __entry->pos,
+ __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
+ __entry->aio,
+ __entry->error,
+ __entry->ret)
+);
+
#endif /* _IOMAP_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 90de0e498371..45b6919903e6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,7 @@
#include "kernfs-internal.h"
-static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
+static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
/*
* Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
* call pr_cont() while holding rename_lock. Because sometimes pr_cont()
@@ -196,9 +196,9 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
unsigned long flags;
int ret;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ read_lock_irqsave(&kernfs_rename_lock, flags);
ret = kernfs_name_locked(kn, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&kernfs_rename_lock, flags);
return ret;
}
@@ -224,9 +224,9 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
unsigned long flags;
int ret;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ read_lock_irqsave(&kernfs_rename_lock, flags);
ret = kernfs_path_from_node_locked(to, from, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&kernfs_rename_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
@@ -294,10 +294,10 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
struct kernfs_node *parent;
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ read_lock_irqsave(&kernfs_rename_lock, flags);
parent = kn->parent;
kernfs_get(parent);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&kernfs_rename_lock, flags);
return parent;
}
@@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
/* Update timestamps on the parent */
+ down_write(&root->kernfs_iattr_rwsem);
+
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&root->kernfs_iattr_rwsem);
up_write(&root->kernfs_rwsem);
/*
@@ -940,6 +943,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
idr_init(&root->ino_idr);
init_rwsem(&root->kernfs_rwsem);
+ init_rwsem(&root->kernfs_iattr_rwsem);
+ init_rwsem(&root->kernfs_supers_rwsem);
INIT_LIST_HEAD(&root->supers);
/*
@@ -1462,11 +1467,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
pos->parent ? pos->parent->iattr : NULL;
/* update timestamps on the parent */
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
kernfs_put(pos);
}
@@ -1723,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
kernfs_get(new_parent);
/* rename_lock protects ->parent and ->name accessors */
- spin_lock_irq(&kernfs_rename_lock);
+ write_lock_irq(&kernfs_rename_lock);
old_parent = kn->parent;
kn->parent = new_parent;
@@ -1734,7 +1742,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
kn->name = new_name;
}
- spin_unlock_irq(&kernfs_rename_lock);
+ write_unlock_irq(&kernfs_rename_lock);
kn->hash = kernfs_name_hash(kn->name, kn->ns);
kernfs_link_sibling(kn);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e4a50e4ff0d2..40c4661f15b7 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -922,8 +922,8 @@ repeat:
root = kernfs_root(kn);
/* kick fsnotify */
- down_write(&root->kernfs_rwsem);
+ down_read(&root->kernfs_supers_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
struct inode *p_inode = NULL;
@@ -960,7 +960,7 @@ repeat:
iput(inode);
}
- up_write(&root->kernfs_rwsem);
+ up_read(&root->kernfs_supers_rwsem);
kernfs_put(kn);
goto repeat;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 30494dcb0df3..b22b74d1a115 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int ret;
struct kernfs_root *root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_iattr_rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_iattr_rwsem);
return ret;
}
@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EINVAL;
root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_iattr_rwsem);
error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
setattr_copy(&nop_mnt_idmap, inode, iattr);
out:
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_iattr_rwsem);
return error;
}
@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&nop_mnt_idmap, inode, stat);
- up_read(&root->kernfs_rwsem);
+ up_read(&root->kernfs_iattr_rwsem);
return 0;
}
@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
kn = inode->i_private;
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&nop_mnt_idmap, inode, mask);
- up_read(&root->kernfs_rwsem);
+ up_read(&root->kernfs_iattr_rwsem);
return ret;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 236c3a6113f1..a9b854cdfdb5 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -47,6 +47,8 @@ struct kernfs_root {
wait_queue_head_t deactivate_waitq;
struct rw_semaphore kernfs_rwsem;
+ struct rw_semaphore kernfs_iattr_rwsem;
+ struct rw_semaphore kernfs_supers_rwsem;
};
/* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e08e8d999807..d49606accb07 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -351,9 +351,9 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_supers_rwsem);
list_add(&info->node, &info->root->supers);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_supers_rwsem);
}
fc->root = dget(sb->s_root);
@@ -380,9 +380,9 @@ void kernfs_kill_sb(struct super_block *sb)
struct kernfs_super_info *info = kernfs_info(sb);
struct kernfs_root *root = info->root;
- down_write(&root->kernfs_rwsem);
+ down_write(&root->kernfs_supers_rwsem);
list_del(&info->node);
- up_write(&root->kernfs_rwsem);
+ up_write(&root->kernfs_supers_rwsem);
/*
* Remove the superblock from fs_supers/s_instances
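(Editorial note, not part of the diff.) Taken together, the kernfs changes replace several uses of the single kernfs_rwsem with narrower locks: kernfs_rename_lock becomes an rwlock protecting ->parent and ->name, kernfs_iattr_rwsem covers inode attribute updates, and kernfs_supers_rwsem covers the root->supers list. A rough sketch of how the new fields are meant to be used; example_kernfs_locking() is a hypothetical helper, not from the patch:

/*
 * Hypothetical helper, for illustration only: each path takes the
 * narrower lock introduced by this series instead of kernfs_rwsem.
 */
static void example_kernfs_locking(struct kernfs_root *root,
				   struct kernfs_node *kn)
{
	struct kernfs_super_info *info;

	/* attribute update: kernfs_iattr_rwsem, write side */
	down_write(&root->kernfs_iattr_rwsem);
	if (kn->iattr) {
		ktime_get_real_ts64(&kn->iattr->ia_ctime);
		kn->iattr->ia_mtime = kn->iattr->ia_ctime;
	}
	up_write(&root->kernfs_iattr_rwsem);

	/* walking the mounted supers: kernfs_supers_rwsem, read side */
	down_read(&root->kernfs_supers_rwsem);
	list_for_each_entry(info, &root->supers, node) {
		/* e.g. kick fsnotify against each mounted superblock */
	}
	up_read(&root->kernfs_supers_rwsem);
}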
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 0d8242789dc8..0979577fd23e 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -415,7 +415,7 @@ int server_queue_ctrl_reset_work(void)
return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET);
}
-static ssize_t stats_show(struct class *class, struct class_attribute *attr,
+static ssize_t stats_show(const struct class *class, const struct class_attribute *attr,
char *buf)
{
/*
@@ -434,8 +434,8 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
server_conf.ipc_last_active / HZ);
}
-static ssize_t kill_server_store(struct class *class,
- struct class_attribute *attr, const char *buf,
+static ssize_t kill_server_store(const struct class *class,
+ const struct class_attribute *attr, const char *buf,
size_t len)
{
if (!sysfs_streq(buf, "hard"))
@@ -455,7 +455,7 @@ static const char * const debug_type_strings[] = {"smb", "auth", "vfs",
"oplock", "ipc", "conn",
"rdma"};
-static ssize_t debug_show(struct class *class, struct class_attribute *attr,
+static ssize_t debug_show(const struct class *class, const struct class_attribute *attr,
char *buf)
{
ssize_t sz = 0;
@@ -473,7 +473,7 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
return sz;
}
-static ssize_t debug_store(struct class *class, struct class_attribute *attr,
+static ssize_t debug_store(const struct class *class, const struct class_attribute *attr,
const char *buf, size_t len)
{
int i;
@@ -513,7 +513,6 @@ ATTRIBUTE_GROUPS(ksmbd_control_class);
static struct class ksmbd_control_class = {
.name = "ksmbd-control",
- .owner = THIS_MODULE,
.class_groups = ksmbd_control_class_groups,
};
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 67b7e766a06b..bbc9e92935fb 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -756,19 +756,6 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt,
pneg_ctxt->Ciphers[0] = cipher_type;
}
-static void build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt,
- __le16 comp_algo)
-{
- pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES;
- pneg_ctxt->DataLength =
- cpu_to_le16(sizeof(struct smb2_compression_capabilities_context)
- - sizeof(struct smb2_neg_context));
- pneg_ctxt->Reserved = cpu_to_le32(0);
- pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1);
- pneg_ctxt->Flags = cpu_to_le32(0);
- pneg_ctxt->CompressionAlgorithms[0] = comp_algo;
-}
-
static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt,
__le16 sign_algo)
{
@@ -808,7 +795,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
struct smb2_negotiate_rsp *rsp,
void *smb2_buf_len)
{
- char *pneg_ctxt = (char *)rsp +
+ char * const pneg_ctxt = (char *)rsp +
le32_to_cpu(rsp->NegotiateContextOffset);
int neg_ctxt_cnt = 1;
int ctxt_size;
@@ -817,61 +804,46 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
"assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n");
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt,
conn->preauth_info->Preauth_HashId);
- rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING);
ctxt_size = sizeof(struct smb2_preauth_neg_context);
- /* Round to 8 byte boundary */
- pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8);
if (conn->cipher_type) {
+ /* Round to 8 byte boundary */
ctxt_size = round_up(ctxt_size, 8);
ksmbd_debug(SMB,
"assemble SMB2_ENCRYPTION_CAPABILITIES context\n");
- build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt,
+ build_encrypt_ctxt((struct smb2_encryption_neg_context *)
+ (pneg_ctxt + ctxt_size),
conn->cipher_type);
- rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ neg_ctxt_cnt++;
ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2;
- /* Round to 8 byte boundary */
- pneg_ctxt +=
- round_up(sizeof(struct smb2_encryption_neg_context) + 2,
- 8);
}
- if (conn->compress_algorithm) {
- ctxt_size = round_up(ctxt_size, 8);
- ksmbd_debug(SMB,
- "assemble SMB2_COMPRESSION_CAPABILITIES context\n");
- /* Temporarily set to SMB3_COMPRESS_NONE */
- build_compression_ctxt((struct smb2_compression_capabilities_context *)pneg_ctxt,
- conn->compress_algorithm);
- rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
- ctxt_size += sizeof(struct smb2_compression_capabilities_context) + 2;
- /* Round to 8 byte boundary */
- pneg_ctxt += round_up(sizeof(struct smb2_compression_capabilities_context) + 2,
- 8);
- }
+ /* compression context not yet supported */
+ WARN_ON(conn->compress_algorithm != SMB3_COMPRESS_NONE);
if (conn->posix_ext_supported) {
ctxt_size = round_up(ctxt_size, 8);
ksmbd_debug(SMB,
"assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n");
- build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
- rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ build_posix_ctxt((struct smb2_posix_neg_context *)
+ (pneg_ctxt + ctxt_size));
+ neg_ctxt_cnt++;
ctxt_size += sizeof(struct smb2_posix_neg_context);
- /* Round to 8 byte boundary */
- pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8);
}
if (conn->signing_negotiated) {
ctxt_size = round_up(ctxt_size, 8);
ksmbd_debug(SMB,
"assemble SMB2_SIGNING_CAPABILITIES context\n");
- build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt,
+ build_sign_cap_ctxt((struct smb2_signing_capabilities *)
+ (pneg_ctxt + ctxt_size),
conn->signing_algorithm);
- rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt);
+ neg_ctxt_cnt++;
ctxt_size += sizeof(struct smb2_signing_capabilities) + 2;
}
+ rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
inc_rfc1001_len(smb2_buf_len, ctxt_size);
}
@@ -2436,7 +2408,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
return rc;
}
- rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
+ rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0);
if (rc) {
pr_err("cannot get linux path (%s), err = %d\n",
name, rc);
@@ -2727,8 +2699,10 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
- rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
+ rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1);
if (!rc) {
+ file_present = true;
+
if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) {
/*
* If file exists with under flags, return access
@@ -2737,7 +2711,6 @@ int smb2_open(struct ksmbd_work *work)
if (req->CreateDisposition == FILE_OVERWRITE_IF_LE ||
req->CreateDisposition == FILE_OPEN_IF_LE) {
rc = -EACCES;
- path_put(&path);
goto err_out;
}
@@ -2745,26 +2718,23 @@ int smb2_open(struct ksmbd_work *work)
ksmbd_debug(SMB,
"User does not have write permission\n");
rc = -EACCES;
- path_put(&path);
goto err_out;
}
} else if (d_is_symlink(path.dentry)) {
rc = -EACCES;
- path_put(&path);
goto err_out;
}
- }
- if (rc) {
+ file_present = true;
+ idmap = mnt_idmap(path.mnt);
+ } else {
if (rc != -ENOENT)
goto err_out;
ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n",
name, rc);
rc = 0;
- } else {
- file_present = true;
- idmap = mnt_idmap(path.mnt);
}
+
if (stream_name) {
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
if (s_type == DATA_STREAM) {
@@ -2892,8 +2862,9 @@ int smb2_open(struct ksmbd_work *work)
if ((daccess & FILE_DELETE_LE) ||
(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
- rc = ksmbd_vfs_may_delete(idmap,
- path.dentry);
+ rc = inode_permission(idmap,
+ d_inode(path.dentry->d_parent),
+ MAY_EXEC | MAY_WRITE);
if (rc)
goto err_out;
}
@@ -3264,10 +3235,13 @@ int smb2_open(struct ksmbd_work *work)
}
err_out:
- if (file_present || created)
- path_put(&path);
+ if (file_present || created) {
+ inode_unlock(d_inode(path.dentry->d_parent));
+ dput(path.dentry);
+ }
ksmbd_revert_fsids(work);
err_out1:
+
if (rc) {
if (rc == -EINVAL)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -5418,44 +5392,19 @@ int smb2_echo(struct ksmbd_work *work)
static int smb2_rename(struct ksmbd_work *work,
struct ksmbd_file *fp,
- struct mnt_idmap *idmap,
struct smb2_file_rename_info *file_info,
struct nls_table *local_nls)
{
struct ksmbd_share_config *share = fp->tcon->share_conf;
- char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL;
- char *pathname = NULL;
- struct path path;
- bool file_present = true;
- int rc;
+ char *new_name = NULL;
+ int rc, flags = 0;
ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n");
- pathname = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathname)
- return -ENOMEM;
-
- abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
- if (IS_ERR(abs_oldname)) {
- rc = -EINVAL;
- goto out;
- }
- old_name = strrchr(abs_oldname, '/');
- if (old_name && old_name[1] != '\0') {
- old_name++;
- } else {
- ksmbd_debug(SMB, "can't get last component in path %s\n",
- abs_oldname);
- rc = -ENOENT;
- goto out;
- }
-
new_name = smb2_get_name(file_info->FileName,
le32_to_cpu(file_info->FileNameLength),
local_nls);
- if (IS_ERR(new_name)) {
- rc = PTR_ERR(new_name);
- goto out;
- }
+ if (IS_ERR(new_name))
+ return PTR_ERR(new_name);
if (strchr(new_name, ':')) {
int s_type;
@@ -5481,7 +5430,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (rc)
goto out;
- rc = ksmbd_vfs_setxattr(idmap,
+ rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
fp->filp->f_path.dentry,
xattr_stream_name,
NULL, 0, 0);
@@ -5496,47 +5445,18 @@ static int smb2_rename(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "new name %s\n", new_name);
- rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1);
- if (rc) {
- if (rc != -ENOENT)
- goto out;
- file_present = false;
- } else {
- path_put(&path);
- }
-
if (ksmbd_share_veto_filename(share, new_name)) {
rc = -ENOENT;
ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name);
goto out;
}
- if (file_info->ReplaceIfExists) {
- if (file_present) {
- rc = ksmbd_vfs_remove_file(work, new_name);
- if (rc) {
- if (rc != -ENOTEMPTY)
- rc = -EINVAL;
- ksmbd_debug(SMB, "cannot delete %s, rc %d\n",
- new_name, rc);
- goto out;
- }
- }
- } else {
- if (file_present &&
- strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) {
- rc = -EEXIST;
- ksmbd_debug(SMB,
- "cannot rename already existing file\n");
- goto out;
- }
- }
+ if (!file_info->ReplaceIfExists)
+ flags = RENAME_NOREPLACE;
- rc = ksmbd_vfs_fp_rename(work, fp, new_name);
+ rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
out:
- kfree(pathname);
- if (!IS_ERR(new_name))
- kfree(new_name);
+ kfree(new_name);
return rc;
}
@@ -5576,18 +5496,17 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "target name is %s\n", target_name);
- rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0);
+ rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
+ &path, 0);
if (rc) {
if (rc != -ENOENT)
goto out;
file_present = false;
- } else {
- path_put(&path);
}
if (file_info->ReplaceIfExists) {
if (file_present) {
- rc = ksmbd_vfs_remove_file(work, link_name);
+ rc = ksmbd_vfs_remove_file(work, &path);
if (rc) {
rc = -EINVAL;
ksmbd_debug(SMB, "cannot delete %s\n",
@@ -5607,6 +5526,10 @@ static int smb2_create_link(struct ksmbd_work *work,
if (rc)
rc = -EINVAL;
out:
+ if (file_present) {
+ inode_unlock(d_inode(path.dentry->d_parent));
+ path_put(&path);
+ }
if (!IS_ERR(link_name))
kfree(link_name);
kfree(pathname);
@@ -5784,12 +5707,6 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
struct smb2_file_rename_info *rename_info,
unsigned int buf_len)
{
- struct mnt_idmap *idmap;
- struct ksmbd_file *parent_fp;
- struct dentry *parent;
- struct dentry *dentry = fp->filp->f_path.dentry;
- int ret;
-
if (!(fp->daccess & FILE_DELETE_LE)) {
pr_err("no right to delete : 0x%x\n", fp->daccess);
return -EACCES;
@@ -5799,32 +5716,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
le32_to_cpu(rename_info->FileNameLength))
return -EINVAL;
- idmap = file_mnt_idmap(fp->filp);
- if (ksmbd_stream_fd(fp))
- goto next;
-
- parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
- if (ret) {
- dput(parent);
- return ret;
- }
-
- parent_fp = ksmbd_lookup_fd_inode(d_inode(parent));
- inode_unlock(d_inode(parent));
- dput(parent);
+ if (!le32_to_cpu(rename_info->FileNameLength))
+ return -EINVAL;
- if (parent_fp) {
- if (parent_fp->daccess & FILE_DELETE_LE) {
- pr_err("parent dir is opened with delete access\n");
- ksmbd_fd_put(work, parent_fp);
- return -ESHARE;
- }
- ksmbd_fd_put(work, parent_fp);
- }
-next:
- return smb2_rename(work, fp, idmap, rename_info,
- work->conn->local_nls);
+ return smb2_rename(work, fp, rename_info, work->conn->local_nls);
}
static int set_file_disposition_info(struct ksmbd_file *fp,
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 9420dd2813fb..67dc552f2ef7 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -70,18 +70,6 @@ struct create_durable_req_v2 {
__u8 CreateGuid[16];
} __packed;
-struct create_durable_reconn_req {
- struct create_context ccontext;
- __u8 Name[8];
- union {
- __u8 Reserved[16];
- struct {
- __u64 PersistentFileId;
- __u64 VolatileFileId;
- } Fid;
- } Data;
-} __packed;
-
struct create_durable_reconn_v2_req {
struct create_context ccontext;
__u8 Name[8];
@@ -109,12 +97,6 @@ struct create_app_inst_id_vers {
__le64 AppInstanceVersionLow;
} __packed;
-struct create_mxac_req {
- struct create_context ccontext;
- __u8 Name[8];
- __le64 Timestamp;
-} __packed;
-
struct create_alloc_size_req {
struct create_context ccontext;
__u8 Name[8];
@@ -137,21 +119,6 @@ struct create_durable_v2_rsp {
__le32 Flags;
} __packed;
-struct create_mxac_rsp {
- struct create_context ccontext;
- __u8 Name[8];
- __le32 QueryStatus;
- __le32 MaximalAccess;
-} __packed;
-
-struct create_disk_id_rsp {
- struct create_context ccontext;
- __u8 Name[8];
- __le64 DiskFileId;
- __le64 VolumeId;
- __u8 Reserved[16];
-} __packed;
-
/* equivalent of the contents of SMB3.1.1 POSIX open context response */
struct create_posix_rsp {
struct create_context ccontext;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 5ea9229dad2c..778c152708e4 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -18,8 +18,7 @@
#include <linux/vmalloc.h>
#include <linux/sched/xacct.h>
#include <linux/crc32c.h>
-
-#include "../internal.h" /* for vfs_path_lookup */
+#include <linux/namei.h>
#include "glob.h"
#include "oplock.h"
@@ -37,19 +36,6 @@
#include "mgmt/user_session.h"
#include "mgmt/user_config.h"
-static char *extract_last_component(char *path)
-{
- char *p = strrchr(path, '/');
-
- if (p && p[1] != '\0') {
- *p = '\0';
- p++;
- } else {
- p = NULL;
- }
- return p;
-}
-
static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
struct inode *parent_inode,
struct inode *inode)
@@ -63,65 +49,77 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
/**
* ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
- *
- * the parent dentry got by dget_parent or @parent could be
- * unstable, we try to lock a parent inode and lookup the
- * child dentry again.
- *
- * the reference count of @parent isn't incremented.
*/
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
- struct dentry *child)
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
{
- struct dentry *dentry;
- int ret = 0;
-
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
- dentry = lookup_one(idmap, child->d_name.name, parent,
- child->d_name.len);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- goto out_err;
- }
-
- if (dentry != child) {
- ret = -ESTALE;
- dput(dentry);
- goto out_err;
+ if (child->d_parent != parent) {
+ inode_unlock(d_inode(parent));
+ return -ENOENT;
}
- dput(dentry);
return 0;
-out_err:
- inode_unlock(d_inode(parent));
- return ret;
}
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap,
- struct dentry *dentry)
+static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
+ char *pathname, unsigned int flags,
+ struct path *path)
{
- struct dentry *parent;
- int ret;
+ struct qstr last;
+ struct filename *filename;
+ struct path *root_share_path = &share_conf->vfs_path;
+ int err, type;
+ struct path parent_path;
+ struct dentry *d;
+
+ if (pathname[0] == '\0') {
+ pathname = share_conf->path;
+ root_share_path = NULL;
+ } else {
+ flags |= LOOKUP_BENEATH;
+ }
+
+ filename = getname_kernel(pathname);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
+ err = vfs_path_parent_lookup(filename, flags,
+ &parent_path, &last, &type,
+ root_share_path);
+ putname(filename);
+ if (err)
+ return err;
- parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
- if (ret) {
- dput(parent);
- return ret;
+ if (unlikely(type != LAST_NORM)) {
+ path_put(&parent_path);
+ return -ENOENT;
}
- ret = inode_permission(idmap, d_inode(parent),
- MAY_EXEC | MAY_WRITE);
+ inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+ if (IS_ERR(d))
+ goto err_out;
- inode_unlock(d_inode(parent));
- dput(parent);
- return ret;
+ if (d_is_negative(d)) {
+ dput(d);
+ goto err_out;
+ }
+
+ path->dentry = d;
+ path->mnt = share_conf->vfs_path.mnt;
+ path_put(&parent_path);
+
+ return 0;
+
+err_out:
+ inode_unlock(parent_path.dentry->d_inode);
+ path_put(&parent_path);
+ return -ENOENT;
}
int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
struct dentry *dentry, __le32 *daccess)
{
- struct dentry *parent;
int ret = 0;
*daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL);
@@ -138,18 +136,9 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC))
*daccess |= FILE_EXECUTE_LE;
- parent = dget_parent(dentry);
- ret = ksmbd_vfs_lock_parent(idmap, parent, dentry);
- if (ret) {
- dput(parent);
- return ret;
- }
-
- if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE))
+ if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE))
*daccess |= FILE_DELETE_LE;
- inode_unlock(d_inode(parent));
- dput(parent);
return ret;
}
@@ -582,54 +571,32 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
{
struct mnt_idmap *idmap;
- struct path path;
- struct dentry *parent;
+ struct dentry *parent = path->dentry->d_parent;
int err;
if (ksmbd_override_fsids(work))
return -ENOMEM;
- err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false);
- if (err) {
- ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
- ksmbd_revert_fsids(work);
- return err;
- }
-
- idmap = mnt_idmap(path.mnt);
- parent = dget_parent(path.dentry);
- err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry);
- if (err) {
- dput(parent);
- path_put(&path);
- ksmbd_revert_fsids(work);
- return err;
- }
-
- if (!d_inode(path.dentry)->i_nlink) {
+ if (!d_inode(path->dentry)->i_nlink) {
err = -ENOENT;
goto out_err;
}
- if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
- err = vfs_rmdir(idmap, d_inode(parent), path.dentry);
+ idmap = mnt_idmap(path->mnt);
+ if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
+ err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
if (err && err != -ENOTEMPTY)
- ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
- err);
+ ksmbd_debug(VFS, "rmdir failed, err %d\n", err);
} else {
- err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL);
+ err = vfs_unlink(idmap, d_inode(parent), path->dentry, NULL);
if (err)
- ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
- err);
+ ksmbd_debug(VFS, "unlink failed, err %d\n", err);
}
out_err:
- inode_unlock(d_inode(parent));
- dput(parent);
- path_put(&path);
ksmbd_revert_fsids(work);
return err;
}
@@ -688,149 +655,114 @@ out1:
return err;
}
-static int ksmbd_validate_entry_in_use(struct dentry *src_dent)
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+ char *newname, int flags)
{
- struct dentry *dst_dent;
-
- spin_lock(&src_dent->d_lock);
- list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) {
- struct ksmbd_file *child_fp;
+ struct dentry *old_parent, *new_dentry, *trap;
+ struct dentry *old_child = old_path->dentry;
+ struct path new_path;
+ struct qstr new_last;
+ struct renamedata rd;
+ struct filename *to;
+ struct ksmbd_share_config *share_conf = work->tcon->share_conf;
+ struct ksmbd_file *parent_fp;
+ int new_type;
+ int err, lookup_flags = LOOKUP_NO_SYMLINKS;
- if (d_really_is_negative(dst_dent))
- continue;
+ if (ksmbd_override_fsids(work))
+ return -ENOMEM;
- child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent));
- if (child_fp) {
- spin_unlock(&src_dent->d_lock);
- ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n");
- return -EACCES;
- }
+ to = getname_kernel(newname);
+ if (IS_ERR(to)) {
+ err = PTR_ERR(to);
+ goto revert_fsids;
}
- spin_unlock(&src_dent->d_lock);
- return 0;
-}
+retry:
+ err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
+ &new_path, &new_last, &new_type,
+ &share_conf->vfs_path);
+ if (err)
+ goto out1;
-static int __ksmbd_vfs_rename(struct ksmbd_work *work,
- struct mnt_idmap *src_idmap,
- struct dentry *src_dent_parent,
- struct dentry *src_dent,
- struct mnt_idmap *dst_idmap,
- struct dentry *dst_dent_parent,
- struct dentry *trap_dent,
- char *dst_name)
-{
- struct dentry *dst_dent;
- int err;
+ if (old_path->mnt != new_path.mnt) {
+ err = -EXDEV;
+ goto out2;
+ }
- if (!work->tcon->posix_extensions) {
- err = ksmbd_validate_entry_in_use(src_dent);
- if (err)
- return err;
+ trap = lock_rename_child(old_child, new_path.dentry);
+
+ old_parent = dget(old_child->d_parent);
+ if (d_unhashed(old_child)) {
+ err = -EINVAL;
+ goto out3;
}
- if (d_really_is_negative(src_dent_parent))
- return -ENOENT;
- if (d_really_is_negative(dst_dent_parent))
- return -ENOENT;
- if (d_really_is_negative(src_dent))
- return -ENOENT;
- if (src_dent == trap_dent)
- return -EINVAL;
+ parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent));
+ if (parent_fp) {
+ if (parent_fp->daccess & FILE_DELETE_LE) {
+ pr_err("parent dir is opened with delete access\n");
+ err = -ESHARE;
+ ksmbd_fd_put(work, parent_fp);
+ goto out3;
+ }
+ ksmbd_fd_put(work, parent_fp);
+ }
- if (ksmbd_override_fsids(work))
- return -ENOMEM;
+ new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+ lookup_flags | LOOKUP_RENAME_TARGET);
+ if (IS_ERR(new_dentry)) {
+ err = PTR_ERR(new_dentry);
+ goto out3;
+ }
- dst_dent = lookup_one(dst_idmap, dst_name,
- dst_dent_parent, strlen(dst_name));
- err = PTR_ERR(dst_dent);
- if (IS_ERR(dst_dent)) {
- pr_err("lookup failed %s [%d]\n", dst_name, err);
- goto out;
+ if (d_is_symlink(new_dentry)) {
+ err = -EACCES;
+ goto out4;
}
- err = -ENOTEMPTY;
- if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) {
- struct renamedata rd = {
- .old_mnt_idmap = src_idmap,
- .old_dir = d_inode(src_dent_parent),
- .old_dentry = src_dent,
- .new_mnt_idmap = dst_idmap,
- .new_dir = d_inode(dst_dent_parent),
- .new_dentry = dst_dent,
- };
- err = vfs_rename(&rd);
+ if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
+ err = -EEXIST;
+ goto out4;
}
- if (err)
- pr_err("vfs_rename failed err %d\n", err);
- if (dst_dent)
- dput(dst_dent);
-out:
- ksmbd_revert_fsids(work);
- return err;
-}
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
- char *newname)
-{
- struct mnt_idmap *idmap;
- struct path dst_path;
- struct dentry *src_dent_parent, *dst_dent_parent;
- struct dentry *src_dent, *trap_dent, *src_child;
- char *dst_name;
- int err;
+ if (old_child == trap) {
+ err = -EINVAL;
+ goto out4;
+ }
- dst_name = extract_last_component(newname);
- if (!dst_name) {
- dst_name = newname;
- newname = "";
+ if (new_dentry == trap) {
+ err = -ENOTEMPTY;
+ goto out4;
}
- src_dent_parent = dget_parent(fp->filp->f_path.dentry);
- src_dent = fp->filp->f_path.dentry;
+ rd.old_mnt_idmap = mnt_idmap(old_path->mnt),
+ rd.old_dir = d_inode(old_parent),
+ rd.old_dentry = old_child,
+ rd.new_mnt_idmap = mnt_idmap(new_path.mnt),
+ rd.new_dir = new_path.dentry->d_inode,
+ rd.new_dentry = new_dentry,
+ rd.flags = flags,
+ err = vfs_rename(&rd);
+ if (err)
+ ksmbd_debug(VFS, "vfs_rename failed err %d\n", err);
- err = ksmbd_vfs_kern_path(work, newname,
- LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
- &dst_path, false);
- if (err) {
- ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err);
- goto out;
+out4:
+ dput(new_dentry);
+out3:
+ dput(old_parent);
+ unlock_rename(old_parent, new_path.dentry);
+out2:
+ path_put(&new_path);
+
+ if (retry_estale(err, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
}
- dst_dent_parent = dst_path.dentry;
-
- trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
- dget(src_dent);
- dget(dst_dent_parent);
- idmap = file_mnt_idmap(fp->filp);
- src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent,
- src_dent->d_name.len);
- if (IS_ERR(src_child)) {
- err = PTR_ERR(src_child);
- goto out_lock;
- }
-
- if (src_child != src_dent) {
- err = -ESTALE;
- dput(src_child);
- goto out_lock;
- }
- dput(src_child);
-
- err = __ksmbd_vfs_rename(work,
- idmap,
- src_dent_parent,
- src_dent,
- mnt_idmap(dst_path.mnt),
- dst_dent_parent,
- trap_dent,
- dst_name);
-out_lock:
- dput(src_dent);
- dput(dst_dent_parent);
- unlock_rename(src_dent_parent, dst_dent_parent);
- path_put(&dst_path);
-out:
- dput(src_dent_parent);
+out1:
+ putname(to);
+revert_fsids:
+ ksmbd_revert_fsids(work);
return err;
}
@@ -1081,14 +1013,16 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
return vfs_removexattr(idmap, dentry, attr_name);
}
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
- struct dentry *dir, struct dentry *dentry)
+int ksmbd_vfs_unlink(struct file *filp)
{
int err = 0;
+ struct dentry *dir, *dentry = filp->f_path.dentry;
+ struct mnt_idmap *idmap = file_mnt_idmap(filp);
- err = ksmbd_vfs_lock_parent(idmap, dir, dentry);
+ dir = dget_parent(dentry);
+ err = ksmbd_vfs_lock_parent(dir, dentry);
if (err)
- return err;
+ goto out;
dget(dentry);
if (S_ISDIR(d_inode(dentry)->i_mode))
@@ -1100,6 +1034,8 @@ int ksmbd_vfs_unlink(struct mnt_idmap *idmap,
inode_unlock(d_inode(dir));
if (err)
ksmbd_debug(VFS, "failed to delete, err %d\n", err);
+out:
+ dput(dir);
return err;
}
@@ -1202,7 +1138,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
}
/**
- * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
* @name: file path that is relative to share
* @flags: lookup flags
* @path: if lookup succeed, return path info
@@ -1210,24 +1146,20 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *path, bool caseless)
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+ unsigned int flags, struct path *path,
+ bool caseless)
{
struct ksmbd_share_config *share_conf = work->tcon->share_conf;
int err;
+ struct path parent_path;
- flags |= LOOKUP_BENEATH;
- err = vfs_path_lookup(share_conf->vfs_path.dentry,
- share_conf->vfs_path.mnt,
- name,
- flags,
- path);
+ err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path);
if (!err)
- return 0;
+ return err;
if (caseless) {
char *filepath;
- struct path parent;
size_t path_len, remain_len;
filepath = kstrdup(name, GFP_KERNEL);
@@ -1237,10 +1169,10 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
path_len = strlen(filepath);
remain_len = path_len;
- parent = share_conf->vfs_path;
- path_get(&parent);
+ parent_path = share_conf->vfs_path;
+ path_get(&parent_path);
- while (d_can_lookup(parent.dentry)) {
+ while (d_can_lookup(parent_path.dentry)) {
char *filename = filepath + path_len - remain_len;
char *next = strchrnul(filename, '/');
size_t filename_len = next - filename;
@@ -1249,12 +1181,11 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
if (filename_len == 0)
break;
- err = ksmbd_vfs_lookup_in_dir(&parent, filename,
+ err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
filename_len,
work->conn->um);
- path_put(&parent);
if (err)
- goto out;
+ goto out2;
next[0] = '\0';
@@ -1262,23 +1193,31 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
share_conf->vfs_path.mnt,
filepath,
flags,
- &parent);
+ path);
if (err)
- goto out;
- else if (is_last) {
- *path = parent;
- goto out;
- }
+ goto out2;
+ else if (is_last)
+ goto out1;
+ path_put(&parent_path);
+ parent_path = *path;
next[0] = '/';
remain_len -= filename_len + 1;
}
- path_put(&parent);
err = -EINVAL;
-out:
+out2:
+ path_put(&parent_path);
+out1:
kfree(filepath);
}
+
+ if (!err) {
+ err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry);
+ if (err)
+ dput(path->dentry);
+ path_put(&parent_path);
+ }
return err;
}
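(Editorial note, not part of the diff.) Replacing ksmbd_vfs_kern_path() with ksmbd_vfs_kern_path_locked() changes the contract: on success the parent directory inode comes back locked, and releasing it is now the caller's job, as the smb2_open() and smb2_create_link() hunks above show. An illustrative caller; example_locked_lookup() is hypothetical, not from the patch:

/*
 * Hypothetical caller, for illustration only: unlock the parent and drop
 * the path once done, mirroring smb2_create_link() above.
 */
static int example_locked_lookup(struct ksmbd_work *work, char *name)
{
	struct path path;
	int rc;

	rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
					&path, 0);
	if (rc)
		return rc;		/* nothing locked on failure */

	/* ... act on path.dentry while the parent cannot change ... */

	inode_unlock(d_inode(path.dentry->d_parent));
	path_put(&path);
	return 0;
}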
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 9d676ab0cd25..a4ae89f3230d 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -71,9 +71,7 @@ struct ksmbd_kstat {
__le32 file_attributes;
};
-int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent,
- struct dentry *child);
-int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry);
+int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
struct dentry *dentry, __le32 *daccess);
int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode);
@@ -84,12 +82,12 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
char *buf, size_t count, loff_t *pos, bool sync,
ssize_t *written);
int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
-int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
+int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path);
int ksmbd_vfs_link(struct ksmbd_work *work,
const char *oldname, const char *newname);
int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
-int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
- char *newname);
+int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
+ char *newname, int flags);
int ksmbd_vfs_truncate(struct ksmbd_work *work,
struct ksmbd_file *fp, loff_t size);
struct srv_copychunk;
@@ -116,9 +114,9 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
struct dentry *dentry, char *attr_name);
-int ksmbd_vfs_kern_path(struct ksmbd_work *work,
- char *name, unsigned int flags, struct path *path,
- bool caseless);
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
+ unsigned int flags, struct path *path,
+ bool caseless);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -131,8 +129,7 @@ struct file_allocated_range_buffer;
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
struct file_allocated_range_buffer *ranges,
unsigned int in_count, unsigned int *out_count);
-int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir,
- struct dentry *dentry);
+int ksmbd_vfs_unlink(struct file *filp);
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat);
int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
struct mnt_idmap *idmap,
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 054a7d2e0f48..2d0138e72d78 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -244,7 +244,6 @@ void ksmbd_release_inode_hash(void)
static void __ksmbd_inode_close(struct ksmbd_file *fp)
{
- struct dentry *dir, *dentry;
struct ksmbd_inode *ci = fp->f_ci;
int err;
struct file *filp;
@@ -263,11 +262,9 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
if (atomic_dec_and_test(&ci->m_count)) {
write_lock(&ci->m_lock);
if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) {
- dentry = filp->f_path.dentry;
- dir = dentry->d_parent;
ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING);
write_unlock(&ci->m_lock);
- ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry);
+ ksmbd_vfs_unlink(filp);
write_lock(&ci->m_lock);
}
write_unlock(&ci->m_lock);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 6d5e83ed4476..ac9f9d84510e 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -3,10 +3,12 @@
# Makefile for the linux lock manager stuff
#
+ccflags-y += -I$(src) # needed for trace events
+
obj-$(CONFIG_LOCKD) += lockd.o
-lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o xdr.o
+lockd-objs-y += clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
+ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o
lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
lockd-objs-$(CONFIG_PROC_FS) += procfs.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 82b19a30e0f0..e3972aa3045a 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,9 +14,12 @@
#include <linux/nfs_fs.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/lockd.h>
#include <linux/kthread.h>
+#include "trace.h"
+
#define NLMDBG_FACILITY NLMDBG_CLIENT
/*
@@ -29,18 +32,6 @@ static int reclaimer(void *ptr);
* client perspective.
*/
-/*
- * This is the representation of a blocked client lock.
- */
-struct nlm_wait {
- struct list_head b_list; /* linked list */
- wait_queue_head_t b_wait; /* where to wait on */
- struct nlm_host * b_host;
- struct file_lock * b_lock; /* local file lock */
- unsigned short b_reclaim; /* got to reclaim lock */
- __be32 b_status; /* grant callback status */
-};
-
static LIST_HEAD(nlm_blocked);
static DEFINE_SPINLOCK(nlm_blocked_lock);
@@ -94,41 +85,42 @@ void nlmclnt_done(struct nlm_host *host)
}
EXPORT_SYMBOL_GPL(nlmclnt_done);
+void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct file_lock *fl)
+{
+ block->b_host = host;
+ block->b_lock = fl;
+ init_waitqueue_head(&block->b_wait);
+ block->b_status = nlm_lck_blocked;
+}
+
/*
* Queue up a lock for blocking so that the GRANTED request can see it
*/
-struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl)
+void nlmclnt_queue_block(struct nlm_wait *block)
{
- struct nlm_wait *block;
-
- block = kmalloc(sizeof(*block), GFP_KERNEL);
- if (block != NULL) {
- block->b_host = host;
- block->b_lock = fl;
- init_waitqueue_head(&block->b_wait);
- block->b_status = nlm_lck_blocked;
-
- spin_lock(&nlm_blocked_lock);
- list_add(&block->b_list, &nlm_blocked);
- spin_unlock(&nlm_blocked_lock);
- }
- return block;
+ spin_lock(&nlm_blocked_lock);
+ list_add(&block->b_list, &nlm_blocked);
+ spin_unlock(&nlm_blocked_lock);
}
-void nlmclnt_finish_block(struct nlm_wait *block)
+/*
+ * Dequeue the block and return its final status
+ */
+__be32 nlmclnt_dequeue_block(struct nlm_wait *block)
{
- if (block == NULL)
- return;
+ __be32 status;
+
spin_lock(&nlm_blocked_lock);
list_del(&block->b_list);
+ status = block->b_status;
spin_unlock(&nlm_blocked_lock);
- kfree(block);
+ return status;
}
/*
* Block on a lock
*/
-int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
+int nlmclnt_wait(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
{
long ret;
@@ -154,7 +146,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/* Reset the lock status after a server reboot so we resend */
if (block->b_status == nlm_lck_denied_grace_period)
block->b_status = nlm_lck_blocked;
- req->a_res.status = block->b_status;
return 0;
}
@@ -198,6 +189,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
res = nlm_granted;
}
spin_unlock(&nlm_blocked_lock);
+ trace_nlmclnt_grant(lock, addr, svc_addr_len(addr), res);
return res;
}
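The nlm_wait is now supplied by the caller instead of being kmalloc'd, and the prepare/queue/dequeue steps are split so the wait can be queued before the LOCK request goes out. Condensed, the caller-side sequence (spelled out in fs/lockd/clntproc.c below) reduces to this sketch:

struct nlm_wait block;
__be32 b_status;

nlmclnt_prepare_block(&block, host, fl);	/* initialise the on-stack wait */
nlmclnt_queue_block(&block);			/* now visible to GRANTED callbacks */
/* ... send NLMPROC_LOCK, calling nlmclnt_wait(&block, req, timeout) while blocked ... */
b_status = nlmclnt_dequeue_block(&block);	/* unlink and read the final status */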
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 16b4de868cd2..fba6c7fa7474 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -20,6 +20,8 @@
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
+#include "trace.h"
+
#define NLMDBG_FACILITY NLMDBG_CLIENT
#define NLMCLNT_GRACE_WAIT (5*HZ)
#define NLMCLNT_POLL_TIMEOUT (30*HZ)
@@ -451,6 +453,9 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
status = nlm_stat_to_errno(req->a_res.status);
}
out:
+ trace_nlmclnt_test(&req->a_args.lock,
+ (const struct sockaddr *)&req->a_host->h_addr,
+ req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
}
@@ -516,9 +521,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
const struct cred *cred = nfs_file_cred(fl->fl_file);
struct nlm_host *host = req->a_host;
struct nlm_res *resp = &req->a_res;
- struct nlm_wait *block = NULL;
+ struct nlm_wait block;
unsigned char fl_flags = fl->fl_flags;
unsigned char fl_type;
+ __be32 b_status;
int status = -ENOLCK;
if (nsm_monitor(host) < 0)
@@ -531,31 +537,41 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
if (status < 0)
goto out;
- block = nlmclnt_prepare_block(host, fl);
+ nlmclnt_prepare_block(&block, host, fl);
again:
/*
* Initialise resp->status to a valid non-zero value,
* since 0 == nlm_lck_granted
*/
resp->status = nlm_lck_blocked;
- for(;;) {
+
+ /*
+ * A GRANTED callback can come at any time -- even before the reply
+ * to the LOCK request arrives, so we queue the wait before
+ * requesting the lock.
+ */
+ nlmclnt_queue_block(&block);
+ for (;;) {
/* Reboot protection */
fl->fl_u.nfs_fl.state = host->h_state;
status = nlmclnt_call(cred, req, NLMPROC_LOCK);
if (status < 0)
break;
/* Did a reclaimer thread notify us of a server reboot? */
- if (resp->status == nlm_lck_denied_grace_period)
+ if (resp->status == nlm_lck_denied_grace_period)
continue;
if (resp->status != nlm_lck_blocked)
break;
/* Wait on an NLM blocking lock */
- status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
+ status = nlmclnt_wait(&block, req, NLMCLNT_POLL_TIMEOUT);
if (status < 0)
break;
- if (resp->status != nlm_lck_blocked)
+ if (block.b_status != nlm_lck_blocked)
break;
}
+ b_status = nlmclnt_dequeue_block(&block);
+ if (resp->status == nlm_lck_blocked)
+ resp->status = b_status;
/* if we were interrupted while blocking, then cancel the lock request
* and exit
@@ -564,7 +580,7 @@ again:
if (!req->a_args.block)
goto out_unlock;
if (nlmclnt_cancel(host, req->a_args.block, fl) == 0)
- goto out_unblock;
+ goto out;
}
if (resp->status == nlm_granted) {
@@ -593,16 +609,19 @@ again:
status = -ENOLCK;
else
status = nlm_stat_to_errno(resp->status);
-out_unblock:
- nlmclnt_finish_block(block);
out:
+ trace_nlmclnt_lock(&req->a_args.lock,
+ (const struct sockaddr *)&req->a_host->h_addr,
+ req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
out_unlock:
/* Fatal error: ensure that we remove the lock altogether */
+ trace_nlmclnt_lock(&req->a_args.lock,
+ (const struct sockaddr *)&req->a_host->h_addr,
+ req->a_host->h_addrlen, req->a_res.status);
dprintk("lockd: lock attempt ended in fatal error.\n"
" Attempting to unlock.\n");
- nlmclnt_finish_block(block);
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
down_read(&host->h_rwsem);
@@ -696,6 +715,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
/* What to do now? I'm out of my depth... */
status = -ENOLCK;
out:
+ trace_nlmclnt_unlock(&req->a_args.lock,
+ (const struct sockaddr *)&req->a_host->h_addr,
+ req->a_host->h_addrlen, req->a_res.status);
nlmclnt_release_call(req);
return status;
}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index cdc8e12cdac4..127a728fcbc8 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -629,6 +629,7 @@ nlm_shutdown_hosts_net(struct net *net)
rpc_shutdown_client(host->h_rpcclnt);
host->h_rpcclnt = NULL;
}
+ nlmsvc_free_host_resources(host);
}
/* Then, perform a garbage collection pass */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9a47303b2cba..bb94949bc223 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -510,24 +510,6 @@ static struct ctl_table nlm_sysctls[] = {
{ }
};
-static struct ctl_table nlm_sysctl_dir[] = {
- {
- .procname = "nfs",
- .mode = 0555,
- .child = nlm_sysctls,
- },
- { }
-};
-
-static struct ctl_table nlm_sysctl_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = nlm_sysctl_dir,
- },
- { }
-};
-
#endif /* CONFIG_SYSCTL */
/*
@@ -644,7 +626,7 @@ static int __init init_nlm(void)
#ifdef CONFIG_SYSCTL
err = -ENOMEM;
- nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
+ nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls);
if (nlm_sysctl_table == NULL)
goto err_sysctl;
#endif
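The nested nlm_sysctl_dir/nlm_sysctl_root tables become unnecessary because register_sysctl() takes a path string and creates the intermediate directories itself. A sketch of the resulting registration and teardown pair (error handling trimmed):

static struct ctl_table_header *nlm_sysctl_table;

/* "fs/nfs" replaces the old two-level directory tables. */
nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls);
if (nlm_sysctl_table == NULL)
	return -ENOMEM;

/* Teardown is unchanged: */
unregister_sysctl_table(nlm_sysctl_table);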
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 4e30f3c50970..c43ccdf28ed9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -954,19 +954,32 @@ void
nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
{
struct nlm_block *block;
+ struct file_lock *fl;
+ int error;
dprintk("grant_reply: looking for cookie %x, s=%d \n",
*(unsigned int *)(cookie->data), status);
if (!(block = nlmsvc_find_block(cookie)))
return;
- if (status == nlm_lck_denied_grace_period) {
+ switch (status) {
+ case nlm_lck_denied_grace_period:
/* Try again in a couple of seconds */
nlmsvc_insert_block(block, 10 * HZ);
- } else {
+ break;
+ case nlm_lck_denied:
+ /* Client doesn't want it, just unlock it */
+ nlmsvc_unlink_block(block);
+ fl = &block->b_call->a_args.lock.fl;
+ fl->fl_type = F_UNLCK;
+ error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL);
+ if (error)
+ pr_warn("lockd: unable to unlock lock rejected by client!\n");
+ break;
+ default:
/*
- * Lock is now held by client, or has been rejected.
- * In both cases, the block should be removed.
+		 * Either it was accepted or the status makes no sense;
+		 * just unlink it either way.
*/
nlmsvc_unlink_block(block);
}
diff --git a/fs/lockd/trace.c b/fs/lockd/trace.c
new file mode 100644
index 000000000000..d9a6ff6e673c
--- /dev/null
+++ b/fs/lockd/trace.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/lockd/trace.h b/fs/lockd/trace.h
new file mode 100644
index 000000000000..7461b13b6e74
--- /dev/null
+++ b/fs/lockd/trace.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockd
+
+#if !defined(_TRACE_LOCKD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKD_H
+
+#include <linux/tracepoint.h>
+#include <linux/crc32.h>
+#include <linux/nfs.h>
+#include <linux/lockd/lockd.h>
+
+#ifdef CONFIG_LOCKD_V4
+#define NLM_STATUS_LIST \
+ nlm_status_code(LCK_GRANTED) \
+ nlm_status_code(LCK_DENIED) \
+ nlm_status_code(LCK_DENIED_NOLOCKS) \
+ nlm_status_code(LCK_BLOCKED) \
+ nlm_status_code(LCK_DENIED_GRACE_PERIOD) \
+ nlm_status_code(DEADLCK) \
+ nlm_status_code(ROFS) \
+ nlm_status_code(STALE_FH) \
+ nlm_status_code(FBIG) \
+ nlm_status_code_end(FAILED)
+#else
+#define NLM_STATUS_LIST \
+ nlm_status_code(LCK_GRANTED) \
+ nlm_status_code(LCK_DENIED) \
+ nlm_status_code(LCK_DENIED_NOLOCKS) \
+ nlm_status_code(LCK_BLOCKED) \
+ nlm_status_code_end(LCK_DENIED_GRACE_PERIOD)
+#endif
+
+#undef nlm_status_code
+#undef nlm_status_code_end
+#define nlm_status_code(x) TRACE_DEFINE_ENUM(NLM_##x);
+#define nlm_status_code_end(x) TRACE_DEFINE_ENUM(NLM_##x);
+
+NLM_STATUS_LIST
+
+#undef nlm_status_code
+#undef nlm_status_code_end
+#define nlm_status_code(x) { NLM_##x, #x },
+#define nlm_status_code_end(x) { NLM_##x, #x }
+
+#define show_nlm_status(x) __print_symbolic(x, NLM_STATUS_LIST)
+
+DECLARE_EVENT_CLASS(nlmclnt_lock_event,
+ TP_PROTO(
+ const struct nlm_lock *lock,
+ const struct sockaddr *addr,
+ unsigned int addrlen,
+ __be32 status
+ ),
+
+ TP_ARGS(lock, addr, addrlen, status),
+
+ TP_STRUCT__entry(
+ __field(u32, oh)
+ __field(u32, svid)
+ __field(u32, fh)
+ __field(unsigned long, status)
+ __field(u64, start)
+ __field(u64, len)
+ __sockaddr(addr, addrlen)
+ ),
+
+ TP_fast_assign(
+ __entry->oh = ~crc32_le(0xffffffff, lock->oh.data, lock->oh.len);
+ __entry->svid = lock->svid;
+ __entry->fh = nfs_fhandle_hash(&lock->fh);
+ __entry->start = lock->lock_start;
+ __entry->len = lock->lock_len;
+ __entry->status = be32_to_cpu(status);
+ __assign_sockaddr(addr, addr, addrlen);
+ ),
+
+ TP_printk(
+ "addr=%pISpc oh=0x%08x svid=0x%08x fh=0x%08x start=%llu len=%llu status=%s",
+ __get_sockaddr(addr), __entry->oh, __entry->svid,
+ __entry->fh, __entry->start, __entry->len,
+ show_nlm_status(__entry->status)
+ )
+);
+
+#define DEFINE_NLMCLNT_EVENT(name) \
+ DEFINE_EVENT(nlmclnt_lock_event, name, \
+ TP_PROTO( \
+ const struct nlm_lock *lock, \
+ const struct sockaddr *addr, \
+ unsigned int addrlen, \
+ __be32 status \
+ ), \
+ TP_ARGS(lock, addr, addrlen, status))
+
+DEFINE_NLMCLNT_EVENT(nlmclnt_test);
+DEFINE_NLMCLNT_EVENT(nlmclnt_lock);
+DEFINE_NLMCLNT_EVENT(nlmclnt_unlock);
+DEFINE_NLMCLNT_EVENT(nlmclnt_grant);
+
+#endif /* _TRACE_LOCKD_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
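Because the four tracepoints share the nlmclnt_lock_event class, adding another one later needs only a single DEFINE line plus a call site. A hypothetical example (nlmclnt_cancel is not defined by this patch):

/* In trace.h: */
DEFINE_NLMCLNT_EVENT(nlmclnt_cancel);

/* At the call site, mirroring the existing trace_nlmclnt_lock() calls: */
trace_nlmclnt_cancel(&req->a_args.lock,
		     (const struct sockaddr *)&req->a_host->h_addr,
		     req->a_host->h_addrlen, req->a_res.status);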
diff --git a/fs/mpage.c b/fs/mpage.c
index 22b9de5ddd68..242e213ee064 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,23 +43,49 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio)
+static void mpage_read_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
+ int err = blk_status_to_errno(bio->bi_status);
+
+ bio_for_each_folio_all(fi, bio) {
+ if (err)
+ folio_set_error(fi.folio);
+ else
+ folio_mark_uptodate(fi.folio);
+ folio_unlock(fi.folio);
+ }
+
+ bio_put(bio);
+}
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
- page_endio(page, bio_op(bio),
- blk_status_to_errno(bio->bi_status));
+static void mpage_write_end_io(struct bio *bio)
+{
+ struct folio_iter fi;
+ int err = blk_status_to_errno(bio->bi_status);
+
+ bio_for_each_folio_all(fi, bio) {
+ if (err) {
+ folio_set_error(fi.folio);
+ mapping_set_error(fi.folio->mapping, err);
+ }
+ folio_end_writeback(fi.folio);
}
bio_put(bio);
}
-static struct bio *mpage_bio_submit(struct bio *bio)
+static struct bio *mpage_bio_submit_read(struct bio *bio)
+{
+ bio->bi_end_io = mpage_read_end_io;
+ guard_bio_eod(bio);
+ submit_bio(bio);
+ return NULL;
+}
+
+static struct bio *mpage_bio_submit_write(struct bio *bio)
{
- bio->bi_end_io = mpage_end_io;
+ bio->bi_end_io = mpage_write_end_io;
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
@@ -265,7 +291,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* This folio will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
alloc_new:
if (args->bio == NULL) {
@@ -278,7 +304,7 @@ alloc_new:
length = first_hole << blkbits;
if (!bio_add_folio(args->bio, folio, length, 0)) {
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
goto alloc_new;
}
@@ -286,7 +312,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -294,7 +320,7 @@ out:
confused:
if (args->bio)
- args->bio = mpage_bio_submit(args->bio);
+ args->bio = mpage_bio_submit_read(args->bio);
if (!folio_test_uptodate(folio))
block_read_full_folio(folio, args->get_block);
else
@@ -356,7 +382,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
}
if (args.bio)
- mpage_bio_submit(args.bio);
+ mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
@@ -373,7 +399,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
if (args.bio)
- mpage_bio_submit(args.bio);
+ mpage_bio_submit_read(args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_read_folio);
@@ -577,7 +603,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
alloc_new:
if (bio == NULL) {
@@ -596,7 +622,7 @@ alloc_new:
wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
goto alloc_new;
}
@@ -606,7 +632,7 @@ alloc_new:
folio_start_writeback(folio);
folio_unlock(folio);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -618,7 +644,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(bio);
+ bio = mpage_bio_submit_write(bio);
/*
* The caller has a ref on the inode, so *mapping is stable
@@ -652,7 +678,7 @@ mpage_writepages(struct address_space *mapping,
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio)
- mpage_bio_submit(mpd.bio);
+ mpage_bio_submit_write(mpd.bio);
blk_finish_plug(&plug);
return ret;
}
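The exported entry points keep their names, so filesystems built on the mpage helpers are unaffected by the read/write split. As a hedged sketch, a block-based filesystem typically wires them up like this (the example_* names are placeholders, not from this patch):

static int example_read_folio(struct file *file, struct folio *folio)
{
	return mpage_read_folio(folio, example_get_block);
}

static void example_readahead(struct readahead_control *rac)
{
	mpage_readahead(rac, example_get_block);
}

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, example_get_block);
}

static const struct address_space_operations example_aops = {
	.read_folio	= example_read_folio,
	.readahead	= example_readahead,
	.writepages	= example_writepages,
};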
diff --git a/fs/namei.c b/fs/namei.c
index f04f7be58933..e4fe0879ae55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -254,6 +254,7 @@ getname_kernel(const char * filename)
return result;
}
+EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
@@ -271,6 +272,7 @@ void putname(struct filename *name)
} else
__putname(name);
}
+EXPORT_SYMBOL(putname);
/**
* check_acl - perform ACL permission checking
@@ -1581,8 +1583,9 @@ static struct dentry *lookup_dcache(const struct qstr *name,
* when directory is guaranteed to have no in-lookup children
* at all.
*/
-static struct dentry *__lookup_hash(const struct qstr *name,
- struct dentry *base, unsigned int flags)
+struct dentry *lookup_one_qstr_excl(const struct qstr *name,
+ struct dentry *base,
+ unsigned int flags)
{
struct dentry *dentry = lookup_dcache(name, base, flags);
struct dentry *old;
@@ -1606,6 +1609,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
}
return dentry;
}
+EXPORT_SYMBOL(lookup_one_qstr_excl);
static struct dentry *lookup_fast(struct nameidata *nd)
{
@@ -2532,16 +2536,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
}
/* Note: this does not consume "name" */
-static int filename_parentat(int dfd, struct filename *name,
- unsigned int flags, struct path *parent,
- struct qstr *last, int *type)
+static int __filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type,
+ const struct path *root)
{
int retval;
struct nameidata nd;
if (IS_ERR(name))
return PTR_ERR(name);
- set_nameidata(&nd, dfd, name, NULL);
+ set_nameidata(&nd, dfd, name, root);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
retval = path_parentat(&nd, flags, parent);
@@ -2556,6 +2561,13 @@ static int filename_parentat(int dfd, struct filename *name,
return retval;
}
+static int filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type)
+{
+ return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
+}
+
/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
{
@@ -2571,7 +2583,7 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
return ERR_PTR(-EINVAL);
}
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- d = __lookup_hash(&last, path->dentry, 0);
+ d = lookup_one_qstr_excl(&last, path->dentry, 0);
if (IS_ERR(d)) {
inode_unlock(path->dentry->d_inode);
path_put(path);
@@ -2600,6 +2612,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
EXPORT_SYMBOL(kern_path);
/**
+ * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
+ * @filename: filename structure
+ * @flags: lookup flags
+ * @parent: pointer to struct path to fill
+ * @last: last component
+ * @type: type of the last component
+ * @root: pointer to struct path of the base directory
+ */
+int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
+ struct path *parent, struct qstr *last, int *type,
+ const struct path *root)
+{
+ return __filename_parentat(AT_FDCWD, filename, flags, parent, last,
+ type, root);
+}
+EXPORT_SYMBOL(vfs_path_parent_lookup);
+
+/**
* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
* @dentry: pointer to dentry of the base directory
* @mnt: pointer to vfs mount of the base directory
@@ -2980,20 +3010,10 @@ static inline int may_create(struct mnt_idmap *idmap,
return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
-/*
- * p1 and p2 should be directories on the same fs.
- */
-struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
struct dentry *p;
- if (p1 == p2) {
- inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
- return NULL;
- }
-
- mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
-
p = d_ancestor(p2, p1);
if (p) {
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
@@ -3012,8 +3032,64 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
return NULL;
}
+
+/*
+ * p1 and p2 should be directories on the same fs.
+ */
+struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+{
+ if (p1 == p2) {
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ return NULL;
+ }
+
+ mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+ return lock_two_directories(p1, p2);
+}
EXPORT_SYMBOL(lock_rename);
+/*
+ * c1 and p2 should be on the same fs.
+ */
+struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
+{
+ if (READ_ONCE(c1->d_parent) == p2) {
+ /*
+ * hopefully won't need to touch ->s_vfs_rename_mutex at all.
+ */
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+ /*
+ * now that p2 is locked, nobody can move in or out of it,
+ * so the test below is safe.
+ */
+ if (likely(c1->d_parent == p2))
+ return NULL;
+
+ /*
+ * c1 got moved out of p2 while we'd been taking locks;
+ * unlock and fall back to slow case.
+ */
+ inode_unlock(p2->d_inode);
+ }
+
+ mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
+ /*
+ * nobody can move out of any directories on this fs.
+ */
+ if (likely(c1->d_parent != p2))
+ return lock_two_directories(c1->d_parent, p2);
+
+ /*
+ * c1 got moved into p2 while we were taking locks;
+ * we need p2 locked and ->s_vfs_rename_mutex unlocked,
+ * for consistency with lock_rename().
+ */
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+ mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
+ return NULL;
+}
+EXPORT_SYMBOL(lock_rename_child);
+
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
inode_unlock(p1->d_inode);
@@ -3806,7 +3882,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
if (last.name[last.len] && !want_dir)
create_flags = 0;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
+ dentry = lookup_one_qstr_excl(&last, path->dentry,
+ reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -4166,7 +4243,7 @@ retry:
goto exit2;
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+ dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
@@ -4299,7 +4376,7 @@ retry:
goto exit2;
retry_deleg:
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+ dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
@@ -4863,7 +4940,8 @@ retry:
retry_deleg:
trap = lock_rename(new_path.dentry, old_path.dentry);
- old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
+ old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
+ lookup_flags);
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
@@ -4871,7 +4949,8 @@ retry_deleg:
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
- new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
+ new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
+ lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
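lock_rename_child() gives a caller that starts from a child dentry (rather than from both parent directories, as lock_rename() assumes) the same locking guarantees; once it returns, the child's parent is stable and locked. A hedged sketch of the expected pairing with unlock_rename():

struct dentry *trap;

trap = lock_rename_child(old_child, new_parent);
/* ... look up the source and target names, giving up if either equals trap ... */
unlock_rename(old_child->d_parent, new_parent);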
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index e3d754a9e1b0..3404707ddbe7 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -349,8 +349,8 @@ int netfs_write_begin(struct netfs_inode *ctx,
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c1c7ed2fd860..b6fc169be1b1 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -170,6 +170,7 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+ select NETFS_SUPPORT
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6fbcbb8d6587..8257de6dba45 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static void nfs_readdir_free_folio(struct folio *);
+static void nfs_readdir_clear_array(struct folio *);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = {
};
const struct address_space_operations nfs_dir_aops = {
- .free_folio = nfs_readdir_free_folio,
+ .free_folio = nfs_readdir_clear_array,
};
#define NFS_INIT_DTSIZE PAGE_SIZE
@@ -146,18 +146,18 @@ struct nfs_cache_array {
u64 change_attr;
u64 last_cookie;
unsigned int size;
- unsigned char page_full : 1,
- page_is_eof : 1,
+ unsigned char folio_full : 1,
+ folio_is_eof : 1,
cookies_are_ordered : 1;
struct nfs_cache_array_entry array[];
};
struct nfs_readdir_descriptor {
struct file *file;
- struct page *page;
+ struct folio *folio;
struct dir_context *ctx;
- pgoff_t page_index;
- pgoff_t page_index_max;
+ pgoff_t folio_index;
+ pgoff_t folio_index_max;
u64 dir_cookie;
u64 last_cookie;
loff_t current_index;
@@ -198,17 +198,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
nfs_set_dtsize(desc, desc->dtsize << 1);
}
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
- u64 change_attr)
+static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
{
struct nfs_cache_array *array;
- array = kmap_local_page(page);
+ array = kmap_local_folio(folio, 0);
array->change_attr = change_attr;
array->last_cookie = last_cookie;
array->size = 0;
- array->page_full = 0;
- array->page_is_eof = 0;
+ array->folio_full = 0;
+ array->folio_is_eof = 0;
array->cookies_are_ordered = 1;
kunmap_local(array);
}
@@ -216,44 +216,39 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
-static void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct folio *folio)
{
struct nfs_cache_array *array;
unsigned int i;
- array = kmap_local_page(page);
+ array = kmap_local_folio(folio, 0);
for (i = 0; i < array->size; i++)
kfree(array->array[i].name);
array->size = 0;
kunmap_local(array);
}
-static void nfs_readdir_free_folio(struct folio *folio)
+static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
{
- nfs_readdir_clear_array(&folio->page);
+ nfs_readdir_clear_array(folio);
+ nfs_readdir_folio_init_array(folio, last_cookie, change_attr);
}
-static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
- u64 change_attr)
+static struct folio *
+nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags)
{
- nfs_readdir_clear_array(page);
- nfs_readdir_page_init_array(page, last_cookie, change_attr);
+ struct folio *folio = folio_alloc(gfp_flags, 0);
+ if (folio)
+ nfs_readdir_folio_init_array(folio, last_cookie, 0);
+ return folio;
}
-static struct page *
-nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
+static void nfs_readdir_folio_array_free(struct folio *folio)
{
- struct page *page = alloc_page(gfp_flags);
- if (page)
- nfs_readdir_page_init_array(page, last_cookie, 0);
- return page;
-}
-
-static void nfs_readdir_page_array_free(struct page *page)
-{
- if (page) {
- nfs_readdir_clear_array(page);
- put_page(page);
+ if (folio) {
+ nfs_readdir_clear_array(folio);
+ folio_put(folio);
}
}
@@ -264,13 +259,13 @@ static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
{
- array->page_is_eof = 1;
- array->page_full = 1;
+ array->folio_is_eof = 1;
+ array->folio_full = 1;
}
static bool nfs_readdir_array_is_full(struct nfs_cache_array *array)
{
- return array->page_full;
+ return array->folio_full;
}
/*
@@ -302,18 +297,18 @@ static size_t nfs_readdir_array_maxentries(void)
*/
static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
{
- if (array->page_full)
+ if (array->folio_full)
return -ENOSPC;
if (array->size == nfs_readdir_array_maxentries()) {
- array->page_full = 1;
+ array->folio_full = 1;
return -ENOSPC;
}
return 0;
}
-static int nfs_readdir_page_array_append(struct page *page,
- const struct nfs_entry *entry,
- u64 *cookie)
+static int nfs_readdir_folio_array_append(struct folio *folio,
+ const struct nfs_entry *entry,
+ u64 *cookie)
{
struct nfs_cache_array *array;
struct nfs_cache_array_entry *cache_entry;
@@ -322,7 +317,7 @@ static int nfs_readdir_page_array_append(struct page *page,
name = nfs_readdir_copy_name(entry->name, entry->len);
- array = kmap_atomic(page);
+ array = kmap_atomic(folio_page(folio, 0));
if (!name)
goto out;
ret = nfs_readdir_array_can_expand(array);
@@ -361,17 +356,17 @@ out:
* 127 readdir entries for a typical 64-bit system, that works out to a
* cache of ~ 33 million entries per directory.
*/
-static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie)
{
if (cookie == 0)
return 0;
return hash_64(cookie, 18);
}
-static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
- u64 change_attr)
+static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie,
+ u64 change_attr)
{
- struct nfs_cache_array *array = kmap_local_page(page);
+ struct nfs_cache_array *array = kmap_local_folio(folio, 0);
int ret = true;
if (array->change_attr != change_attr)
@@ -382,81 +377,83 @@ static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
return ret;
}
-static void nfs_readdir_page_unlock_and_put(struct page *page)
+static void nfs_readdir_folio_unlock_and_put(struct folio *folio)
{
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
-static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie,
- u64 change_attr)
+static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie,
+ u64 change_attr)
{
- if (PageUptodate(page)) {
- if (nfs_readdir_page_validate(page, cookie, change_attr))
+ if (folio_test_uptodate(folio)) {
+ if (nfs_readdir_folio_validate(folio, cookie, change_attr))
return;
- nfs_readdir_clear_array(page);
+ nfs_readdir_clear_array(folio);
}
- nfs_readdir_page_init_array(page, cookie, change_attr);
- SetPageUptodate(page);
+ nfs_readdir_folio_init_array(folio, cookie, change_attr);
+ folio_mark_uptodate(folio);
}
-static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
- u64 cookie, u64 change_attr)
+static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping,
+ u64 cookie, u64 change_attr)
{
- pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
- struct page *page;
+ pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+ struct folio *folio;
- page = grab_cache_page(mapping, index);
- if (!page)
+ folio = filemap_grab_folio(mapping, index);
+ if (!folio)
return NULL;
- nfs_readdir_page_init_and_validate(page, cookie, change_attr);
- return page;
+ nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+ return folio;
}
-static u64 nfs_readdir_page_last_cookie(struct page *page)
+static u64 nfs_readdir_folio_last_cookie(struct folio *folio)
{
struct nfs_cache_array *array;
u64 ret;
- array = kmap_local_page(page);
+ array = kmap_local_folio(folio, 0);
ret = array->last_cookie;
kunmap_local(array);
return ret;
}
-static bool nfs_readdir_page_needs_filling(struct page *page)
+static bool nfs_readdir_folio_needs_filling(struct folio *folio)
{
struct nfs_cache_array *array;
bool ret;
- array = kmap_local_page(page);
+ array = kmap_local_folio(folio, 0);
ret = !nfs_readdir_array_is_full(array);
kunmap_local(array);
return ret;
}
-static void nfs_readdir_page_set_eof(struct page *page)
+static void nfs_readdir_folio_set_eof(struct folio *folio)
{
struct nfs_cache_array *array;
- array = kmap_local_page(page);
+ array = kmap_local_folio(folio, 0);
nfs_readdir_array_set_eof(array);
kunmap_local(array);
}
-static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
- u64 cookie, u64 change_attr)
+static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping,
+ u64 cookie, u64 change_attr)
{
- pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
- struct page *page;
+ pgoff_t index = nfs_readdir_folio_cookie_hash(cookie);
+ struct folio *folio;
- page = grab_cache_page_nowait(mapping, index);
- if (!page)
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ mapping_gfp_mask(mapping));
+ if (!folio)
return NULL;
- nfs_readdir_page_init_and_validate(page, cookie, change_attr);
- if (nfs_readdir_page_last_cookie(page) != cookie)
- nfs_readdir_page_reinit_array(page, cookie, change_attr);
- return page;
+ nfs_readdir_folio_init_and_validate(folio, cookie, change_attr);
+ if (nfs_readdir_folio_last_cookie(folio) != cookie)
+ nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+ return folio;
}
static inline
@@ -481,11 +478,11 @@ bool nfs_readdir_use_cookie(const struct file *filp)
static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
struct nfs_readdir_descriptor *desc)
{
- if (array->page_full) {
+ if (array->folio_full) {
desc->last_cookie = array->last_cookie;
desc->current_index += array->size;
desc->cache_entry_index = 0;
- desc->page_index++;
+ desc->folio_index++;
} else
desc->last_cookie = nfs_readdir_array_index_cookie(array);
}
@@ -494,7 +491,7 @@ static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
{
desc->current_index = 0;
desc->last_cookie = 0;
- desc->page_index = 0;
+ desc->folio_index = 0;
}
static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
@@ -506,7 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
if (diff < 0)
goto out_eof;
if (diff >= array->size) {
- if (array->page_is_eof)
+ if (array->folio_is_eof)
goto out_eof;
nfs_readdir_seek_next_array(array, desc);
return -EAGAIN;
@@ -554,7 +551,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
}
}
check_eof:
- if (array->page_is_eof) {
+ if (array->folio_is_eof) {
status = -EBADCOOKIE;
if (desc->dir_cookie == array->last_cookie)
desc->eof = true;
@@ -568,7 +565,7 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
struct nfs_cache_array *array;
int status;
- array = kmap_local_page(desc->page);
+ array = kmap_local_folio(desc->folio, 0);
if (desc->dir_cookie == 0)
status = nfs_readdir_search_for_pos(array, desc);
@@ -819,16 +816,17 @@ static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
}
/* Perform conversion from xdr to cache array */
-static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
- struct nfs_entry *entry,
- struct page **xdr_pages, unsigned int buflen,
- struct page **arrays, size_t narrays,
- u64 change_attr)
+static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc,
+ struct nfs_entry *entry,
+ struct page **xdr_pages, unsigned int buflen,
+ struct folio **arrays, size_t narrays,
+ u64 change_attr)
{
struct address_space *mapping = desc->file->f_mapping;
+ struct folio *new, *folio = *arrays;
struct xdr_stream stream;
+ struct page *scratch;
struct xdr_buf buf;
- struct page *scratch, *new, *page = *arrays;
u64 cookie;
int status;
@@ -844,36 +842,36 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
if (status != 0)
break;
- status = nfs_readdir_page_array_append(page, entry, &cookie);
+ status = nfs_readdir_folio_array_append(folio, entry, &cookie);
if (status != -ENOSPC)
continue;
- if (page->mapping != mapping) {
+ if (folio->mapping != mapping) {
if (!--narrays)
break;
- new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL);
+ new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL);
if (!new)
break;
arrays++;
- *arrays = page = new;
+ *arrays = folio = new;
} else {
- new = nfs_readdir_page_get_next(mapping, cookie,
- change_attr);
+ new = nfs_readdir_folio_get_next(mapping, cookie,
+ change_attr);
if (!new)
break;
- if (page != *arrays)
- nfs_readdir_page_unlock_and_put(page);
- page = new;
+ if (folio != *arrays)
+ nfs_readdir_folio_unlock_and_put(folio);
+ folio = new;
}
- desc->page_index_max++;
- status = nfs_readdir_page_array_append(page, entry, &cookie);
+ desc->folio_index_max++;
+ status = nfs_readdir_folio_array_append(folio, entry, &cookie);
} while (!status && !entry->eof);
switch (status) {
case -EBADCOOKIE:
if (!entry->eof)
break;
- nfs_readdir_page_set_eof(page);
+ nfs_readdir_folio_set_eof(folio);
fallthrough;
case -EAGAIN:
status = 0;
@@ -886,8 +884,8 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
;
}
- if (page != *arrays)
- nfs_readdir_page_unlock_and_put(page);
+ if (folio != *arrays)
+ nfs_readdir_folio_unlock_and_put(folio);
put_page(scratch);
return status;
@@ -927,11 +925,11 @@ out_freepages:
static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
__be32 *verf_arg, __be32 *verf_res,
- struct page **arrays, size_t narrays)
+ struct folio **arrays, size_t narrays)
{
u64 change_attr;
struct page **pages;
- struct page *page = *arrays;
+ struct folio *folio = *arrays;
struct nfs_entry *entry;
size_t array_size;
struct inode *inode = file_inode(desc->file);
@@ -942,7 +940,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
- entry->cookie = nfs_readdir_page_last_cookie(page);
+ entry->cookie = nfs_readdir_folio_last_cookie(folio);
entry->fh = nfs_alloc_fhandle();
entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
entry->server = NFS_SERVER(inode);
@@ -962,10 +960,10 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
pglen = status;
if (pglen != 0)
- status = nfs_readdir_page_filler(desc, entry, pages, pglen,
- arrays, narrays, change_attr);
+ status = nfs_readdir_folio_filler(desc, entry, pages, pglen,
+ arrays, narrays, change_attr);
else
- nfs_readdir_page_set_eof(page);
+ nfs_readdir_folio_set_eof(folio);
desc->buffer_fills++;
free_pages:
@@ -977,33 +975,33 @@ out:
return status;
}
-static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc)
+static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc)
{
- put_page(desc->page);
- desc->page = NULL;
+ folio_put(desc->folio);
+ desc->folio = NULL;
}
static void
-nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
+nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
{
- unlock_page(desc->page);
- nfs_readdir_page_put(desc);
+ folio_unlock(desc->folio);
+ nfs_readdir_folio_put(desc);
}
-static struct page *
-nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
+static struct folio *
+nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc)
{
struct address_space *mapping = desc->file->f_mapping;
u64 change_attr = inode_peek_iversion_raw(mapping->host);
u64 cookie = desc->last_cookie;
- struct page *page;
+ struct folio *folio;
- page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
- if (!page)
+ folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr);
+ if (!folio)
return NULL;
- if (desc->clear_cache && !nfs_readdir_page_needs_filling(page))
- nfs_readdir_page_reinit_array(page, cookie, change_attr);
- return page;
+ if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio))
+ nfs_readdir_folio_reinit_array(folio, cookie, change_attr);
+ return folio;
}
/*
@@ -1017,21 +1015,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
__be32 verf[NFS_DIR_VERIFIER_SIZE];
int res;
- desc->page = nfs_readdir_page_get_cached(desc);
- if (!desc->page)
+ desc->folio = nfs_readdir_folio_get_cached(desc);
+ if (!desc->folio)
return -ENOMEM;
- if (nfs_readdir_page_needs_filling(desc->page)) {
+ if (nfs_readdir_folio_needs_filling(desc->folio)) {
/* Grow the dtsize if we had to go back for more pages */
- if (desc->page_index == desc->page_index_max)
+ if (desc->folio_index == desc->folio_index_max)
nfs_grow_dtsize(desc);
- desc->page_index_max = desc->page_index;
+ desc->folio_index_max = desc->folio_index;
trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
desc->last_cookie,
- desc->page->index, desc->dtsize);
+ desc->folio->index, desc->dtsize);
res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
- &desc->page, 1);
+ &desc->folio, 1);
if (res < 0) {
- nfs_readdir_page_unlock_and_put_cached(desc);
+ nfs_readdir_folio_unlock_and_put_cached(desc);
trace_nfs_readdir_cache_fill_done(inode, res);
if (res == -EBADCOOKIE || res == -ENOTSYNC) {
invalidate_inode_pages2(desc->file->f_mapping);
@@ -1059,7 +1057,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
res = nfs_readdir_search_array(desc);
if (res == 0)
return 0;
- nfs_readdir_page_unlock_and_put_cached(desc);
+ nfs_readdir_folio_unlock_and_put_cached(desc);
return res;
}
@@ -1087,7 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
unsigned int i;
bool first_emit = !desc->dir_cookie;
- array = kmap_local_page(desc->page);
+ array = kmap_local_folio(desc->folio, 0);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
@@ -1114,7 +1112,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
break;
}
}
- if (array->page_is_eof)
+ if (array->folio_is_eof)
desc->eof = !desc->eob;
kunmap_local(array);
@@ -1136,7 +1134,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
*/
static int uncached_readdir(struct nfs_readdir_descriptor *desc)
{
- struct page **arrays;
+ struct folio **arrays;
size_t i, sz = 512;
__be32 verf[NFS_DIR_VERIFIER_SIZE];
int status = -ENOMEM;
@@ -1147,14 +1145,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL);
if (!arrays)
goto out;
- arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL);
+ arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL);
if (!arrays[0])
goto out;
- desc->page_index = 0;
+ desc->folio_index = 0;
desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
- desc->page_index_max = 0;
+ desc->folio_index_max = 0;
trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
-1, desc->dtsize);
@@ -1166,10 +1164,10 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
}
for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
- desc->page = arrays[i];
+ desc->folio = arrays[i];
nfs_do_filldir(desc, verf);
}
- desc->page = NULL;
+ desc->folio = NULL;
/*
* Grow the dtsize if we have to go back for more pages,
@@ -1179,16 +1177,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
if (!desc->eob)
nfs_grow_dtsize(desc);
else if (desc->buffer_fills == 1 &&
- i < (desc->page_index_max >> 1))
+ i < (desc->folio_index_max >> 1))
nfs_shrink_dtsize(desc);
}
out_free:
for (i = 0; i < sz && arrays[i]; i++)
- nfs_readdir_page_array_free(arrays[i]);
+ nfs_readdir_folio_array_free(arrays[i]);
out:
if (!nfs_readdir_use_cookie(desc->file))
nfs_readdir_rewind_search(desc);
- desc->page_index_max = -1;
+ desc->folio_index_max = -1;
kfree(arrays);
dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
return status;
@@ -1240,11 +1238,11 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
goto out;
desc->file = file;
desc->ctx = ctx;
- desc->page_index_max = -1;
+ desc->folio_index_max = -1;
spin_lock(&file->f_lock);
desc->dir_cookie = dir_ctx->dir_cookie;
- desc->page_index = dir_ctx->page_index;
+ desc->folio_index = dir_ctx->page_index;
desc->last_cookie = dir_ctx->last_cookie;
desc->attr_gencount = dir_ctx->attr_gencount;
desc->eof = dir_ctx->eof;
@@ -1291,8 +1289,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
nfs_do_filldir(desc, nfsi->cookieverf);
- nfs_readdir_page_unlock_and_put_cached(desc);
- if (desc->page_index == desc->page_index_max)
+ nfs_readdir_folio_unlock_and_put_cached(desc);
+ if (desc->folio_index == desc->folio_index_max)
desc->clear_cache = force_clear;
} while (!desc->eob && !desc->eof);
@@ -1300,7 +1298,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
dir_ctx->dir_cookie = desc->dir_cookie;
dir_ctx->last_cookie = desc->last_cookie;
dir_ctx->attr_gencount = desc->attr_gencount;
- dir_ctx->page_index = desc->page_index;
+ dir_ctx->page_index = desc->folio_index;
dir_ctx->force_clear = force_clear;
dir_ctx->eof = desc->eof;
dir_ctx->dtsize = desc->dtsize;
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index d6a6d1ebb8fd..be686b8e0c54 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -149,7 +149,10 @@ const struct export_operations nfs_export_ops = {
.encode_fh = nfs_encode_fh,
.fh_to_dentry = nfs_fh_to_dentry,
.get_parent = nfs_get_parent,
- .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
- EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
- EXPORT_OP_NOATOMIC_ATTR,
+ .flags = EXPORT_OP_NOWCC |
+ EXPORT_OP_NOSUBTREECHK |
+ EXPORT_OP_CLOSE_BEFORE_UNLINK |
+ EXPORT_OP_REMOTE_FS |
+ EXPORT_OP_NOATOMIC_ATTR |
+ EXPORT_OP_FLUSH_ON_CLOSE,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2474cbc30712..f0edf5a36237 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,8 +328,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
start:
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
*pagep = &folio->page;
ret = nfs_flush_incompatible(file, folio);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ea5f2976dfab..8c35d88a84b1 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -15,6 +15,9 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/iversion.h>
+#include <linux/xarray.h>
+#include <linux/fscache.h>
+#include <linux/netfs.h>
#include "internal.h"
#include "iostat.h"
@@ -163,13 +166,14 @@ void nfs_fscache_init_inode(struct inode *inode)
struct nfs_server *nfss = NFS_SERVER(inode);
struct nfs_inode *nfsi = NFS_I(inode);
- nfsi->fscache = NULL;
+ netfs_inode(inode)->cache = NULL;
if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
nfs_fscache_update_auxdata(&auxdata, inode);
- nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+ netfs_inode(inode)->cache = fscache_acquire_cookie(
+ nfss->fscache,
0,
nfsi->fh.data, /* index_key */
nfsi->fh.size,
@@ -183,11 +187,8 @@ void nfs_fscache_init_inode(struct inode *inode)
*/
void nfs_fscache_clear_inode(struct inode *inode)
{
- struct nfs_inode *nfsi = NFS_I(inode);
- struct fscache_cookie *cookie = nfs_i_fscache(inode);
-
- fscache_relinquish_cookie(cookie, false);
- nfsi->fscache = NULL;
+ fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false);
+ netfs_inode(inode)->cache = NULL;
}
/*
@@ -212,7 +213,7 @@ void nfs_fscache_clear_inode(struct inode *inode)
void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
bool open_for_write = inode_is_open_for_write(inode);
if (!fscache_cookie_valid(cookie))
@@ -230,115 +231,160 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
void nfs_fscache_release_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
loff_t i_size = i_size_read(inode);
nfs_fscache_update_auxdata(&auxdata, inode);
fscache_unuse_cookie(cookie, &auxdata, &i_size);
}
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
+int nfs_netfs_read_folio(struct file *file, struct folio *folio)
{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = nfs_i_fscache(inode);
- struct iov_iter iter;
- struct bio_vec bvec;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- bvec_set_page(&bvec, page, PAGE_SIZE, 0);
- iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE);
-
- ret = fscache_begin_read_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
- NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
+ if (!netfs_inode(folio_inode(folio))->cache)
+ return -ENOBUFS;
+
+ return netfs_read_folio(file, folio);
}
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_page(struct inode *inode, struct page *page,
- bool no_space_allocated_yet)
+int nfs_netfs_readahead(struct readahead_control *ractl)
{
- struct netfs_cache_resources cres;
- struct fscache_cookie *cookie = nfs_i_fscache(inode);
- struct iov_iter iter;
- struct bio_vec bvec;
- loff_t start = page_offset(page);
- size_t len = PAGE_SIZE;
- int ret;
-
- memset(&cres, 0, sizeof(cres));
- bvec_set_page(&bvec, page, PAGE_SIZE, 0);
- iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
-
- ret = fscache_begin_write_operation(&cres, cookie);
- if (ret < 0)
- return ret;
-
- ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
- no_space_allocated_yet);
- if (ret == 0)
- ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
- fscache_end_operation(&cres);
- return ret;
+ struct inode *inode = ractl->mapping->host;
+
+ if (!netfs_inode(inode)->cache)
+ return -ENOBUFS;
+
+ netfs_readahead(ractl);
+ return 0;
}
-/*
- * Retrieve a page from fscache
- */
-int __nfs_fscache_read_page(struct inode *inode, struct page *page)
+static atomic_t nfs_netfs_debug_id;
+static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- int ret;
+ rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+ rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
- trace_nfs_fscache_read_page(inode, page);
- if (PageChecked(page)) {
- ClearPageChecked(page);
- ret = 1;
- goto out;
- }
+ return 0;
+}
- ret = fscache_fallback_read_page(inode, page);
- if (ret < 0) {
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
- SetPageChecked(page);
- goto out;
- }
+static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+{
+ put_nfs_open_context(rreq->netfs_priv);
+}
- /* Read completed synchronously */
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
- SetPageUptodate(page);
- ret = 0;
-out:
- trace_nfs_fscache_read_page_exit(inode, page, ret);
- return ret;
+static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources,
+ netfs_i_cookie(netfs_inode(rreq->inode)));
}
-/*
- * Store a newly fetched page in fscache. We can be certain there's no page
- * stored in the cache as yet otherwise we would've read it from there.
- */
-void __nfs_fscache_write_page(struct inode *inode, struct page *page)
+static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
{
- int ret;
+ struct nfs_netfs_io_data *netfs;
+
+ netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT);
+ if (!netfs)
+ return NULL;
+ netfs->sreq = sreq;
+ refcount_set(&netfs->refcount, 1);
+ return netfs;
+}
- trace_nfs_fscache_write_page(inode, page);
+static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
+{
+ size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
- ret = fscache_fallback_write_page(inode, page, true);
+ sreq->len = min(sreq->len, rsize);
+ return true;
+}
- if (ret != 0) {
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
- } else {
- nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
+{
+ struct nfs_netfs_io_data *netfs;
+ struct nfs_pageio_descriptor pgio;
+ struct inode *inode = sreq->rreq->inode;
+ struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
+ struct page *page;
+ int err;
+ pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+ pgoff_t last = ((sreq->start + sreq->len -
+ sreq->transferred - 1) >> PAGE_SHIFT);
+ XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
+
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+
+ netfs = nfs_netfs_alloc(sreq);
+ if (!netfs)
+ return netfs_subreq_terminated(sreq, -ENOMEM, false);
+
+ pgio.pg_netfs = netfs; /* used in completion */
+
+ xas_lock(&xas);
+ xas_for_each(&xas, page, last) {
+ /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
+ if (err < 0) {
+ netfs->error = err;
+ goto out;
+ }
+ xas_lock(&xas);
}
- trace_nfs_fscache_write_page_exit(inode, page, ret);
+ xas_unlock(&xas);
+out:
+ nfs_pageio_complete_read(&pgio);
+ nfs_netfs_put(netfs);
+}
+
+void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
+{
+ struct nfs_netfs_io_data *netfs = hdr->netfs;
+
+ if (!netfs)
+ return;
+
+ nfs_netfs_get(netfs);
+}
+
+int nfs_netfs_folio_unlock(struct folio *folio)
+{
+ struct inode *inode = folio_file_mapping(folio)->host;
+
+ /*
+ * If fscache is enabled, netfs will unlock pages.
+ */
+ if (netfs_inode(inode)->cache)
+ return 0;
+
+ return 1;
}
+
+void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
+{
+ struct nfs_netfs_io_data *netfs = hdr->netfs;
+ struct netfs_io_subrequest *sreq;
+
+ if (!netfs)
+ return;
+
+ sreq = netfs->sreq;
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
+
+ if (hdr->error)
+ netfs->error = hdr->error;
+ else
+ atomic64_add(hdr->res.count, &netfs->transferred);
+
+ nfs_netfs_put(netfs);
+ hdr->netfs = NULL;
+}
+
+const struct netfs_request_ops nfs_netfs_ops = {
+ .init_request = nfs_netfs_init_request,
+ .free_request = nfs_netfs_free_request,
+ .begin_cache_operation = nfs_netfs_begin_cache_operation,
+ .issue_read = nfs_netfs_issue_read,
+ .clamp_length = nfs_netfs_clamp_length
+};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2a37af880978..e1706e736c64 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -34,6 +34,58 @@ struct nfs_fscache_inode_auxdata {
u64 change_attr;
};
+struct nfs_netfs_io_data {
+ /*
+ * NFS may split a netfs_io_subrequest into multiple RPCs, each
+	 * with its own read completion. In netfs, we can only call
+ * netfs_subreq_terminated() once for each subrequest. Use the
+ * refcount here to double as a marker of the last RPC completion,
+ * and only call netfs via netfs_subreq_terminated() once.
+ */
+ refcount_t refcount;
+ struct netfs_io_subrequest *sreq;
+
+ /*
+ * Final disposition of the netfs_io_subrequest, sent in
+ * netfs_subreq_terminated()
+ */
+ atomic64_t transferred;
+ int error;
+};
+
+static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
+{
+ refcount_inc(&netfs->refcount);
+}
+
+static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
+{
+ ssize_t final_len;
+
+ /* Only the last RPC completion should call netfs_subreq_terminated() */
+ if (!refcount_dec_and_test(&netfs->refcount))
+ return;
+
+ /*
+ * The NFS pageio interface may read a complete page, even when netfs
+ * only asked for a partial page. Specifically, this may be seen when
+ * one thread is truncating a file while another one is reading the last
+ * page of the file.
+ * Correct the final length here to be no larger than the netfs subrequest
+ * length, and thus avoid netfs's "Subreq overread" warning message.
+ */
+ final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
+ netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+ kfree(netfs);
+}
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
+{
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+}
+extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
+extern int nfs_netfs_folio_unlock(struct folio *folio);
+
/*
* fscache.c
*/
@@ -44,9 +96,8 @@ extern void nfs_fscache_init_inode(struct inode *);
extern void nfs_fscache_clear_inode(struct inode *);
extern void nfs_fscache_open_file(struct inode *, struct file *);
extern void nfs_fscache_release_file(struct inode *, struct file *);
-
-extern int __nfs_fscache_read_page(struct inode *, struct page *);
-extern void __nfs_fscache_write_page(struct inode *, struct page *);
+extern int nfs_netfs_readahead(struct readahead_control *ractl);
+extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
@@ -54,34 +105,11 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
folio_wait_fscache(folio);
- fscache_note_page_release(nfs_i_fscache(folio->mapping->host));
- nfs_inc_fscache_stats(folio->mapping->host,
- NFSIOS_FSCACHE_PAGES_UNCACHED);
}
+ fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
return true;
}
-/*
- * Retrieve a page from an inode data storage object.
- */
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
-{
- if (nfs_i_fscache(inode))
- return __nfs_fscache_read_page(inode, page);
- return -ENOBUFS;
-}
-
-/*
- * Store a page newly fetched from the server in an inode data storage object
- * in the cache.
- */
-static inline void nfs_fscache_write_page(struct inode *inode,
- struct page *page)
-{
- if (nfs_i_fscache(inode))
- __nfs_fscache_write_page(inode, page);
-}
-
static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
struct inode *inode)
{
@@ -101,13 +129,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
{
struct nfs_fscache_inode_auxdata auxdata;
- struct nfs_inode *nfsi = NFS_I(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
- if (nfsi->fscache) {
- nfs_fscache_update_auxdata(&auxdata, inode);
- fscache_invalidate(nfsi->fscache, &auxdata,
- i_size_read(inode), flags);
- }
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags);
}
/*
@@ -120,7 +145,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
return "no ";
}
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+ struct nfs_pageio_descriptor *desc)
+{
+ hdr->netfs = desc->pg_netfs;
+}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ desc->pg_netfs = hdr->netfs;
+}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc)
+{
+ desc->pg_netfs = NULL;
+}
#else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline int nfs_netfs_folio_unlock(struct folio *folio)
+{
+ return 1;
+}
static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
static inline void nfs_fscache_init_inode(struct inode *inode) {}
@@ -128,22 +174,29 @@ static inline void nfs_fscache_clear_inode(struct inode *inode) {}
static inline void nfs_fscache_open_file(struct inode *inode,
struct file *filp) {}
static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-
-static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+static inline int nfs_netfs_readahead(struct readahead_control *ractl)
{
- return true; /* may release folio */
+ return -ENOBUFS;
}
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
+static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio)
{
return -ENOBUFS;
}
-static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+ return true; /* may release folio */
+}
static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
{
return "no ";
}
-
+static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr,
+ struct nfs_pageio_descriptor *desc) {}
+static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {}
#endif /* CONFIG_NFS_FSCACHE */
#endif /* _NFS_FSCACHE_H */
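The comment in nfs_netfs_put() above describes the key trick: many RPC completions share one object, only the last reference drop terminates the netfs subrequest, and the reported length is clamped to what the subrequest asked for. Below is a minimal user-space sketch of that pattern under stated assumptions; the demo_* names, the fixed request length, and the printf reporting are purely illustrative and not the kernel implementation.

#include <stdio.h>
#include <stdlib.h>

struct demo_subreq {
	int refcount;		/* one ref for the issuer, plus one per RPC */
	long requested_len;	/* what the subrequest asked for */
	long transferred;	/* bytes accumulated across RPC completions */
	int error;
};

/* Drop one reference; only the final put reports the result, exactly once. */
static void demo_put(struct demo_subreq *s)
{
	if (--s->refcount > 0)
		return;

	/* Clamp to the requested length, mirroring the min_t() above. */
	long final_len = s->transferred < s->requested_len ?
			 s->transferred : s->requested_len;
	printf("terminated once: error=%d, len=%ld\n", s->error, final_len);
	free(s);
}

static void demo_rpc_complete(struct demo_subreq *s, long count, int err)
{
	if (err)
		s->error = err;
	else
		s->transferred += count;
	demo_put(s);
}

int main(void)
{
	struct demo_subreq *s = calloc(1, sizeof(*s));

	if (!s)
		return 1;
	s->refcount = 1;		/* issuer's reference */
	s->requested_len = 3000;

	s->refcount++;			/* first RPC */
	s->refcount++;			/* second RPC */
	demo_rpc_complete(s, 2048, 0);
	demo_rpc_complete(s, 4096, 0);	/* pageio over-read a full page */
	demo_put(s);			/* issuer drops its ref: terminates now */
	return 0;
}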
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 97a76706fd54..a910b9a638c5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
nfsi->cache_validity |= flags;
- if (inode->i_mapping->nrpages == 0)
- nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
- NFS_INO_DATA_INVAL_DEFER);
- else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+ if (inode->i_mapping->nrpages == 0) {
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
+ } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ nfs_ooo_clear(nfsi);
+ }
trace_nfs_set_cache_invalid(inode, 0);
}
EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
trace_nfs_size_truncate(inode, offset);
i_size_write(inode, offset);
/* Optimisation */
- if (offset == 0)
- NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
- NFS_INO_DATA_INVAL_DEFER);
+ if (offset == 0) {
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(NFS_I(inode));
+ }
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
spin_unlock(&inode->i_lock);
@@ -1107,7 +1109,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
spin_lock(&inode->i_lock);
if (list_empty(&nfsi->open_files) &&
- (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+ nfs_ooo_test(nfsi))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
list_add_tail_rcu(&ctx->list, &nfsi->open_files);
@@ -1351,8 +1353,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
set_bit(NFS_INO_INVALIDATING, bitlock);
smp_wmb();
- nfsi->cache_validity &=
- ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
@@ -1814,6 +1816,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
return 0;
}
+static void nfs_ooo_merge(struct nfs_inode *nfsi,
+ u64 start, u64 end)
+{
+ int i, cnt;
+
+ if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)
+ /* No point merging anything */
+ return;
+
+ if (!nfsi->ooo) {
+ nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC);
+ if (!nfsi->ooo) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ return;
+ }
+ nfsi->ooo->cnt = 0;
+ }
+
+ /* add this range, merging if possible */
+ cnt = nfsi->ooo->cnt;
+ for (i = 0; i < cnt; i++) {
+ if (end == nfsi->ooo->gap[i].start)
+ end = nfsi->ooo->gap[i].end;
+ else if (start == nfsi->ooo->gap[i].end)
+ start = nfsi->ooo->gap[i].start;
+ else
+ continue;
+ /* Remove 'i' from table and loop to insert the new range */
+ cnt -= 1;
+ nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt];
+ i = -1;
+ }
+ if (start != end) {
+ if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ return;
+ }
+ nfsi->ooo->gap[cnt].start = start;
+ nfsi->ooo->gap[cnt].end = end;
+ cnt += 1;
+ }
+ nfsi->ooo->cnt = cnt;
+}
+
+static void nfs_ooo_record(struct nfs_inode *nfsi,
+ struct nfs_fattr *fattr)
+{
+	/* This reply was out-of-order, so record the
+	 * pre/post change ids, possibly cancelling
+	 * gaps created when iversion was jumped forward.
+ */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE))
+ nfs_ooo_merge(nfsi,
+ fattr->change_attr,
+ fattr->pre_change_attr);
+}
+
static int nfs_refresh_inode_locked(struct inode *inode,
struct nfs_fattr *fattr)
{
@@ -1824,8 +1886,12 @@ static int nfs_refresh_inode_locked(struct inode *inode,
if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
ret = nfs_update_inode(inode, fattr);
- else if (attr_cmp == 0)
- ret = nfs_check_inode_attributes(inode, fattr);
+ else {
+ nfs_ooo_record(NFS_I(inode), fattr);
+
+ if (attr_cmp == 0)
+ ret = nfs_check_inode_attributes(inode, fattr);
+ }
trace_nfs_refresh_inode_exit(inode, ret);
return ret;
@@ -1916,6 +1982,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
if (attr_cmp < 0)
return 0;
if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
+ /* Record the pre/post change info before clearing PRECHANGE */
+ nfs_ooo_record(NFS_I(inode), fattr);
fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
| NFS_ATTR_FATTR_PRESIZE
| NFS_ATTR_FATTR_PREMTIME
@@ -2070,6 +2138,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 &&
+ nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) {
+ /* There is one remaining gap that hasn't been
+ * merged into iversion - do that now.
+ */
+ inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start);
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ }
if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
/* Could it be a race with writeback? */
if (!(have_writers || have_delegation)) {
@@ -2091,8 +2168,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id,
inode->i_ino);
- } else if (!have_delegation)
- nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ } else if (!have_delegation) {
+ nfs_ooo_record(nfsi, fattr);
+ nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode),
+ fattr->change_attr);
+ }
inode_set_iversion_raw(inode, fattr->change_attr);
}
} else {
@@ -2246,18 +2326,22 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
return NULL;
nfsi->flags = 0UL;
nfsi->cache_validity = 0UL;
+ nfsi->ooo = NULL;
#if IS_ENABLED(CONFIG_NFS_V4)
nfsi->nfs4_acl = NULL;
#endif /* CONFIG_NFS_V4 */
#ifdef CONFIG_NFS_V4_2
nfsi->xattr_cache = NULL;
#endif
+ nfs_netfs_inode_init(nfsi);
+
return &nfsi->vfs_inode;
}
EXPORT_SYMBOL_GPL(nfs_alloc_inode);
void nfs_free_inode(struct inode *inode)
{
+ kfree(NFS_I(inode)->ooo);
kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
}
EXPORT_SYMBOL_GPL(nfs_free_inode);
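The gap table managed by nfs_ooo_merge() above is small enough to reason about in isolation: each out-of-order reply contributes a [start, end] change-attribute gap, adjoining gaps are merged, and an overflowing table falls back to deferred invalidation. The following standalone sketch mirrors that logic with made-up demo_* names and a fixed 16-entry table; it is illustrative only, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAX_GAPS 16

struct demo_gaps {
	bool defer;		/* stands in for NFS_INO_DATA_INVAL_DEFER */
	int cnt;
	struct { unsigned long long start, end; } gap[DEMO_MAX_GAPS];
};

static void demo_merge(struct demo_gaps *g, unsigned long long start,
		       unsigned long long end)
{
	int i, cnt = g->cnt;

	if (g->defer)
		return;		/* no point merging anything */

	for (i = 0; i < cnt; i++) {
		if (end == g->gap[i].start)
			end = g->gap[i].end;
		else if (start == g->gap[i].end)
			start = g->gap[i].start;
		else
			continue;
		/* remove entry i and rescan with the widened range */
		cnt -= 1;
		g->gap[i] = g->gap[cnt];
		i = -1;
	}
	if (start != end) {
		if (cnt >= DEMO_MAX_GAPS) {
			g->defer = true;	/* table full: defer invalidation */
			g->cnt = 0;
			return;
		}
		g->gap[cnt].start = start;
		g->gap[cnt].end = end;
		cnt += 1;
	}
	g->cnt = cnt;
}

int main(void)
{
	struct demo_gaps g = { .cnt = 0 };

	demo_merge(&g, 10, 12);	/* gap left by an out-of-order reply */
	demo_merge(&g, 12, 15);	/* adjoins the first gap */
	demo_merge(&g, 15, 10);	/* closes it: start == end after merging */
	printf("gaps left: %d\n", g.cnt);	/* prints 0 */
	return 0;
}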
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a65fe2a63ab..3cc027d3bd58 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -452,6 +452,10 @@ extern void nfs_sb_deactive(struct super_block *sb);
extern int nfs_client_for_each_server(struct nfs_client *clp,
int (*fn)(struct nfs_server *, void *),
void *data);
+#ifdef CONFIG_NFS_FSCACHE
+extern const struct netfs_request_ops nfs_netfs_ops;
+#endif
+
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
@@ -481,9 +485,14 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
struct nfs_pgio_completion_ops;
/* read.c */
+extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
+extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+ struct nfs_open_context *ctx,
+ struct folio *folio);
+extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
@@ -846,27 +855,12 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
}
#ifdef CONFIG_CRC32
-/**
- * nfs_fhandle_hash - calculate the crc32 hash for the filehandle
- * @fh - pointer to filehandle
- *
- * returns a crc32 hash for the filehandle that is compatible with
- * the one displayed by "wireshark".
- */
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
- return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
-}
static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
{
return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
NFS4_STATEID_OTHER_SIZE);
}
#else
-static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
-{
- return 0;
-}
static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
{
return 0;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 2ddaab1ac653..5aa776b5a3e7 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -17,9 +17,6 @@
struct nfs_iostats {
unsigned long long bytes[__NFSIOS_BYTESMAX];
-#ifdef CONFIG_NFS_FSCACHE
- unsigned long long fscache[__NFSIOS_FSCACHEMAX];
-#endif
unsigned long events[__NFSIOS_COUNTSMAX];
} ____cacheline_aligned;
@@ -49,20 +46,6 @@ static inline void nfs_add_stats(const struct inode *inode,
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
-#ifdef CONFIG_NFS_FSCACHE
-static inline void nfs_add_fscache_stats(struct inode *inode,
- enum nfs_stat_fscachecounters stat,
- long addend)
-{
- this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
-}
-static inline void nfs_inc_fscache_stats(struct inode *inode,
- enum nfs_stat_fscachecounters stat)
-{
- this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]);
-}
-#endif
-
static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
{
return alloc_percpu(struct nfs_iostats);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 349cc4f60a28..18d8f6529f61 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -21,9 +21,8 @@ static void nfs3_prepare_get_acl(struct posix_acl **p)
{
struct posix_acl *sentinel = uncached_acl_sentinel(current);
- if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) {
- /* Not the first reader or sentinel already in place. */
- }
+ /* If the ACL isn't being read yet, set our sentinel. */
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
}
static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl)
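The simplified nfs3_prepare_get_acl() above relies on cmpxchg() installing a per-reader sentinel only when the slot is still empty, and deliberately ignores whether the exchange won. A small user-space sketch of the same pattern using C11 atomics is shown here; the NOT_CACHED constant and function names are illustrative assumptions, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

#define NOT_CACHED ((void *)0)

static _Atomic(void *) acl_slot = NOT_CACHED;

static void prepare_get(void *sentinel)
{
	void *expected = NOT_CACHED;

	/* Analogue of cmpxchg(p, ACL_NOT_CACHED, sentinel): install the
	 * sentinel only if no other reader beat us to it; otherwise do
	 * nothing and let the winner's value stand. */
	atomic_compare_exchange_strong(&acl_slot, &expected, sentinel);
}

int main(void)
{
	int token;

	prepare_get(&token);
	printf("slot %s our sentinel\n",
	       atomic_load(&acl_slot) == (void *)&token ? "holds" : "does not hold");
	return 0;
}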
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index d80ee88ca996..a6df815a140c 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1122,7 +1122,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
uint32_t segments;
struct read_plus_segment *segs;
int status, i;
- char scratch_buf[16];
__be32 *p;
status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1143,7 +1142,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
if (!segs)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
status = -EIO;
for (i = 0; i < segments; i++) {
status = decode_read_plus_segment(xdr, &segs[i]);
@@ -1348,6 +1346,8 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
+ xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch));
+
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5607b1e2b821..18f25ff4bff7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5439,6 +5439,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
+ if (hdr->res.scratch)
+ kfree(hdr->res.scratch);
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -5452,17 +5454,22 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
}
#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
/* Note: We don't use READ_PLUS with pNFS yet */
- if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp)
+ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) {
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
+ hdr->res.scratch = kmalloc(32, GFP_KERNEL);
+ return hdr->res.scratch != NULL;
+ }
+ return false;
}
#else
-static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
+ return false;
}
#endif /* CONFIG_NFS_V4_2 */
@@ -5472,8 +5479,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
hdr->timestamp = jiffies;
if (!hdr->pgio_done_cb)
hdr->pgio_done_cb = nfs4_read_done_cb;
- msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
- nfs42_read_plus_support(hdr, msg);
+ if (!nfs42_read_plus_support(hdr, msg))
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
}
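The nfs4_proc_read_setup() change above turns READ_PLUS into an opportunistic upgrade: the helper tries to set it up (including allocating the scratch buffer) and the caller falls back to plain READ whenever that setup fails. A hedged user-space sketch of that fallback shape follows; the demo_* names and 32-byte allocation size merely echo the diff and are not the real API.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_msg {
	const char *proc;
	void *scratch;
};

static bool demo_read_plus_setup(struct demo_msg *msg, bool server_capable)
{
	if (!server_capable)
		return false;
	msg->scratch = malloc(32);	/* mirrors kmalloc(32, GFP_KERNEL) */
	if (!msg->scratch)
		return false;		/* allocation failed: caller falls back */
	msg->proc = "READ_PLUS";
	return true;
}

static void demo_read_setup(struct demo_msg *msg, bool server_capable)
{
	if (!demo_read_plus_setup(msg, server_capable))
		msg->proc = "READ";	/* plain READ when setup fails */
}

int main(void)
{
	struct demo_msg msg = { 0 };

	demo_read_setup(&msg, true);
	printf("using %s\n", msg.proc);
	free(msg.scratch);
	return 0;
}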
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2a0ca5c7f082..bbe49315d99e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -67,6 +67,8 @@
#define OPENOWNER_POOL_SIZE 8
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp);
+
const nfs4_stateid zero_stateid = {
{ .data = { 0 } },
.type = NFS4_SPECIAL_STATEID_TYPE,
@@ -330,6 +332,8 @@ do_confirm:
status = nfs4_proc_create_session(clp, cred);
if (status != 0)
goto out;
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R))
+ nfs4_state_start_reclaim_reboot(clp);
nfs41_finish_session_reset(clp);
nfs_mark_client_ready(clp, NFS_CS_READY);
out:
@@ -1205,10 +1209,6 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
- struct rpc_clnt *cl = clp->cl_rpcclient;
-
- while (cl != cl->cl_parent)
- cl = cl->cl_parent;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index c394e4447100..e776200e9a11 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -37,27 +37,10 @@ static struct ctl_table nfs4_cb_sysctls[] = {
{ }
};
-static struct ctl_table nfs4_cb_sysctl_dir[] = {
- {
- .procname = "nfs",
- .mode = 0555,
- .child = nfs4_cb_sysctls,
- },
- { }
-};
-
-static struct ctl_table nfs4_cb_sysctl_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = nfs4_cb_sysctl_dir,
- },
- { }
-};
-
int nfs4_register_sysctl(void)
{
- nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root);
+ nfs4_callback_sysctl_table = register_sysctl("fs/nfs",
+ nfs4_cb_sysctls);
if (nfs4_callback_sysctl_table == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a778713343df..4e90ca531176 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
{ BIT(NFS_INO_STALE), "STALE" }, \
{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
- { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \
{ BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \
{ BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \
{ BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \
@@ -1243,96 +1242,6 @@ TRACE_EVENT(nfs_readpage_short,
)
);
-DECLARE_EVENT_CLASS(nfs_fscache_page_event,
- TP_PROTO(
- const struct inode *inode,
- struct page *page
- ),
-
- TP_ARGS(inode, page),
-
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(u32, fhandle)
- __field(u64, fileid)
- __field(loff_t, offset)
- ),
-
- TP_fast_assign(
- const struct nfs_inode *nfsi = NFS_I(inode);
- const struct nfs_fh *fh = &nfsi->fh;
-
- __entry->offset = page_index(page) << PAGE_SHIFT;
- __entry->dev = inode->i_sb->s_dev;
- __entry->fileid = nfsi->fileid;
- __entry->fhandle = nfs_fhandle_hash(fh);
- ),
-
- TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long long)__entry->fileid,
- __entry->fhandle,
- (long long)__entry->offset
- )
-);
-DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
- TP_PROTO(
- const struct inode *inode,
- struct page *page,
- int error
- ),
-
- TP_ARGS(inode, page, error),
-
- TP_STRUCT__entry(
- __field(int, error)
- __field(dev_t, dev)
- __field(u32, fhandle)
- __field(u64, fileid)
- __field(loff_t, offset)
- ),
-
- TP_fast_assign(
- const struct nfs_inode *nfsi = NFS_I(inode);
- const struct nfs_fh *fh = &nfsi->fh;
-
- __entry->offset = page_index(page) << PAGE_SHIFT;
- __entry->dev = inode->i_sb->s_dev;
- __entry->fileid = nfsi->fileid;
- __entry->fhandle = nfs_fhandle_hash(fh);
- __entry->error = error;
- ),
-
- TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld error=%d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long long)__entry->fileid,
- __entry->fhandle,
- (long long)__entry->offset, __entry->error
- )
-);
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
- DEFINE_EVENT(nfs_fscache_page_event, name, \
- TP_PROTO( \
- const struct inode *inode, \
- struct page *page \
- ), \
- TP_ARGS(inode, page))
-#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
- DEFINE_EVENT(nfs_fscache_page_event_done, name, \
- TP_PROTO( \
- const struct inode *inode, \
- struct page *page, \
- int error \
- ), \
- TP_ARGS(inode, page, error))
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
-DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
-DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
TRACE_EVENT(nfs_pgio_error,
TP_PROTO(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 64fa8de199de..6efb5068c116 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -26,6 +26,7 @@
#include "internal.h"
#include "pnfs.h"
#include "nfstrace.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -105,6 +106,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
hdr->good_bytes = mirror->pg_count;
hdr->io_completion = desc->pg_io_completion;
hdr->dreq = desc->pg_dreq;
+ nfs_netfs_set_pgio_header(hdr, desc);
hdr->release = release;
hdr->completion_ops = desc->pg_completion_ops;
if (hdr->completion_ops->init_hdr)
@@ -941,6 +943,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_lseg = NULL;
desc->pg_io_completion = NULL;
desc->pg_dreq = NULL;
+ nfs_netfs_reset_pageio_descriptor(desc);
desc->pg_bsize = bsize;
desc->pg_mirror_count = 1;
@@ -1477,6 +1480,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
desc->pg_io_completion = hdr->io_completion;
desc->pg_dreq = hdr->dreq;
+ nfs_netfs_set_pageio_descriptor(desc, hdr);
list_splice_init(&hdr->pages, &pages);
while (!list_empty(&pages)) {
struct nfs_page *req = nfs_list_entry(pages.next);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e90988591df4..f71eeee67e20 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -31,7 +31,7 @@
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
static const struct nfs_rw_ops nfs_rw_read_ops;
static struct kmem_cache *nfs_rdata_cachep;
@@ -74,7 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
-static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
+void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *pgm;
unsigned long npages;
@@ -110,28 +110,17 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
static void nfs_readpage_release(struct nfs_page *req, int error)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
struct folio *folio = nfs_page_to_folio(req);
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
folio_set_error(folio);
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (folio_test_uptodate(folio))
- nfs_fscache_write_page(inode, &folio->page);
- folio_unlock(folio);
- }
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
+
nfs_release_request(req);
}
-struct nfs_readdesc {
- struct nfs_pageio_descriptor pgio;
- struct nfs_open_context *ctx;
-};
-
static void nfs_page_group_set_uptodate(struct nfs_page *req)
{
if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
@@ -153,7 +142,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
/* note: regions of the page not covered by a
- * request are zeroed in readpage_async_filler */
+ * request are zeroed in nfs_read_add_folio
+ */
if (bytes > hdr->good_bytes) {
/* nothing in this request was good, so zero
* the full extent of the request */
@@ -181,6 +171,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
nfs_readpage_release(req, error);
}
+ nfs_netfs_read_completion(hdr);
+
out:
hdr->release(hdr);
}
@@ -191,6 +183,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
struct rpc_task_setup *task_setup_data, int how)
{
rpc_ops->read_setup(hdr, msg);
+ nfs_netfs_initiate_read(hdr);
trace_nfs_initiate_read(hdr);
}
@@ -206,7 +199,7 @@ nfs_async_read_error(struct list_head *head, int error)
}
}
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
.error_cleanup = nfs_async_read_error,
.completion = nfs_read_completion,
};
@@ -281,7 +274,9 @@ static void nfs_readpage_result(struct rpc_task *task,
nfs_readpage_retry(task, hdr);
}
-static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio)
+int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
+ struct nfs_open_context *ctx,
+ struct folio *folio)
{
struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_server *server = NFS_SERVER(inode);
@@ -297,29 +292,21 @@ static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio)
aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize);
- if (!IS_SYNC(inode)) {
- error = nfs_fscache_read_page(inode, &folio->page);
- if (error == 0)
- goto out_unlock;
+ new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
+ if (IS_ERR(new)) {
+ error = PTR_ERR(new);
+ goto out;
}
- new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len);
- if (IS_ERR(new))
- goto out_error;
-
if (len < fsize)
folio_zero_segment(folio, len, fsize);
- if (!nfs_pageio_add_request(&desc->pgio, new)) {
+ if (!nfs_pageio_add_request(pgio, new)) {
nfs_list_remove_request(new);
- error = desc->pgio.pg_error;
+ error = pgio->pg_error;
nfs_readpage_release(new, error);
goto out;
}
return 0;
-out_error:
- error = PTR_ERR(new);
-out_unlock:
- folio_unlock(folio);
out:
return error;
}
@@ -332,8 +319,9 @@ out:
*/
int nfs_read_folio(struct file *file, struct folio *folio)
{
- struct nfs_readdesc desc;
struct inode *inode = file_inode(file);
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_open_context *ctx;
int ret;
trace_nfs_aop_readpage(inode, folio);
@@ -357,38 +345,43 @@ int nfs_read_folio(struct file *file, struct folio *folio)
if (NFS_STALE(inode))
goto out_unlock;
- desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+ ret = nfs_netfs_read_folio(file, folio);
+ if (!ret)
+ goto out;
+
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
- xchg(&desc.ctx->error, 0);
- nfs_pageio_init_read(&desc.pgio, inode, false,
+ xchg(&ctx->error, 0);
+ nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- ret = readpage_async_filler(&desc, folio);
+ ret = nfs_read_add_folio(&pgio, ctx, folio);
if (ret)
- goto out;
+ goto out_put;
- nfs_pageio_complete_read(&desc.pgio);
- ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
+ nfs_pageio_complete_read(&pgio);
+ ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
if (!ret) {
ret = folio_wait_locked_killable(folio);
if (!folio_test_uptodate(folio) && !ret)
- ret = xchg(&desc.ctx->error, 0);
+ ret = xchg(&ctx->error, 0);
}
+out_put:
+ put_nfs_open_context(ctx);
out:
- put_nfs_open_context(desc.ctx);
trace_nfs_aop_readpage_done(inode, folio, ret);
return ret;
out_unlock:
folio_unlock(folio);
- trace_nfs_aop_readpage_done(inode, folio, ret);
- return ret;
+ goto out;
}
void nfs_readahead(struct readahead_control *ractl)
{
+ struct nfs_pageio_descriptor pgio;
+ struct nfs_open_context *ctx;
unsigned int nr_pages = readahead_count(ractl);
struct file *file = ractl->file;
- struct nfs_readdesc desc;
struct inode *inode = ractl->mapping->host;
struct folio *folio;
int ret;
@@ -401,26 +394,30 @@ void nfs_readahead(struct readahead_control *ractl)
if (NFS_STALE(inode))
goto out;
+ ret = nfs_netfs_readahead(ractl);
+ if (!ret)
+ goto out;
+
if (file == NULL) {
ret = -EBADF;
- desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
- if (desc.ctx == NULL)
+ ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+ if (ctx == NULL)
goto out;
} else
- desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
- nfs_pageio_init_read(&desc.pgio, inode, false,
+ nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
while ((folio = readahead_folio(ractl)) != NULL) {
- ret = readpage_async_filler(&desc, folio);
+ ret = nfs_read_add_folio(&pgio, ctx, folio);
if (ret)
break;
}
- nfs_pageio_complete_read(&desc.pgio);
+ nfs_pageio_complete_read(&pgio);
- put_nfs_open_context(desc.ctx);
+ put_nfs_open_context(ctx);
out:
trace_nfs_aop_readahead_done(inode, nr_pages, ret);
}
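Both nfs_read_folio() and nfs_readahead() now share the same ordering: try the netfs/fscache path first, and only build a pageio request when it declines (for example with -ENOBUFS). A tiny illustrative sketch of that control flow, with invented demo_* names standing in for the real entry points, is given below.

#include <errno.h>
#include <stdio.h>

static int demo_netfs_read_folio(int cache_enabled)
{
	return cache_enabled ? 0 : -ENOBUFS;	/* 0 means netfs handled it */
}

static int demo_read_folio(int cache_enabled)
{
	int ret = demo_netfs_read_folio(cache_enabled);

	if (!ret)
		return 0;		/* netfs path satisfied the read */

	/* fall back: issue an ordinary NFS READ via the pageio machinery */
	printf("falling back to pageio read\n");
	return 0;
}

int main(void)
{
	demo_read_folio(0);
	demo_read_folio(1);
	return 0;
}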
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 397c096d874e..30e53e93049e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -692,10 +692,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
totals.events[i] += stats->events[i];
for (i = 0; i < __NFSIOS_BYTESMAX; i++)
totals.bytes[i] += stats->bytes[i];
-#ifdef CONFIG_NFS_FSCACHE
- for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
- totals.fscache[i] += stats->fscache[i];
-#endif
preempt_enable();
}
@@ -706,13 +702,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
seq_puts(m, "\n\tbytes:\t");
for (i = 0; i < __NFSIOS_BYTESMAX; i++)
seq_printf(m, "%Lu ", totals.bytes[i]);
-#ifdef CONFIG_NFS_FSCACHE
- if (nfss->options & NFS_OPTION_FSCACHE) {
- seq_puts(m, "\n\tfsc:\t");
- for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
- seq_printf(m, "%Lu ", totals.fscache[i]);
- }
-#endif
seq_putc(m, '\n');
rpc_clnt_show_stats(m, nfss->client);
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 7aea195ddb35..f39e2089bc4c 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -32,27 +32,9 @@ static struct ctl_table nfs_cb_sysctls[] = {
{ }
};
-static struct ctl_table nfs_cb_sysctl_dir[] = {
- {
- .procname = "nfs",
- .mode = 0555,
- .child = nfs_cb_sysctls,
- },
- { }
-};
-
-static struct ctl_table nfs_cb_sysctl_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = nfs_cb_sysctl_dir,
- },
- { }
-};
-
int nfs_register_sysctl(void)
{
- nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root);
+ nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls);
if (nfs_callback_sysctl_table == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
index 7c1509e968c8..832246b22c51 100644
--- a/fs/nfs_common/nfs_ssc.c
+++ b/fs/nfs_common/nfs_ssc.c
@@ -12,7 +12,6 @@
#include <linux/nfs_ssc.h>
#include "../nfs/nfs4_fs.h"
-MODULE_LICENSE("GPL");
struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 668c7527b17e..ae85257b4238 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -123,11 +123,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
/* OK, we seem to have a valid key */
key.h.flags = 0;
- key.h.expiry_time = get_expiry(&mesg);
- if (key.h.expiry_time == 0)
+ err = get_expiry(&mesg, &key.h.expiry_time);
+ if (err)
goto out;
- key.ek_client = dom;
+ key.ek_client = dom;
key.ek_fsidtype = fsidtype;
memcpy(key.ek_fsid, buf, len);
@@ -439,7 +439,6 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
return -EINVAL;
}
return 0;
-
}
#ifdef CONFIG_NFSD_V4
@@ -546,6 +545,29 @@ static inline int
secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
#endif
+static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp)
+{
+ unsigned int i, mode, listsize;
+ int err;
+
+ err = get_uint(mesg, &listsize);
+ if (err)
+ return err;
+ if (listsize > NFSEXP_XPRTSEC_NUM)
+ return -EINVAL;
+
+ exp->ex_xprtsec_modes = 0;
+ for (i = 0; i < listsize; i++) {
+ err = get_uint(mesg, &mode);
+ if (err)
+ return err;
+ if (mode > NFSEXP_XPRTSEC_MTLS)
+ return -EINVAL;
+ exp->ex_xprtsec_modes |= mode;
+ }
+ return 0;
+}
+
static inline int
nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid)
{
@@ -608,11 +630,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
exp.cd = cd;
exp.ex_devid_map = NULL;
+ exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL;
/* expiry */
- err = -EINVAL;
- exp.h.expiry_time = get_expiry(&mesg);
- if (exp.h.expiry_time == 0)
+ err = get_expiry(&mesg, &exp.h.expiry_time);
+ if (err)
goto out3;
/* flags */
@@ -624,7 +646,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (err || an_int < 0)
goto out3;
exp.ex_flags= an_int;
-
+
/* anon uid */
err = get_int(&mesg, &an_int);
if (err)
@@ -650,6 +672,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid);
else if (strcmp(buf, "secinfo") == 0)
err = secinfo_parse(&mesg, buf, &exp);
+ else if (strcmp(buf, "xprtsec") == 0)
+ err = xprtsec_parse(&mesg, buf, &exp);
else
/* quietly ignore unknown words and anything
* following. Newer user-space can try to set
@@ -663,6 +687,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid);
if (err)
goto out4;
+
/*
* No point caching this if it would immediately expire.
* Also, this protects exportfs's dummy export from the
@@ -824,6 +849,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
for (i = 0; i < MAX_SECINFO_LIST; i++) {
new->ex_flavors[i] = item->ex_flavors[i];
}
+ new->ex_xprtsec_modes = item->ex_xprtsec_modes;
}
static struct cache_head *svc_export_alloc(void)
@@ -1035,9 +1061,26 @@ static struct svc_export *exp_find(struct cache_detail *cd,
__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
{
- struct exp_flavor_info *f;
- struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
+ struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
+ if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) {
+ if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags))
+ goto ok;
+ }
+ if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) {
+ if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+ !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+ goto ok;
+ }
+ if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) {
+ if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) &&
+ test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
+ goto ok;
+ }
+ goto denied;
+ok:
/* legacy gss-only clients are always OK: */
if (exp->ex_client == rqstp->rq_gssclient)
return 0;
@@ -1062,6 +1105,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
if (nfsd4_spo_must_allow(rqstp))
return 0;
+denied:
return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
}
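The check_nfsd_access() hunk above gates access on a per-export bitmask of permitted transport-security modes against the connection's TLS and peer-authentication state. The sketch below restates that decision as a pure function in user space; the DEMO_XPRTSEC_* constants and the boolean flags are illustrative stand-ins for the export flags and svc_xprt bits.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_XPRTSEC_NONE	0x1
#define DEMO_XPRTSEC_TLS	0x2
#define DEMO_XPRTSEC_MTLS	0x4

static bool demo_xprtsec_ok(unsigned long modes, bool tls_session, bool peer_auth)
{
	if ((modes & DEMO_XPRTSEC_NONE) && !tls_session)
		return true;
	if ((modes & DEMO_XPRTSEC_TLS) && tls_session && !peer_auth)
		return true;
	if ((modes & DEMO_XPRTSEC_MTLS) && tls_session && peer_auth)
		return true;
	return false;		/* no permitted mode matched: deny */
}

int main(void)
{
	/* export that only allows mutually-authenticated TLS */
	unsigned long modes = DEMO_XPRTSEC_MTLS;

	printf("plain TCP: %s\n", demo_xprtsec_ok(modes, false, false) ? "ok" : "denied");
	printf("mTLS:      %s\n", demo_xprtsec_ok(modes, true, true) ? "ok" : "denied");
	return 0;
}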
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index d03f7f6a8642..2df8ae25aad3 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -77,6 +77,7 @@ struct svc_export {
struct cache_detail *cd;
struct rcu_head ex_rcu;
struct export_stats ex_stats;
+ unsigned long ex_xprtsec_modes;
};
/* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 6e8712bd7c99..ee9c923192e0 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -74,70 +74,9 @@ static struct list_lru nfsd_file_lru;
static unsigned long nfsd_file_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
static struct delayed_work nfsd_filecache_laundrette;
-static struct rhashtable nfsd_file_rhash_tbl
+static struct rhltable nfsd_file_rhltable
____cacheline_aligned_in_smp;
-enum nfsd_file_lookup_type {
- NFSD_FILE_KEY_INODE,
- NFSD_FILE_KEY_FULL,
-};
-
-struct nfsd_file_lookup_key {
- struct inode *inode;
- struct net *net;
- const struct cred *cred;
- unsigned char need;
- bool gc;
- enum nfsd_file_lookup_type type;
-};
-
-/*
- * The returned hash value is based solely on the address of an in-code
- * inode, a pointer to a slab-allocated object. The entropy in such a
- * pointer is concentrated in its middle bits.
- */
-static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed)
-{
- unsigned long ptr = (unsigned long)inode;
- u32 k;
-
- k = ptr >> L1_CACHE_SHIFT;
- k &= 0x00ffffff;
- return jhash2(&k, 1, seed);
-}
-
-/**
- * nfsd_file_key_hashfn - Compute the hash value of a lookup key
- * @data: key on which to compute the hash value
- * @len: rhash table's key_len parameter (unused)
- * @seed: rhash table's random seed of the day
- *
- * Return value:
- * Computed 32-bit hash value
- */
-static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed)
-{
- const struct nfsd_file_lookup_key *key = data;
-
- return nfsd_file_inode_hash(key->inode, seed);
-}
-
-/**
- * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file
- * @data: object on which to compute the hash value
- * @len: rhash table's key_len parameter (unused)
- * @seed: rhash table's random seed of the day
- *
- * Return value:
- * Computed 32-bit hash value
- */
-static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed)
-{
- const struct nfsd_file *nf = data;
-
- return nfsd_file_inode_hash(nf->nf_inode, seed);
-}
-
static bool
nfsd_match_cred(const struct cred *c1, const struct cred *c2)
{
@@ -158,53 +97,16 @@ nfsd_match_cred(const struct cred *c1, const struct cred *c2)
return true;
}
-/**
- * nfsd_file_obj_cmpfn - Match a cache item against search criteria
- * @arg: search criteria
- * @ptr: cache item to check
- *
- * Return values:
- * %0 - Item matches search criteria
- * %1 - Item does not match search criteria
- */
-static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct nfsd_file_lookup_key *key = arg->key;
- const struct nfsd_file *nf = ptr;
-
- switch (key->type) {
- case NFSD_FILE_KEY_INODE:
- if (nf->nf_inode != key->inode)
- return 1;
- break;
- case NFSD_FILE_KEY_FULL:
- if (nf->nf_inode != key->inode)
- return 1;
- if (nf->nf_may != key->need)
- return 1;
- if (nf->nf_net != key->net)
- return 1;
- if (!nfsd_match_cred(nf->nf_cred, key->cred))
- return 1;
- if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
- return 1;
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
- return 1;
- break;
- }
- return 0;
-}
-
static const struct rhashtable_params nfsd_file_rhash_params = {
.key_len = sizeof_field(struct nfsd_file, nf_inode),
.key_offset = offsetof(struct nfsd_file, nf_inode),
- .head_offset = offsetof(struct nfsd_file, nf_rhash),
- .hashfn = nfsd_file_key_hashfn,
- .obj_hashfn = nfsd_file_obj_hashfn,
- .obj_cmpfn = nfsd_file_obj_cmpfn,
- /* Reduce resizing churn on light workloads */
- .min_size = 512, /* buckets */
+ .head_offset = offsetof(struct nfsd_file, nf_rlist),
+
+ /*
+ * Start with a single page hash table to reduce resizing churn
+ * on light workloads.
+ */
+ .min_size = 256,
.automatic_shrinking = true,
};
@@ -307,27 +209,27 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
}
static struct nfsd_file *
-nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
+nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
+ bool want_gc)
{
struct nfsd_file *nf;
nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
- if (nf) {
- INIT_LIST_HEAD(&nf->nf_lru);
- nf->nf_birthtime = ktime_get();
- nf->nf_file = NULL;
- nf->nf_cred = get_current_cred();
- nf->nf_net = key->net;
- nf->nf_flags = 0;
- __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
- __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
- if (key->gc)
- __set_bit(NFSD_FILE_GC, &nf->nf_flags);
- nf->nf_inode = key->inode;
- refcount_set(&nf->nf_ref, 1);
- nf->nf_may = key->need;
- nf->nf_mark = NULL;
- }
+ if (unlikely(!nf))
+ return NULL;
+
+ INIT_LIST_HEAD(&nf->nf_lru);
+ nf->nf_birthtime = ktime_get();
+ nf->nf_file = NULL;
+ nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
+ nf->nf_flags = want_gc ?
+ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) :
+ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING);
+ nf->nf_inode = inode;
+ refcount_set(&nf->nf_ref, 1);
+ nf->nf_may = need;
+ nf->nf_mark = NULL;
return nf;
}
@@ -352,8 +254,8 @@ static void
nfsd_file_hash_remove(struct nfsd_file *nf)
{
trace_nfsd_file_unhash(nf);
- rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
- nfsd_file_rhash_params);
+ rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist,
+ nfsd_file_rhash_params);
}
static bool
@@ -380,10 +282,8 @@ nfsd_file_free(struct nfsd_file *nf)
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
- get_file(nf->nf_file);
- filp_close(nf->nf_file, NULL);
nfsd_file_check_write_error(nf);
- fput(nf->nf_file);
+ filp_close(nf->nf_file, NULL);
}
/*
@@ -402,13 +302,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
struct file *file = nf->nf_file;
struct address_space *mapping;
- if (!file || !(file->f_mode & FMODE_WRITE))
+ /* File not open for write? */
+ if (!(file->f_mode & FMODE_WRITE))
+ return false;
+
+ /*
+ * Some filesystems (e.g. NFS) flush all dirty data on close.
+ * On others, there is no need to wait for writeback.
+ */
+ if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE))
return false;
+
mapping = file->f_mapping;
return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
+
static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
@@ -492,49 +402,26 @@ nfsd_file_dispose_list(struct list_head *dispose)
}
}
-static void
-nfsd_file_list_remove_disposal(struct list_head *dst,
- struct nfsd_fcache_disposal *l)
-{
- spin_lock(&l->lock);
- list_splice_init(&l->freeme, dst);
- spin_unlock(&l->lock);
-}
-
-static void
-nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct nfsd_fcache_disposal *l = nn->fcache_disposal;
-
- spin_lock(&l->lock);
- list_splice_tail_init(files, &l->freeme);
- spin_unlock(&l->lock);
- queue_work(nfsd_filecache_wq, &l->work);
-}
-
-static void
-nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
- struct net *net)
-{
- struct nfsd_file *nf, *tmp;
-
- list_for_each_entry_safe(nf, tmp, src, nf_lru) {
- if (nf->nf_net == net)
- list_move_tail(&nf->nf_lru, dst);
- }
-}
-
+/**
+ * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list
+ * @dispose: list of nfsd_files to be disposed
+ *
+ * Transfers each file to the "freeme" list for its nfsd_net, to eventually
+ * be disposed of by the per-net garbage collector.
+ */
static void
nfsd_file_dispose_list_delayed(struct list_head *dispose)
{
- LIST_HEAD(list);
- struct nfsd_file *nf;
-
while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
- nfsd_file_list_add_disposal(&list, nf->nf_net);
+ struct nfsd_file *nf = list_first_entry(dispose,
+ struct nfsd_file, nf_lru);
+ struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+ spin_lock(&l->lock);
+ list_move_tail(&nf->nf_lru, &l->freeme);
+ spin_unlock(&l->lock);
+ queue_work(nfsd_filecache_wq, &l->work);
}
}
@@ -678,8 +565,8 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
* @inode: inode on which to close out nfsd_files
* @dispose: list on which to gather nfsd_files to close out
*
- * An nfsd_file represents a struct file being held open on behalf of nfsd. An
- * open file however can block other activity (such as leases), or cause
+ * An nfsd_file represents a struct file being held open on behalf of nfsd.
+ * An open file however can block other activity (such as leases), or cause
* undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
*
* This function is intended to find open nfsd_files when this sort of
@@ -692,20 +579,17 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
static void
nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
{
- struct nfsd_file_lookup_key key = {
- .type = NFSD_FILE_KEY_INODE,
- .inode = inode,
- };
+ struct rhlist_head *tmp, *list;
struct nfsd_file *nf;
rcu_read_lock();
- do {
- nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
- nfsd_file_rhash_params);
- if (!nf)
- break;
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+ if (!test_bit(NFSD_FILE_GC, &nf->nf_flags))
+ continue;
nfsd_file_cond_queue(nf, dispose);
- } while (1);
+ }
rcu_read_unlock();
}
@@ -758,8 +642,8 @@ nfsd_file_close_inode_sync(struct inode *inode)
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and destroy any entries that have not been used since
- * the last scan.
+ * Scrape the freeme list for this nfsd_net, and then dispose of them
+ * all.
*/
static void
nfsd_file_delayed_close(struct work_struct *work)
@@ -768,7 +652,10 @@ nfsd_file_delayed_close(struct work_struct *work)
struct nfsd_fcache_disposal *l = container_of(work,
struct nfsd_fcache_disposal, work);
- nfsd_file_list_remove_disposal(&head, l);
+ spin_lock(&l->lock);
+ list_splice_init(&l->freeme, &head);
+ spin_unlock(&l->lock);
+
nfsd_file_dispose_list(&head);
}
@@ -829,7 +716,7 @@ nfsd_file_cache_init(void)
if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
return 0;
- ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params);
+ ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params);
if (ret)
return ret;
@@ -897,7 +784,7 @@ out_err:
nfsd_file_mark_slab = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
- rhashtable_destroy(&nfsd_file_rhash_tbl);
+ rhltable_destroy(&nfsd_file_rhltable);
goto out;
}
@@ -906,7 +793,8 @@ out_err:
* @net: net-namespace to shut down the cache (may be NULL)
*
* Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
- * then close out everything. Called when an nfsd instance is being shut down.
+ * then close out everything. Called when an nfsd instance is being shut down,
+ * and when the exports table is flushed.
*/
static void
__nfsd_file_cache_purge(struct net *net)
@@ -915,7 +803,7 @@ __nfsd_file_cache_purge(struct net *net)
struct nfsd_file *nf;
LIST_HEAD(dispose);
- rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
+ rhltable_walk_enter(&nfsd_file_rhltable, &iter);
do {
rhashtable_walk_start(&iter);
@@ -1021,7 +909,7 @@ nfsd_file_cache_shutdown(void)
nfsd_file_mark_slab = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
- rhashtable_destroy(&nfsd_file_rhash_tbl);
+ rhltable_destroy(&nfsd_file_rhltable);
for_each_possible_cpu(i) {
per_cpu(nfsd_file_cache_hits, i) = 0;
@@ -1032,6 +920,35 @@ nfsd_file_cache_shutdown(void)
}
}
+static struct nfsd_file *
+nfsd_file_lookup_locked(const struct net *net, const struct cred *cred,
+ struct inode *inode, unsigned char need,
+ bool want_gc)
+{
+ struct rhlist_head *tmp, *list;
+ struct nfsd_file *nf;
+
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+ if (nf->nf_may != need)
+ continue;
+ if (nf->nf_net != net)
+ continue;
+ if (!nfsd_match_cred(nf->nf_cred, cred))
+ continue;
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc)
+ continue;
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
+ continue;
+
+ if (!nfsd_file_get(nf))
+ continue;
+ return nf;
+ }
+ return NULL;
+}
+
/**
* nfsd_file_is_cached - are there any cached open files for this inode?
* @inode: inode to check
@@ -1046,15 +963,20 @@ nfsd_file_cache_shutdown(void)
bool
nfsd_file_is_cached(struct inode *inode)
{
- struct nfsd_file_lookup_key key = {
- .type = NFSD_FILE_KEY_INODE,
- .inode = inode,
- };
+ struct rhlist_head *tmp, *list;
+ struct nfsd_file *nf;
bool ret = false;
- if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
- nfsd_file_rhash_params) != NULL)
- ret = true;
+ rcu_read_lock();
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist)
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
+ ret = true;
+ break;
+ }
+ rcu_read_unlock();
+
trace_nfsd_file_is_cached(inode, (int)ret);
return ret;
}
@@ -1064,14 +986,12 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct file *file,
struct nfsd_file **pnf, bool want_gc)
{
- struct nfsd_file_lookup_key key = {
- .type = NFSD_FILE_KEY_FULL,
- .need = may_flags & NFSD_FILE_MAY_MASK,
- .net = SVC_NET(rqstp),
- .gc = want_gc,
- };
+ unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_file *new, *nf;
+ const struct cred *cred;
bool open_retry = true;
- struct nfsd_file *nf;
+ struct inode *inode;
__be32 status;
int ret;
@@ -1079,80 +999,88 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
- key.inode = d_inode(fhp->fh_dentry);
- key.cred = get_current_cred();
+ inode = d_inode(fhp->fh_dentry);
+ cred = get_current_cred();
retry:
rcu_read_lock();
- nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
- nfsd_file_rhash_params);
- nf = nfsd_file_get(nf);
+ nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
rcu_read_unlock();
if (nf) {
+ /*
+ * If the nf is on the LRU then it holds an extra reference
+ * that must be put if it's removed. It had better not be
+ * the last one however, since we should hold another.
+ */
if (nfsd_file_lru_remove(nf))
WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
goto wait_for_construction;
}
- nf = nfsd_file_alloc(&key, may_flags);
- if (!nf) {
+ new = nfsd_file_alloc(net, inode, need, want_gc);
+ if (!new) {
status = nfserr_jukebox;
- goto out_status;
+ goto out;
}
- ret = rhashtable_lookup_insert_key(&nfsd_file_rhash_tbl,
- &key, &nf->nf_rhash,
- nfsd_file_rhash_params);
+ rcu_read_lock();
+ spin_lock(&inode->i_lock);
+ nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ if (unlikely(nf)) {
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ nfsd_file_slab_free(&new->nf_rcu);
+ goto wait_for_construction;
+ }
+ nf = new;
+ ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist,
+ nfsd_file_rhash_params);
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
if (likely(ret == 0))
goto open_file;
- nfsd_file_slab_free(&nf->nf_rcu);
- nf = NULL;
if (ret == -EEXIST)
goto retry;
- trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, ret);
+ trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret);
status = nfserr_jukebox;
- goto out_status;
+ goto construction_err;
wait_for_construction:
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
+ trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf);
if (!open_retry) {
status = nfserr_jukebox;
- goto out;
+ goto construction_err;
}
open_retry = false;
- if (refcount_dec_and_test(&nf->nf_ref))
- nfsd_file_free(nf);
goto retry;
}
-
this_cpu_inc(nfsd_file_cache_hits);
status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
+ if (status != nfs_ok) {
+ nfsd_file_put(nf);
+ nf = NULL;
+ }
+
out:
if (status == nfs_ok) {
this_cpu_inc(nfsd_file_acquisitions);
nfsd_file_check_write_error(nf);
*pnf = nf;
- } else {
- if (refcount_dec_and_test(&nf->nf_ref))
- nfsd_file_free(nf);
- nf = NULL;
}
-
-out_status:
- put_cred(key.cred);
- trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
+ put_cred(cred);
+ trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
return status;
open_file:
trace_nfsd_file_alloc(nf);
- nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
+ nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode);
if (nf->nf_mark) {
if (file) {
get_file(file);
@@ -1170,13 +1098,16 @@ open_file:
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status == nfs_ok && key.inode->i_nlink == 0)
- status = nfserr_jukebox;
- if (status != nfs_ok)
+ if (status != nfs_ok || inode->i_nlink == 0)
nfsd_file_unhash(nf);
- clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
- smp_mb__after_atomic();
- wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+ clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ if (status == nfs_ok)
+ goto out;
+
+construction_err:
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
goto out;
}
@@ -1192,8 +1123,11 @@ open_file:
* seconds after the final nfsd_file_put() in case the caller
* wants to re-use it.
*
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
*/
__be32
nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1213,8 +1147,11 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
* but not garbage-collected. The object is unhashed after the
* final nfsd_file_put().
*
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
*/
__be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1235,8 +1172,11 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
* and @file is non-NULL, use it to instantiate a new nfsd_file instead of
* opening a new one.
*
- * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
- * network byte order is returned.
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
*/
__be32
nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1267,7 +1207,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
lru = list_lru_count(&nfsd_file_lru);
rcu_read_lock();
- ht = &nfsd_file_rhash_tbl;
+ ht = &nfsd_file_rhltable.ht;
count = atomic_read(&ht->nelems);
tbl = rht_dereference_rcu(ht->tbl, ht);
buckets = tbl->size;
@@ -1283,7 +1223,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
evictions += per_cpu(nfsd_file_evictions, i);
}
- seq_printf(m, "total entries: %u\n", count);
+ seq_printf(m, "total inodes: %u\n", count);
seq_printf(m, "hash buckets: %u\n", buckets);
seq_printf(m, "lru entries: %lu\n", lru);
seq_printf(m, "cache hits: %lu\n", hits);
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 41516a4263ea..e54165a3224f 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -29,9 +29,8 @@ struct nfsd_file_mark {
* never be dereferenced, only used for comparison.
*/
struct nfsd_file {
- struct rhash_head nf_rhash;
- struct list_head nf_lru;
- struct rcu_head nf_rcu;
+ struct rhlist_head nf_rlist;
+ void *nf_inode;
struct file *nf_file;
const struct cred *nf_cred;
struct net *nf_net;
@@ -40,10 +39,12 @@ struct nfsd_file {
#define NFSD_FILE_REFERENCED (2)
#define NFSD_FILE_GC (3)
unsigned long nf_flags;
- struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
unsigned char nf_may;
+
struct nfsd_file_mark *nf_mark;
+ struct list_head nf_lru;
+ struct rcu_head nf_rcu;
ktime_t nf_birthtime;
};
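The switch from rhash_head to rhlist_head means several nfsd_file objects can share one hash key (the inode). A generic, self-contained sketch of that rhltable pattern follows; the demo_* names are hypothetical and only the include/linux/rhashtable.h API is assumed.

/* Generic rhltable sketch -- not the filecache code itself. */
#include <linux/rhashtable.h>
#include <linux/rcupdate.h>

struct demo_entry {
	int			key;	/* many entries may share this key */
	struct rhlist_head	list;	/* chains same-key entries */
};

static const struct rhashtable_params demo_params = {
	.key_len	= sizeof(int),
	.key_offset	= offsetof(struct demo_entry, key),
	.head_offset	= offsetof(struct demo_entry, list),
};

static struct rhltable demo_tbl;

static int demo_init(void)
{
	return rhltable_init(&demo_tbl, &demo_params);
}

static int demo_add(struct demo_entry *e)
{
	/* Duplicate keys are allowed; entries chain through e->list. */
	return rhltable_insert(&demo_tbl, &e->list, demo_params);
}

static int demo_count(int key)
{
	struct rhlist_head *pos, *head;
	struct demo_entry *e;
	int n = 0;

	rcu_read_lock();
	head = rhltable_lookup(&demo_tbl, &key, demo_params);
	rhl_for_each_entry_rcu(e, pos, head, list)
		n++;
	rcu_read_unlock();
	return n;
}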
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5e9809aff37e..7a806ac13e31 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -240,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
goto out;
/* expiry */
- ent.h.expiry_time = get_expiry(&buf);
- if (ent.h.expiry_time == 0)
+ error = get_expiry(&buf, &ent.h.expiry_time);
+ if (error)
goto out;
error = -ENOMEM;
@@ -408,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
memcpy(ent.name, buf1, sizeof(ent.name));
/* expiry */
- ent.h.expiry_time = get_expiry(&buf);
- if (ent.h.expiry_time == 0)
+ error = get_expiry(&buf, &ent.h.expiry_time);
+ if (error)
goto out;
/* ID */
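The idmap parsers above now treat get_expiry() as returning an errno, with the timestamp delivered through an out parameter. A small sketch of that assumed calling convention:

/* Sketch only; assumes the two-argument get_expiry() used above. */
#include <linux/sunrpc/cache.h>

static int demo_parse_expiry(char **bufp, struct cache_head *h)
{
	int error;

	/* On success the parsed time lands in h->expiry_time; a negative
	 * return (e.g. -EINVAL) means the field was bad or absent,
	 * replacing the old "0 means error" convention. */
	error = get_expiry(bufp, &h->expiry_time);
	if (error)
		return error;
	return 0;
}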
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5783209f17fc..bb9d47172162 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -930,6 +930,9 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
* Grab and keep cached pages associated with a file in the svc_rqst
* so that they can be passed to the network sendmsg/sendpage routines
* directly. They will be released after the sending has completed.
+ *
+ * Return values: Number of bytes consumed, or -EIO if there are no
+ * remaining pages in rqstp->rq_pages.
*/
static int
nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
@@ -948,7 +951,8 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
*/
if (page == *(rqstp->rq_next_page - 1))
continue;
- svc_rqst_replace_page(rqstp, page);
+ if (unlikely(!svc_rqst_replace_page(rqstp, page)))
+ return -EIO;
}
if (rqstp->rq_res.page_len == 0) // first call
rqstp->rq_res.page_base = offset % PAGE_SIZE;
@@ -2164,7 +2168,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
goto out;
}
- buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
+ buf = kvmalloc(len, GFP_KERNEL);
if (buf == NULL) {
err = nfserr_jukebox;
goto out;
@@ -2227,10 +2231,7 @@ nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
goto out;
}
- /*
- * We're holding i_rwsem - use GFP_NOFS.
- */
- buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS);
+ buf = kvmalloc(len, GFP_KERNEL);
if (buf == NULL) {
err = nfserr_jukebox;
goto out;
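The splice actor above relies on svc_rqst_replace_page() reporting failure. The sketch below assumes it returns a bool that is false once rq_pages has no slot left, which the caller maps to -EIO.

/* Sketch: propagating a failed page replacement as an I/O error. */
#include <linux/sunrpc/svc.h>

static int demo_stash_page(struct svc_rqst *rqstp, struct page *page)
{
	/* Assumed: returns false when the request has run out of pages. */
	if (unlikely(!svc_rqst_replace_page(rqstp, page)))
		return -EIO;
	return 0;
}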
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 41ccd43cd979..5cf30827f244 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -259,10 +259,10 @@ repeat:
NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
- if (unlikely(!dfolio)) {
+ if (unlikely(IS_ERR(dfolio))) {
/* No empty page is added to the page cache */
- err = -ENOMEM;
folio_unlock(folio);
+ err = PTR_ERR(dfolio);
break;
}
if (unlikely(!folio_buffers(folio)))
@@ -311,7 +311,7 @@ repeat:
folio_lock(folio);
dfolio = filemap_lock_folio(dmap, index);
- if (dfolio) {
+ if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
nilfs_copy_page(&dfolio->page, &folio->page, 0);
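nilfs now treats filemap_grab_folio() and filemap_lock_folio() as returning ERR_PTR-encoded failures rather than NULL. A minimal sketch of that error-handling pattern, assuming the same convention:

/* Sketch: ERR_PTR-style handling of the folio lookup helpers. */
#include <linux/pagemap.h>
#include <linux/err.h>

static int demo_grab(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio;

	folio = filemap_grab_folio(mapping, index);
	if (IS_ERR(folio))
		return PTR_ERR(folio);	/* e.g. -ENOMEM, never NULL */

	/* ... the folio comes back locked and referenced ... */
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}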
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index a030d00af90c..174fe536a1c0 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -31,16 +31,6 @@ static struct ctl_table ntfs_sysctls[] = {
{}
};
-/* Define the parent directory /proc/sys/fs. */
-static struct ctl_table sysctls_root[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = ntfs_sysctls
- },
- {}
-};
-
/* Storage for the sysctls header. */
static struct ctl_table_header *sysctls_root_table;
@@ -54,7 +44,7 @@ int ntfs_sysctl(int add)
{
if (add) {
BUG_ON(sysctls_root_table);
- sysctls_root_table = register_sysctl_table(sysctls_root);
+ sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
if (!sysctls_root_table)
return -ENOMEM;
} else {
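register_sysctl() takes the parent path as a plain string, which is what makes the hand-rolled "fs" parent table above unnecessary. A self-contained sketch; the demo knob is hypothetical.

/* Sketch: registering a table directly under /proc/sys/fs. */
#include <linux/sysctl.h>

static int demo_value;

static struct ctl_table demo_sysctls[] = {
	{
		.procname	= "demo-value",		/* hypothetical knob */
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};

static struct ctl_table_header *demo_header;

static int demo_sysctl_register(void)
{
	demo_header = register_sysctl("fs", demo_sysctls);
	return demo_header ? 0 : -ENOMEM;
}

static void demo_sysctl_unregister(void)
{
	unregister_sysctl_table(demo_header);
}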
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 5e6bafb10f42..0b8bc66377db 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -405,8 +405,8 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
int err = 0;
struct ntfs_sb_info *sbi = ni->mi.sbi;
u8 cluster_bits = sbi->cluster_bits;
- bool is_mft =
- ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len;
+ bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA &&
+ !name_len;
u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp;
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
@@ -531,11 +531,10 @@ add_alloc_in_same_attr_seg:
pre_alloc = 0;
if (type == ATTR_DATA && !name_len &&
sbi->options->prealloc) {
- pre_alloc =
- bytes_to_cluster(
- sbi,
- get_pre_allocated(new_size)) -
- new_alen;
+ pre_alloc = bytes_to_cluster(
+ sbi, get_pre_allocated(
+ new_size)) -
+ new_alen;
}
/* Get the last LCN to allocate from. */
@@ -573,8 +572,8 @@ add_alloc_in_same_attr_seg:
err = attr_allocate_clusters(
sbi, run, vcn, lcn, to_allocate, &pre_alloc,
is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen,
- is_mft ? 0
- : (sbi->record_size -
+ is_mft ? 0 :
+ (sbi->record_size -
le32_to_cpu(rec->used) + 8) /
3 +
1,
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 723fb64e6531..9a6c6a09d70c 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -40,9 +40,9 @@ static struct kmem_cache *ntfs_enode_cachep;
int __init ntfs3_init_bitmap(void)
{
- ntfs_enode_cachep =
- kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0,
- SLAB_RECLAIM_ACCOUNT, NULL);
+ ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache",
+ sizeof(struct e_node), 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
return ntfs_enode_cachep ? 0 : -ENOMEM;
}
@@ -286,9 +286,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
if (wnd->uptodated != 1) {
/* Check bits before 'bit'. */
ib = wnd->zone_bit == wnd->zone_end ||
- bit < wnd->zone_end
- ? 0
- : wnd->zone_end;
+ bit < wnd->zone_end ?
+ 0 :
+ wnd->zone_end;
while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) {
bit -= 1;
@@ -297,9 +297,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
/* Check bits after 'end_in'. */
ib = wnd->zone_bit == wnd->zone_end ||
- end_in > wnd->zone_bit
- ? wnd->nbits
- : wnd->zone_bit;
+ end_in > wnd->zone_bit ?
+ wnd->nbits :
+ wnd->zone_bit;
while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) {
end_in += 1;
@@ -417,8 +417,8 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
return;
n3 = rb_first(&wnd->count_tree);
wnd->extent_max =
- n3 ? rb_entry(n3, struct e_node, count.node)->count.key
- : 0;
+ n3 ? rb_entry(n3, struct e_node, count.node)->count.key :
+ 0;
return;
}
@@ -658,7 +658,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
if (!wnd->bits_last)
wnd->bits_last = wbits;
- wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
+ wnd->free_bits =
+ kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
if (!wnd->free_bits)
return -ENOMEM;
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index e9bdc1ff08c9..9a3d55c367d9 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -22,20 +22,21 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
{
struct fstrim_range __user *user_range;
struct fstrim_range range;
+ struct block_device *dev;
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
+ dev = sbi->sb->s_bdev;
+ if (!bdev_max_discard_sectors(dev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
if (copy_from_user(&range, user_range, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u32, range.minlen,
- bdev_discard_granularity(sbi->sb->s_bdev));
+ range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev));
err = ntfs_trim_fs(sbi, &range);
if (err < 0)
@@ -190,8 +191,8 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
- to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
- : PAGE_SIZE;
+ to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) :
+ PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
page = find_or_create_page(mapping, idx,
@@ -223,16 +224,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(REQ_OP_READ, bh);
-
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
+ err = bh_read(bh, 0);
+ if (err < 0) {
unlock_page(page);
put_page(page);
- err = -EIO;
goto out;
}
}
@@ -570,13 +565,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_unlock(ni);
} else {
/* Check new size. */
+ u8 cluster_bits = sbi->cluster_bits;
/* generic/213: expected -ENOSPC instead of -EFBIG. */
if (!is_supported_holes) {
loff_t to_alloc = new_size - inode_get_bytes(inode);
if (to_alloc > 0 &&
- (to_alloc >> sbi->cluster_bits) >
+ (to_alloc >> cluster_bits) >
wnd_zeroes(&sbi->used.bitmap)) {
err = -ENOSPC;
goto out;
@@ -597,7 +593,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
}
if (is_supported_holes) {
- CLST vcn = vbo >> sbi->cluster_bits;
+ CLST vcn = vbo >> cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
CLST lcn, clen;
@@ -660,22 +656,12 @@ out:
int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
- struct super_block *sb = dentry->d_sb;
- struct ntfs_sb_info *sbi = sb->s_fs_info;
struct inode *inode = d_inode(dentry);
struct ntfs_inode *ni = ntfs_i(inode);
u32 ia_valid = attr->ia_valid;
umode_t mode = inode->i_mode;
int err;
- if (sbi->options->noacsrules) {
- /* "No access rules" - Force any changes of time etc. */
- attr->ia_valid |= ATTR_FORCE;
- /* and disable for editing some attributes. */
- attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
- ia_valid = attr->ia_valid;
- }
-
err = setattr_prepare(idmap, dentry, attr);
if (err)
goto out;
@@ -719,7 +705,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
- ntfs_save_wsl_perm(inode);
+ ntfs_save_wsl_perm(inode, NULL);
mark_inode_dirty(inode);
out:
return err;
@@ -1065,8 +1051,8 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out;
- ret = is_compressed(ni) ? ntfs_compress_write(iocb, from)
- : __generic_file_write_iter(iocb, from);
+ ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) :
+ __generic_file_write_iter(iocb, from);
out:
inode_unlock(inode);
@@ -1118,8 +1104,9 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
int err = 0;
/* If we are last writer on the inode, drop the block reservation. */
- if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) &&
- atomic_read(&inode->i_writecount) == 1)) {
+ if (sbi->options->prealloc &&
+ ((file->f_mode & FMODE_WRITE) &&
+ atomic_read(&inode->i_writecount) == 1)) {
ni_lock(ni);
down_write(&ni->file.run_lock);
@@ -1159,8 +1146,7 @@ const struct inode_operations ntfs_file_inode_operations = {
.getattr = ntfs_getattr,
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
- .permission = ntfs_permission,
- .get_inode_acl = ntfs_get_acl,
+ .get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
};
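bh_read() collapses the open-coded lock_buffer()/submit_bh()/wait_on_buffer() sequence used before. The sketch below assumes its usual semantics: a negative errno on I/O failure, non-negative when the buffer ends up uptodate.

/* Sketch: synchronous buffer read via bh_read(). */
#include <linux/buffer_head.h>

static int demo_read_block(struct buffer_head *bh)
{
	int err;

	if (buffer_uptodate(bh))
		return 0;

	/* Replaces lock_buffer() + submit_bh(REQ_OP_READ, bh) + wait_on_buffer(). */
	err = bh_read(bh, 0);
	if (err < 0)
		return err;	/* typically -EIO */
	return 0;
}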
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index f1df52dfab74..2bfcf1a989c9 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -76,8 +76,8 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni)
const struct ATTRIB *attr;
attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
- return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO))
- : NULL;
+ return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) :
+ NULL;
}
/*
@@ -91,8 +91,8 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni)
attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
- return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5))
- : NULL;
+ return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) :
+ NULL;
}
/*
@@ -102,7 +102,7 @@ void ni_clear(struct ntfs_inode *ni)
{
struct rb_node *node;
- if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec))
+ if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec))
ni_delete_all(ni);
al_destroy(ni);
@@ -1439,8 +1439,8 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
int err;
CLST plen;
struct ATTRIB *attr;
- bool is_ext =
- (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn;
+ bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) &&
+ !svcn;
u32 name_size = ALIGN(name_len * sizeof(short), 8);
u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT;
u32 run_off = name_off + name_size;
@@ -1645,7 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
{
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
- struct le_str *fns;
+ struct le_str *fns;
if (le)
*le = NULL;
@@ -1756,9 +1756,9 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa)
}
/* Resize nonresident empty attribute in-place only. */
- new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED))
- ? (SIZEOF_NONRESIDENT_EX + 8)
- : (SIZEOF_NONRESIDENT + 8);
+ new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ?
+ (SIZEOF_NONRESIDENT_EX + 8) :
+ (SIZEOF_NONRESIDENT + 8);
if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size)))
return -EOPNOTSUPP;
@@ -2965,14 +2965,14 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTRIB *attr;
- u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0;
+ u16 de_key_size;
switch (undo_step) {
case 4:
+ de_key_size = le16_to_cpu(de2->key_size);
if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
- &attr, NULL, NULL)) {
+ &attr, NULL, NULL))
return false;
- }
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size);
mi_get_ref(&ni->mi, &de2->ref);
@@ -2981,19 +2981,16 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
de2->flags = 0;
de2->res = 0;
- if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL,
- 1)) {
+ if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1))
return false;
- }
fallthrough;
case 2:
de_key_size = le16_to_cpu(de->key_size);
if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
- &attr, NULL, NULL)) {
+ &attr, NULL, NULL))
return false;
- }
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size);
mi_get_ref(&ni->mi, &de->ref);
@@ -3162,9 +3159,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
u64 data_size = le64_to_cpu(attr->nres.data_size);
__le64 valid_le;
- dup->alloc_size = is_attr_ext(attr)
- ? attr->nres.total_size
- : attr->nres.alloc_size;
+ dup->alloc_size = is_attr_ext(attr) ?
+ attr->nres.total_size :
+ attr->nres.alloc_size;
dup->data_size = attr->nres.data_size;
if (new_valid > data_size)
@@ -3258,6 +3255,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
return 0;
}
+ if (!ni->mi.mrec)
+ goto out;
+
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
@@ -3360,7 +3360,7 @@ out:
ni_unlock(ni);
if (err) {
- ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err);
+ ntfs_inode_err(inode, "%s failed, %d.", hint, err);
ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
return err;
}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index c6eb371a3695..57762c5fe68b 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -827,10 +827,10 @@ static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl,
memcpy(rt + 1, tbl + 1, esize * used);
- rt->free_goal = free_goal == ~0u
- ? cpu_to_le32(~0u)
- : cpu_to_le32(sizeof(struct RESTART_TABLE) +
- free_goal * esize);
+ rt->free_goal = free_goal == ~0u ?
+ cpu_to_le32(~0u) :
+ cpu_to_le32(sizeof(struct RESTART_TABLE) +
+ free_goal * esize);
if (tbl->first_free) {
rt->first_free = tbl->first_free;
@@ -1089,9 +1089,9 @@ static inline u64 base_lsn(struct ntfs_log *log,
(lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0))
<< log->file_data_bits) +
((((is_log_record_end(hdr) &&
- h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn))
- ? le16_to_cpu(hdr->record_hdr.next_record_off)
- : log->page_size) +
+ h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ?
+ le16_to_cpu(hdr->record_hdr.next_record_off) :
+ log->page_size) +
lsn) >>
3);
@@ -1298,9 +1298,9 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
if (!log->clst_per_page)
log->clst_per_page = 1;
- log->first_page = major_ver >= 2
- ? 0x22 * page_size
- : ((sys_page_size << 1) + (page_size << 1));
+ log->first_page = major_ver >= 2 ?
+ 0x22 * page_size :
+ ((sys_page_size << 1) + (page_size << 1));
log->major_ver = major_ver;
log->minor_ver = minor_ver;
}
@@ -1512,20 +1512,19 @@ static u32 current_log_avail(struct ntfs_log *log)
* have to compute the free range.
* If there is no oldest lsn then start at the first page of the file.
*/
- oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN)
- ? log->first_page
- : (log->oldest_lsn_off & ~log->sys_page_mask);
+ oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ?
+ log->first_page :
+ (log->oldest_lsn_off & ~log->sys_page_mask);
/*
* We will use the next log page offset to compute the next free page.
* If we are going to reuse this page go to the next page.
* If we are at the first page then use the end of the file.
*/
- next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL)
- ? log->next_page + log->page_size
- : log->next_page == log->first_page
- ? log->l_size
- : log->next_page;
+ next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ?
+ log->next_page + log->page_size :
+ log->next_page == log->first_page ? log->l_size :
+ log->next_page;
/* If the two offsets are the same then there is no available space. */
if (oldest_off == next_free_off)
@@ -1535,9 +1534,9 @@ static u32 current_log_avail(struct ntfs_log *log)
* this range from the total available pages.
*/
free_bytes =
- oldest_off < next_free_off
- ? log->total_avail_pages - (next_free_off - oldest_off)
- : oldest_off - next_free_off;
+ oldest_off < next_free_off ?
+ log->total_avail_pages - (next_free_off - oldest_off) :
+ oldest_off - next_free_off;
free_bytes >>= log->page_bits;
return free_bytes * log->reserved;
@@ -1671,8 +1670,8 @@ next_tail:
}
best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0;
- best_lsn2 =
- second_tail ? base_lsn(log, second_tail, second_file_off) : 0;
+ best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) :
+ 0;
if (first_tail && second_tail) {
if (best_lsn1 > best_lsn2) {
@@ -1767,8 +1766,8 @@ tail_read:
page_cnt = page_pos = 1;
- curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off)
- : log->next_page;
+ curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) :
+ log->next_page;
wrapped_file =
curpage_off == log->first_page &&
@@ -1826,9 +1825,9 @@ use_cur_page:
le64_to_cpu(cur_page->record_hdr.last_end_lsn) &&
((lsn_cur >> log->file_data_bits) +
((curpage_off <
- (lsn_to_vbo(log, lsn_cur) & ~log->page_mask))
- ? 1
- : 0)) != expected_seq) {
+ (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ?
+ 1 :
+ 0)) != expected_seq) {
goto check_tail;
}
@@ -2575,7 +2574,7 @@ static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn)
return find_log_rec(log, *lsn, lcb);
}
-static inline bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes)
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes)
{
__le16 mask;
u32 min_de, de_off, used, total;
@@ -2642,9 +2641,10 @@ static inline bool check_index_root(const struct ATTRIB *attr,
{
bool ret;
const struct INDEX_ROOT *root = resident_data(attr);
- u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size
- ? sbi->cluster_bits
- : SECTOR_SHIFT;
+ u8 index_bits = le32_to_cpu(root->index_block_size) >=
+ sbi->cluster_size ?
+ sbi->cluster_bits :
+ SECTOR_SHIFT;
u8 block_clst = root->index_block_clst;
if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) ||
@@ -3683,7 +3683,8 @@ move_data:
if (a_dirty) {
attr = oa->attr;
- err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0);
+ err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes,
+ 0);
if (err)
goto out;
}
@@ -3768,11 +3769,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
if (!log)
return -ENOMEM;
- memset(&rst_info, 0, sizeof(struct restart_info));
-
log->ni = ni;
log->l_size = l_size;
log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+
if (!log->one_page_buf) {
err = -ENOMEM;
goto out;
@@ -3783,6 +3783,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
log->page_bits = blksize_bits(page_size);
/* Look for a restart area on the disk. */
+ memset(&rst_info, 0, sizeof(struct restart_info));
err = log_read_rst(log, l_size, true, &rst_info);
if (err)
goto out;
@@ -3859,10 +3860,10 @@ check_restart_area:
log->init_ra = !!rst_info.vbo;
/* If we have a valid page then grab a pointer to the restart area. */
- ra2 = rst_info.valid_page
- ? Add2Ptr(rst_info.r_page,
- le16_to_cpu(rst_info.r_page->ra_off))
- : NULL;
+ ra2 = rst_info.valid_page ?
+ Add2Ptr(rst_info.r_page,
+ le16_to_cpu(rst_info.r_page->ra_off)) :
+ NULL;
if (rst_info.chkdsk_was_run ||
(ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
@@ -4256,6 +4257,10 @@ check_attribute_names:
rec_len -= t32;
attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS);
+ if (!attr_names) {
+ err = -ENOMEM;
+ goto out;
+ }
lcb_put(lcb);
lcb = NULL;
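One of the fixes above is simply checking kmemdup() for allocation failure before the copy is used. A tiny sketch of that pattern; the demo_* name is hypothetical.

/* Sketch: never use a kmemdup() result without a NULL check. */
#include <linux/slab.h>
#include <linux/errno.h>

static void *demo_dup(const void *src, size_t len, int *err)
{
	void *copy = kmemdup(src, len, GFP_NOFS);

	if (!copy) {
		*err = -ENOMEM;	/* previously the failure went unnoticed */
		return NULL;
	}
	*err = 0;
	return copy;
}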
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 567563771bf8..28cc421102e5 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -172,8 +172,8 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
u16 sample, fo, fn;
fo = le16_to_cpu(rhdr->fix_off);
- fn = simple ? ((bytes >> SECTOR_SHIFT) + 1)
- : le16_to_cpu(rhdr->fix_num);
+ fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) :
+ le16_to_cpu(rhdr->fix_num);
/* Check errors. */
if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
@@ -223,7 +223,7 @@ int ntfs_extend_init(struct ntfs_sb_info *sbi)
inode = ntfs_iget5(sb, &ref, &NAME_EXTEND);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- ntfs_err(sb, "Failed to load $Extend.");
+ ntfs_err(sb, "Failed to load $Extend (%d).", err);
inode = NULL;
goto out;
}
@@ -282,7 +282,7 @@ int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi)
/* Check for 4GB. */
if (ni->vfs_inode.i_size >= 0x100000000ull) {
- ntfs_err(sb, "\x24LogFile is too big");
+		ntfs_err(sb, "\x24LogFile is larger than 4G.");
err = -EINVAL;
goto out;
}
@@ -646,13 +646,13 @@ next:
NULL, 0, NULL, NULL))
goto next;
- __clear_bit_le(ir - MFT_REC_RESERVED,
+ __clear_bit(ir - MFT_REC_RESERVED,
&sbi->mft.reserved_bitmap);
}
}
/* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */
- zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap,
+ zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
MFT_REC_FREE, MFT_REC_RESERVED);
if (zbit >= MFT_REC_FREE) {
sbi->mft.next_reserved = MFT_REC_FREE;
@@ -720,7 +720,7 @@ found:
if (*rno >= MFT_REC_FREE)
wnd_set_used(wnd, *rno, 1);
else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
- __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
out:
if (!mft)
@@ -748,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
else
wnd_set_free(wnd, rno, 1);
} else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) {
- __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
}
if (rno < wnd_zone_bit(wnd))
@@ -846,18 +846,16 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
{
int err;
struct super_block *sb = sbi->sb;
- u32 blocksize;
+ u32 blocksize, bytes;
sector_t block1, block2;
- u32 bytes;
- if (!sb)
+ /*
+ * sb can be NULL here. In this case sbi->flags should be 0 too.
+ */
+ if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR))
return;
blocksize = sb->s_blocksize;
-
- if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
- return;
-
bytes = sbi->mft.recs_mirr << sbi->record_bits;
block1 = sbi->mft.lbo >> sb->s_blocksize_bits;
block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits;
@@ -925,6 +923,7 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
struct VOLUME_INFO *info;
struct mft_inode *mi;
struct ntfs_inode *ni;
+ __le16 info_flags;
/*
* Do not change state if fs was real_dirty.
@@ -957,6 +956,8 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
goto out;
}
+ info_flags = info->flags;
+
switch (dirty) {
case NTFS_DIRTY_ERROR:
ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors");
@@ -970,8 +971,10 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
break;
}
/* Cache current volume flags. */
- sbi->volume.flags = info->flags;
- mi->dirty = true;
+ if (info_flags != info->flags) {
+ sbi->volume.flags = info->flags;
+ mi->dirty = true;
+ }
err = 0;
out:
@@ -1683,6 +1686,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir)
out:
if (err) {
+ make_bad_inode(inode);
iput(inode);
ni = ERR_PTR(err);
}
@@ -1859,7 +1863,7 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
inode = ntfs_iget5(sb, &ref, &NAME_SECURE);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- ntfs_err(sb, "Failed to load $Secure.");
+ ntfs_err(sb, "Failed to load $Secure (%d).", err);
inode = NULL;
goto out;
}
@@ -1870,41 +1874,43 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME,
ARRAY_SIZE(SDH_NAME), NULL, NULL);
- if (!attr) {
- err = -EINVAL;
- goto out;
- }
-
- root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
- if (root_sdh->type != ATTR_ZERO ||
+ if (!attr ||
+ !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
+ root_sdh->type != ATTR_ZERO ||
root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
- offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) {
+ offsetof(struct INDEX_ROOT, ihdr) +
+ le32_to_cpu(root_sdh->ihdr.used) >
+ le32_to_cpu(attr->res.data_size)) {
+ ntfs_err(sb, "$Secure::$SDH is corrupted.");
err = -EINVAL;
goto out;
}
err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err);
goto out;
+ }
attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME,
ARRAY_SIZE(SII_NAME), NULL, NULL);
- if (!attr) {
- err = -EINVAL;
- goto out;
- }
-
- root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
- if (root_sii->type != ATTR_ZERO ||
+ if (!attr ||
+ !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
+ root_sii->type != ATTR_ZERO ||
root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
- offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) {
+ offsetof(struct INDEX_ROOT, ihdr) +
+ le32_to_cpu(root_sii->ihdr.used) >
+ le32_to_cpu(attr->res.data_size)) {
+ ntfs_err(sb, "$Secure::$SII is corrupted.");
err = -EINVAL;
goto out;
}
err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err);
goto out;
+ }
fnd_sii = fnd_get();
if (!fnd_sii) {
@@ -2594,8 +2600,10 @@ static inline bool is_reserved_name(struct ntfs_sb_info *sbi,
if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) {
port_digit = le16_to_cpu(name[3]);
if (port_digit >= '1' && port_digit <= '9')
- if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) ||
- !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false))
+ if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase,
+ false) ||
+ !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase,
+ false))
return true;
}
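The reserved-record bitmap touched above is an in-memory field, so the native-endian bit helpers are sufficient there. A short sketch contrasting the two families, assuming the usual <linux/bitops.h> semantics:

/* Sketch: native-endian bit helpers for an in-memory bitmap. */
#include <linux/bitops.h>

static unsigned long demo_bitmap;	/* in-memory only, native byte order */

static void demo_reserve(unsigned int nr)
{
	__set_bit(nr, &demo_bitmap);
}

static unsigned int demo_first_free(unsigned int limit)
{
	return find_next_zero_bit(&demo_bitmap, limit, 0);
}

/*
 * A bitmap that is read from or written to disk in NTFS (little-endian)
 * layout would instead need __set_bit_le(), __clear_bit_le(),
 * find_next_zero_bit_le() and friends.
 */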
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 51ab75954640..0a48d2d67219 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -431,8 +431,9 @@ next_run:
if (vbo + blocksize > data_size)
nbits = 8 * (data_size - vbo);
- ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret)
- : false;
+ ok = nbits > from ?
+ (*fn)((ulong *)bh->b_data, from, nbits, ret) :
+ false;
put_bh(bh);
if (ok) {
@@ -725,9 +726,13 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
u32 e_size, e_key_len;
u32 end = le32_to_cpu(hdr->used);
u32 off = le32_to_cpu(hdr->de_off);
+ u32 total = le32_to_cpu(hdr->total);
u16 offs[128];
fill_table:
+ if (end > total)
+ return NULL;
+
if (off + sizeof(struct NTFS_DE) > end)
return NULL;
@@ -760,8 +765,7 @@ binary_search:
return NULL;
max_idx = 0;
- table_size = min(table_size * 2,
- (int)ARRAY_SIZE(offs));
+ table_size = min(table_size * 2, (int)ARRAY_SIZE(offs));
goto fill_table;
}
} else if (diff2 < 0) {
@@ -844,6 +848,10 @@ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr,
u32 off = PtrOffset(hdr, re);
int bytes = used - (off + esize);
+	/* Check that the INDEX_HDR is valid before using it. */
+ if (!check_index_header(hdr, le32_to_cpu(hdr->total)))
+ return NULL;
+
if (off >= used || esize < sizeof(struct NTFS_DE) ||
bytes < sizeof(struct NTFS_DE))
return NULL;
@@ -986,6 +994,7 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *a;
const struct INDEX_NAMES *in = &s_index_names[indx->type];
+ struct INDEX_ROOT *root;
a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL,
mi);
@@ -995,7 +1004,16 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
if (attr)
*attr = a;
- return resident_data_ex(a, sizeof(struct INDEX_ROOT));
+ root = resident_data_ex(a, sizeof(struct INDEX_ROOT));
+
+ /* length check */
+ if (root &&
+ offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) >
+ le32_to_cpu(a->res.data_size)) {
+ return NULL;
+ }
+
+ return root;
}
static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni,
@@ -1085,7 +1103,8 @@ ok:
}
/* check for index header length */
- if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) {
+ if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) >
+ bytes) {
err = -EINVAL;
goto out;
}
@@ -1151,8 +1170,10 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
/* Read next level. */
err = indx_read(indx, ni, de_get_vbn(e), &node);
- if (err)
+ if (err) {
+ /* io error? */
return err;
+ }
/* Lookup entry that is <= to the search value. */
e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
@@ -1654,9 +1675,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
mi->dirty = true;
/* Create alloc and bitmap attributes (if not). */
- err = run_is_empty(&indx->alloc_run)
- ? indx_create_allocate(indx, ni, &new_vbn)
- : indx_add_allocate(indx, ni, &new_vbn);
+ err = run_is_empty(&indx->alloc_run) ?
+ indx_create_allocate(indx, ni, &new_vbn) :
+ indx_add_allocate(indx, ni, &new_vbn);
/* Layout of record may be changed, so rescan root. */
root = indx_get_root(indx, ni, &attr, &mi);
@@ -1759,10 +1780,11 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
struct indx_node *n1 = fnd->nodes[level];
struct INDEX_HDR *hdr1 = &n1->index->ihdr;
struct INDEX_HDR *hdr2;
- u32 to_copy, used;
+ u32 to_copy, used, used1;
CLST new_vbn;
__le64 t_vbn, *sub_vbn;
u16 sp_size;
+ void *hdr1_saved = NULL;
/* Try the most easy case. */
e = fnd->level - 1 == level ? fnd->de[level] : NULL;
@@ -1795,6 +1817,13 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
return -ENOMEM;
memcpy(up_e, sp, sp_size);
+ used1 = le32_to_cpu(hdr1->used);
+ hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS);
+ if (!hdr1_saved) {
+ err = -ENOMEM;
+ goto out;
+ }
+
if (!hdr1->flags) {
up_e->flags |= NTFS_IE_HAS_SUBNODES;
up_e->size = cpu_to_le16(sp_size + sizeof(u64));
@@ -1827,7 +1856,7 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
hdr_insert_head(hdr2, de_t, to_copy);
/* Remove all entries (sp including) from hdr1. */
- used = le32_to_cpu(hdr1->used) - to_copy - sp_size;
+ used = used1 - to_copy - sp_size;
memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off));
hdr1->used = cpu_to_le32(used);
@@ -1838,9 +1867,9 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
hdr_insert_de(indx,
(*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size),
up_e + 1, le16_to_cpu(up_e->key_size),
- ctx) < 0
- ? hdr2
- : hdr1,
+ ctx) < 0 ?
+ hdr2 :
+ hdr1,
new_de, NULL, ctx);
indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);
@@ -1857,8 +1886,6 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
if (!level) {
/* Insert in root. */
err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0);
- if (err)
- goto out;
} else {
/*
* The target buffer's parent is another index buffer.
@@ -1866,12 +1893,20 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
*/
err = indx_insert_into_buffer(indx, ni, root, up_e, ctx,
level - 1, fnd);
- if (err)
- goto out;
+ }
+
+ if (err) {
+ /*
+ * Undo critical operations.
+ */
+ indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits);
+ memcpy(hdr1, hdr1_saved, used1);
+ indx_write(indx, ni, n1, 0);
}
out:
kfree(up_e);
+ kfree(hdr1_saved);
return err;
}
@@ -1930,16 +1965,12 @@ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
*/
err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx,
fnd, undo);
- if (err)
- goto out;
} else {
/*
* Found a leaf buffer, so we'll insert the new entry into it.
*/
err = indx_insert_into_buffer(indx, ni, root, new_de, ctx,
fnd->level - 1, fnd);
- if (err)
- goto out;
}
out:
@@ -2308,8 +2339,8 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
err = level ? indx_insert_into_buffer(indx, ni, root,
re, ctx,
fnd->level - 1,
- fnd)
- : indx_insert_into_root(indx, ni, re, e,
+ fnd) :
+ indx_insert_into_root(indx, ni, re, e,
ctx, fnd, 0);
kfree(re);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 309d9b46b5d5..6c560245eef4 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -100,6 +100,12 @@ static struct inode *ntfs_read_mft(struct inode *inode,
/* Record should contain $I30 root. */
is_dir = rec->flags & RECORD_FLAG_DIR;
+ /* MFT_REC_MFT is not a dir */
+ if (is_dir && ino == MFT_REC_MFT) {
+ err = -EINVAL;
+ goto out;
+ }
+
inode->i_generation = le16_to_cpu(rec->seq);
/* Enumerate all struct Attributes MFT. */
@@ -131,7 +137,13 @@ next_attr:
rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
asize = le32_to_cpu(attr->size);
- if (le16_to_cpu(attr->name_off) + attr->name_len > asize)
+ /*
+	 * This check was already done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr',
+	 * so it is not critical to check this case again.
+ */
+ if (attr->name_len &&
+ sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) >
+ asize)
goto out;
if (attr->non_res) {
@@ -250,8 +262,8 @@ next_attr:
if (!attr->nres.alloc_size)
goto next_attr;
- run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run
- : &ni->file.run;
+ run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run :
+ &ni->file.run;
break;
case ATTR_ROOT:
@@ -259,7 +271,6 @@ next_attr:
goto out;
root = Add2Ptr(attr, roff);
- is_root = true;
if (attr->name_len != ARRAY_SIZE(I30_NAME) ||
memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
@@ -272,15 +283,16 @@ next_attr:
if (!is_dir)
goto next_attr;
+ is_root = true;
ni->ni_flags |= NI_FLAG_DIR;
err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
if (err)
goto out;
- mode = sb->s_root
- ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv))
- : (S_IFDIR | 0777);
+ mode = sb->s_root ?
+ (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) :
+ (S_IFDIR | 0777);
goto next_attr;
case ATTR_ALLOC:
@@ -437,8 +449,8 @@ end_enum:
ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
inode->i_op = &ntfs_file_inode_operations;
inode->i_fop = &ntfs_file_operations;
- inode->i_mapping->a_ops =
- is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+ inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
+ &ntfs_aops;
if (ino != MFT_REC_MFT)
init_rwsem(&ni->file.run_lock);
} else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
@@ -636,6 +648,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
set_bh_page(bh, page, off);
+
err = bh_read(bh, 0);
if (err < 0)
goto out;
@@ -773,8 +786,8 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
ret = blockdev_direct_IO(iocb, inode, iter,
- wr ? ntfs_get_block_direct_IO_W
- : ntfs_get_block_direct_IO_R);
+ wr ? ntfs_get_block_direct_IO_W :
+ ntfs_get_block_direct_IO_R);
if (ret > 0)
end = vbo + ret;
@@ -833,7 +846,7 @@ out:
}
static int ntfs_resident_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+ struct writeback_control *wbc, void *data)
{
struct address_space *mapping = data;
struct ntfs_inode *ni = ntfs_i(mapping->host);
@@ -874,8 +887,8 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
*pagep = NULL;
if (is_resident(ni)) {
- struct page *page = grab_cache_page_write_begin(
- mapping, pos >> PAGE_SHIFT);
+ struct page *page =
+ grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT);
if (!page) {
err = -ENOMEM;
@@ -907,9 +920,8 @@ out:
/*
* ntfs_write_end - Address_space_operations::write_end.
*/
-int ntfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 copied, struct page *page,
- void *fsdata)
+int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+ u32 len, u32 copied, struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
@@ -1307,8 +1319,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
inode_init_owner(idmap, inode, dir, mode);
mode = inode->i_mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime =
- current_time(inode);
+ ni->i_crtime = current_time(inode);
rec = ni->mi.mrec;
rec->hard_links = cpu_to_le16(1);
@@ -1349,10 +1360,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
attr->res.data_size = cpu_to_le32(dsize);
std5->cr_time = std5->m_time = std5->c_time = std5->a_time =
- kernel2nt(&inode->i_atime);
+ kernel2nt(&ni->i_crtime);
- ni->std_fa = fa;
- std5->fa = fa;
+ std5->fa = ni->std_fa = fa;
attr = Add2Ptr(attr, asize);
@@ -1551,11 +1561,15 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
}
asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
+ /* Write non resident data. */
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp,
+ nsize, 0);
+ if (err)
+ goto out5;
} else {
attr->res.data_off = SIZEOF_RESIDENT_LE;
attr->res.data_size = cpu_to_le32(nsize);
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
- nsize = 0;
}
/* Size of symlink equals the length of input string. */
inode->i_size = size;
@@ -1576,19 +1590,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8);
rec->next_attr_id = cpu_to_le16(aid);
- /* Step 2: Add new name in index. */
- err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
- if (err)
- goto out6;
-
- /* Unlock parent directory before ntfs_init_acl. */
- if (!fnd)
- ni_unlock(dir_ni);
-
inode->i_generation = le16_to_cpu(rec->seq);
- dir->i_mtime = dir->i_ctime = inode->i_atime;
-
if (S_ISDIR(mode)) {
inode->i_op = &ntfs_dir_inode_operations;
inode->i_fop = &ntfs_dir_operations;
@@ -1601,8 +1604,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
} else if (S_ISREG(mode)) {
inode->i_op = &ntfs_file_inode_operations;
inode->i_fop = &ntfs_file_operations;
- inode->i_mapping->a_ops =
- is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+ inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
+ &ntfs_aops;
init_rwsem(&ni->file.run_lock);
} else {
inode->i_op = &ntfs_special_inode_operations;
@@ -1613,41 +1616,58 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap,
if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
err = ntfs_init_acl(idmap, inode, dir);
if (err)
- goto out7;
+ goto out5;
} else
#endif
{
inode->i_flags |= S_NOSEC;
}
- /* Write non resident data. */
- if (nsize) {
- err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0);
- if (err)
- goto out7;
+ /*
+	 * ntfs_init_acl and ntfs_save_wsl_perm update the extended attribute.
+	 * The packed size of the extended attribute is stored in the direntry too.
+	 * 'fname' here points inside new_de.
+ */
+ ntfs_save_wsl_perm(inode, &fname->dup.ea_size);
+
+ /*
+	 * Update ea_size in the file_name attribute too.
+	 * Use ni_find_attr because the layout of the MFT record may be changed
+	 * by ntfs_init_acl and ntfs_save_wsl_perm.
+ */
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
+ if (attr) {
+ struct ATTR_FILE_NAME *fn;
+
+ fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (fn)
+ fn->dup.ea_size = fname->dup.ea_size;
}
+	/* We do not need to update the parent directory later. */
+ ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT;
+
+ /* Step 2: Add new name in index. */
+ err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
+ if (err)
+ goto out6;
+
/*
* Call 'd_instantiate' after inode->i_op is set
* but before finish_open.
*/
d_instantiate(dentry, inode);
- ntfs_save_wsl_perm(inode);
+ /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
+ inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime =
+ dir->i_ctime = ni->i_crtime;
+
mark_inode_dirty(dir);
mark_inode_dirty(inode);
/* Normal exit. */
goto out2;
-out7:
-
- /* Undo 'indx_insert_entry'. */
- if (!fnd)
- ni_lock_dir(dir_ni);
- indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
- le16_to_cpu(new_de->key_size), sbi);
- /* ni_unlock(dir_ni); will be called later. */
out6:
if (rp_inserted)
ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
@@ -1669,11 +1689,11 @@ out2:
kfree(rp);
out1:
- if (err) {
- if (!fnd)
- ni_unlock(dir_ni);
+ if (!fnd)
+ ni_unlock(dir_ni);
+
+ if (err)
return ERR_PTR(err);
- }
unlock_new_inode(inode);
@@ -1770,9 +1790,6 @@ void ntfs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_nlink)
- _ni_write_inode(inode, inode_needs_sync(inode));
-
invalidate_inode_buffers(inode);
clear_inode(inode);
@@ -2057,7 +2074,6 @@ const struct inode_operations ntfs_link_inode_operations = {
.get_link = ntfs_get_link,
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
- .permission = ntfs_permission,
};
const struct address_space_operations ntfs_aops = {
diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c
index 28f654561f27..61e161c7c567 100644
--- a/fs/ntfs3/lznt.c
+++ b/fs/ntfs3/lznt.c
@@ -296,8 +296,8 @@ next:
*/
struct lznt *get_lznt_ctx(int level)
{
- struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash)
- : sizeof(struct lznt),
+ struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) :
+ sizeof(struct lznt),
GFP_NOFS);
if (r)
@@ -392,9 +392,9 @@ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
unc_use = err;
} else {
/* This chunk does not contain compressed data. */
- unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end
- ? unc_end - unc_chunk
- : LZNT_CHUNK_SIZE;
+ unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ?
+ unc_end - unc_chunk :
+ LZNT_CHUNK_SIZE;
if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
cmpr_end) {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 407fe92394e2..9736b1e4a0f6 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -88,6 +88,16 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
__putname(uni);
}
+ /*
+	 * Check for a NULL pointer.
+	 * If the MFT record of the ntfs inode is not a base record, inode->i_op can be NULL.
+	 * That would cause a NULL pointer dereference in d_splice_alias().
+ */
+ if (!IS_ERR_OR_NULL(inode) && !inode->i_op) {
+ iput(inode);
+ inode = ERR_PTR(-EINVAL);
+ }
+
return d_splice_alias(inode, dentry);
}
@@ -423,8 +433,8 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0,
NULL, 0, fnd);
- err = IS_ERR(inode) ? PTR_ERR(inode)
- : finish_open(file, dentry, ntfs_file_open);
+ err = IS_ERR(inode) ? PTR_ERR(inode) :
+ finish_open(file, dentry, ntfs_file_open);
dput(d);
out2:
@@ -597,8 +607,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
.rmdir = ntfs_rmdir,
.mknod = ntfs_mknod,
.rename = ntfs_rename,
- .permission = ntfs_permission,
- .get_inode_acl = ntfs_get_acl,
+ .get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
@@ -611,7 +620,7 @@ const struct inode_operations ntfs_special_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
- .get_inode_acl = ntfs_get_acl,
+ .get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 86ea1826d099..90151e56c122 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -435,9 +435,6 @@ static inline u64 attr_svcn(const struct ATTRIB *attr)
return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0;
}
-/* The size of resident attribute by its resident size. */
-#define BYTES_PER_RESIDENT(b) (0x18 + (b))
-
static_assert(sizeof(struct ATTRIB) == 0x48);
static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08);
static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 80072e5f96f7..eb01f7e76479 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -100,7 +100,6 @@ struct ntfs_mount_options {
unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
unsigned force : 1; /* RW mount dirty volume. */
- unsigned noacsrules : 1; /* Exclude acs rules. */
unsigned prealloc : 1; /* Preallocate space when file is growing. */
unsigned nocase : 1; /* case insensitive. */
};
@@ -164,7 +163,6 @@ struct wnd_bitmap {
size_t zone_bit;
size_t zone_end;
- bool set_tail; // Not necessary in driver.
bool inited;
};
@@ -340,7 +338,7 @@ enum ntfs_inode_mutex_lock_class {
};
/*
- * sturct ntfs_inode
+ * struct ntfs_inode
*
* Ntfs inode - extends linux inode. consists of one or more MFT inodes.
*/
@@ -581,6 +579,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
bool ni_is_dirty(struct inode *inode);
/* Globals from fslog.c */
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
int log_replay(struct ntfs_inode *ni, bool *initialized);
/* Globals from fsntfs.c */
@@ -700,9 +699,8 @@ int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
int ntfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, u32 len, struct page **pagep, void **fsdata);
-int ntfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 copied, struct page *page,
- void *fsdata);
+int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+ u32 len, u32 copied, struct page *page, void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
@@ -858,23 +856,22 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, int type);
int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
struct posix_acl *acl, int type);
int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
- struct inode *dir);
+ struct inode *dir);
#else
#define ntfs_get_acl NULL
#define ntfs_set_acl NULL
#endif
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
-int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode,
- int mask);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
extern const struct xattr_handler *ntfs_xattr_handlers[];
-int ntfs_save_wsl_perm(struct inode *inode);
+int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
/* globals from lznt.c */
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index defce6a5c8e1..2a281cead2bc 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -221,7 +221,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
}
if (off + asize < off) {
- /* overflow check */
+ /* Overflow check. */
return NULL;
}
@@ -247,8 +247,8 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if ((t32 & 0xf) || (t32 > 0x100))
return NULL;
- /* Check boundary. */
- if (off + asize > used)
+ /* Check overflow and boundary. */
+ if (off + asize < off || off + asize > used)
return NULL;
/* Check size of attribute. */
@@ -419,10 +419,9 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
struct ntfs_sb_info *sbi = mi->sbi;
u32 used = le32_to_cpu(rec->used);
const u16 *upcase = sbi->upcase;
- int diff;
/* Can we insert mi attribute? */
- if (used + asize > mi->sbi->record_size)
+ if (used + asize > sbi->record_size)
return NULL;
/*
@@ -431,7 +430,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
*/
attr = NULL;
while ((attr = mi_enum_attr(mi, attr))) {
- diff = compare_attr(attr, type, name, name_len, upcase);
+ int diff = compare_attr(attr, type, name, name_len, upcase);
if (diff < 0)
continue;
@@ -442,9 +441,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
}
if (!attr) {
- tail = 8; /* Not used, just to suppress warning. */
+ /* Append. */
+ tail = 8;
attr = Add2Ptr(rec, used - 8);
} else {
+ /* Insert before 'attr'. */
tail = used - PtrOffset(rec, attr);
}
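The extra "off + asize < off" test above is the standard unsigned-wraparound guard. A small sketch of the idiom, with hypothetical demo_* names:

/* Sketch: advancing an offset with an explicit wraparound check. */
#include <linux/types.h>

static bool demo_advance(u32 *off, u32 asize, u32 used)
{
	u32 next = *off + asize;

	if (next < *off)
		return false;	/* sum wrapped past U32_MAX: corrupt record */
	if (next > used)
		return false;	/* attribute runs past the record */
	*off = next;
	return true;
}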
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index a5af71cd8d14..47612d16c027 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -433,9 +433,9 @@ requires_new_range:
should_add_tail = Tovcn < r->len;
if (should_add_tail) {
- tail_lcn = r->lcn == SPARSE_LCN
- ? SPARSE_LCN
- : (r->lcn + Tovcn);
+ tail_lcn = r->lcn == SPARSE_LCN ?
+ SPARSE_LCN :
+ (r->lcn + Tovcn);
tail_vcn = r->vcn + Tovcn;
tail_len = r->len - Tovcn;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index ef4ea3f21905..5158dd31fd97 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -39,10 +39,10 @@
* To mount large volumes as ntfs one should use large cluster size (up to 2M)
* The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P
*
- * ntfs limits, cluster size is 2M (2^31)
+ * ntfs limits, cluster size is 2M (2^21)
* -----------------------------------------------------------------------------
- * | < 8P, 2^54 | < 2^32 | yes | yes | yes | yes | yes |
- * | > 8P, 2^54 | > 2^32 | no | no | yes | yes | yes |
+ * | < 8P, 2^53 | < 2^32 | yes | yes | yes | yes | yes |
+ * | > 8P, 2^53 | > 2^32 | no | no | yes | yes | yes |
* ----------------------------------------------------------|------------------
*
*/
@@ -115,9 +115,9 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
return;
/* Use static allocated buffer, if possible. */
- name = atomic_dec_and_test(&s_name_buf_cnt)
- ? s_name_buf
- : kmalloc(sizeof(s_name_buf), GFP_NOFS);
+ name = atomic_dec_and_test(&s_name_buf_cnt) ?
+ s_name_buf :
+ kmalloc(sizeof(s_name_buf), GFP_NOFS);
if (name) {
struct dentry *de = d_find_alias(inode);
@@ -253,7 +253,6 @@ enum Opt {
Opt_acl,
Opt_iocharset,
Opt_prealloc,
- Opt_noacsrules,
Opt_nocase,
Opt_err,
};
@@ -271,12 +270,11 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = {
fsparam_flag_no("hidden", Opt_nohidden),
fsparam_flag_no("hide_dot_files", Opt_hide_dot_files),
fsparam_flag_no("windows_names", Opt_windows_names),
- fsparam_flag_no("acl", Opt_acl),
fsparam_flag_no("showmeta", Opt_showmeta),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_string("iocharset", Opt_iocharset),
fsparam_flag_no("prealloc", Opt_prealloc),
- fsparam_flag_no("acsrules", Opt_noacsrules),
fsparam_flag_no("nocase", Opt_nocase),
- fsparam_string("iocharset", Opt_iocharset),
{}
};
@@ -366,19 +364,20 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_windows_names:
opts->windows_names = result.negated ? 0 : 1;
break;
+ case Opt_showmeta:
+ opts->showmeta = result.negated ? 0 : 1;
+ break;
case Opt_acl:
if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
fc->sb_flags |= SB_POSIXACL;
#else
- return invalf(fc, "ntfs3: Support for ACL not compiled in!");
+ return invalf(
+ fc, "ntfs3: Support for ACL not compiled in!");
#endif
else
fc->sb_flags &= ~SB_POSIXACL;
break;
- case Opt_showmeta:
- opts->showmeta = result.negated ? 0 : 1;
- break;
case Opt_iocharset:
kfree(opts->nls_name);
opts->nls_name = param->string;
@@ -387,9 +386,6 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_prealloc:
opts->prealloc = result.negated ? 0 : 1;
break;
- case Opt_noacsrules:
- opts->noacsrules = result.negated ? 1 : 0;
- break;
case Opt_nocase:
opts->nocase = result.negated ? 1 : 0;
break;
@@ -409,24 +405,29 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
- errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+ errorf(fc,
+ "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
return -EINVAL;
}
new_opts->nls = ntfs_load_nls(new_opts->nls_name);
if (IS_ERR(new_opts->nls)) {
new_opts->nls = NULL;
- errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name);
+ errorf(fc, "ntfs3: Cannot load iocharset %s",
+ new_opts->nls_name);
return -EINVAL;
}
if (new_opts->nls != sbi->options->nls)
- return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!");
+ return invalf(
+ fc,
+ "ntfs3: Cannot use different iocharset when remounting!");
sync_filesystem(sb);
if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
!new_opts->force) {
- errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!");
+ errorf(fc,
+ "ntfs3: Volume is dirty and \"force\" flag is not set!");
return -EINVAL;
}
@@ -544,40 +545,38 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
struct ntfs_mount_options *opts = sbi->options;
struct user_namespace *user_ns = seq_user_ns(m);
- seq_printf(m, ",uid=%u",
- from_kuid_munged(user_ns, opts->fs_uid));
- seq_printf(m, ",gid=%u",
- from_kgid_munged(user_ns, opts->fs_gid));
- if (opts->fmask)
- seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
+ seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid));
+ seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid));
if (opts->dmask)
seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff);
- if (opts->nls)
- seq_printf(m, ",iocharset=%s", opts->nls->charset);
- else
- seq_puts(m, ",iocharset=utf8");
+ if (opts->fmask)
+ seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
if (opts->sys_immutable)
seq_puts(m, ",sys_immutable");
if (opts->discard)
seq_puts(m, ",discard");
+ if (opts->force)
+ seq_puts(m, ",force");
if (opts->sparse)
seq_puts(m, ",sparse");
- if (opts->showmeta)
- seq_puts(m, ",showmeta");
if (opts->nohidden)
seq_puts(m, ",nohidden");
- if (opts->windows_names)
- seq_puts(m, ",windows_names");
if (opts->hide_dot_files)
seq_puts(m, ",hide_dot_files");
- if (opts->force)
- seq_puts(m, ",force");
- if (opts->noacsrules)
- seq_puts(m, ",noacsrules");
- if (opts->prealloc)
- seq_puts(m, ",prealloc");
+ if (opts->windows_names)
+ seq_puts(m, ",windows_names");
+ if (opts->showmeta)
+ seq_puts(m, ",showmeta");
if (sb->s_flags & SB_POSIXACL)
seq_puts(m, ",acl");
+ if (opts->nls)
+ seq_printf(m, ",iocharset=%s", opts->nls->charset);
+ else
+ seq_puts(m, ",iocharset=utf8");
+ if (opts->prealloc)
+ seq_puts(m, ",prealloc");
+ if (opts->nocase)
+ seq_puts(m, ",nocase");
return 0;
}
@@ -706,7 +705,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
if (boot->sectors_per_clusters <= 0x80)
return boot->sectors_per_clusters;
if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
- return 1U << -(s8)boot->sectors_per_clusters;
+ return 1U << (-(s8)boot->sectors_per_clusters);
return -EINVAL;
}
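
For readers unfamiliar with the on-disk encoding handled above: sectors_per_clusters values up to 0x80 are a literal sector count, values of 0xf4 and above are a signed byte holding a negative power-of-two shift (how NTFS expresses clusters larger than 64K), and anything in between is rejected as -EINVAL. A minimal decoding sketch follows, not part of this patch; the "2M" comment assumes 512-byte sectors. The record_size and index_size bytes later in this file use the same signed-byte convention (negative means a byte-size shift, non-negative means a cluster count).

static u32 example_sectors_per_cluster(u8 v)
{
	if (v <= 0x80)
		return v;		/* literal count: 1, 2, 4, ... 128 */
	if (v >= 0xf4)
		/* signed shift: 0xf4 == (s8)-12, so 1 << 12 == 4096 sectors == 2M */
		return 1U << (-(s8)v);
	return 0;			/* 0x81..0xf3: invalid on real volumes */
}
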
@@ -724,6 +723,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
struct buffer_head *bh;
struct MFT_REC *rec;
u16 fn, ao;
+ u8 cluster_bits;
sbi->volume.blocks = dev_size >> PAGE_SHIFT;
@@ -734,48 +734,81 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
err = -EINVAL;
boot = (struct NTFS_BOOT *)bh->b_data;
- if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1))
+ if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) {
+ ntfs_err(sb, "Boot's signature is not NTFS.");
goto out;
+ }
	/* 0x55AA is not mandatory. Thanks Maxim Suhanov */
/*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
* goto out;
*/
- boot_sector_size = (u32)boot->bytes_per_sector[1] << 8;
- if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE ||
+ boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) |
+ boot->bytes_per_sector[0];
+ if (boot_sector_size < SECTOR_SIZE ||
!is_power_of_2(boot_sector_size)) {
+ ntfs_err(sb, "Invalid bytes per sector %u.", boot_sector_size);
goto out;
}
/* cluster size: 512, 1K, 2K, 4K, ... 2M */
sct_per_clst = true_sectors_per_clst(boot);
- if ((int)sct_per_clst < 0)
- goto out;
- if (!is_power_of_2(sct_per_clst))
+ if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) {
+ ntfs_err(sb, "Invalid sectors per cluster %u.", sct_per_clst);
goto out;
+ }
+
+ sbi->cluster_size = boot_sector_size * sct_per_clst;
+ sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size);
+ sbi->cluster_mask = sbi->cluster_size - 1;
+ sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
mlcn = le64_to_cpu(boot->mft_clst);
mlcn2 = le64_to_cpu(boot->mft2_clst);
sectors = le64_to_cpu(boot->sectors_per_volume);
- if (mlcn * sct_per_clst >= sectors)
+ if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) {
+ ntfs_err(
+ sb,
+ "Start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.",
+ mlcn, mlcn2, sectors);
goto out;
+ }
- if (mlcn2 * sct_per_clst >= sectors)
- goto out;
+ sbi->record_size = record_size =
+ boot->record_size < 0 ? 1 << (-boot->record_size) :
+ (u32)boot->record_size << cluster_bits;
+ sbi->record_bits = blksize_bits(record_size);
+ sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
/* Check MFT record size. */
- if ((boot->record_size < 0 &&
- SECTOR_SIZE > (2U << (-boot->record_size))) ||
- (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+ if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) {
+ ntfs_err(sb, "Invalid bytes per MFT record %u (%d).",
+ record_size, boot->record_size);
+ goto out;
+ }
+
+ if (record_size > MAXIMUM_BYTES_PER_MFT) {
+ ntfs_err(sb, "Unsupported bytes per MFT record %u.",
+ record_size);
goto out;
}
+ sbi->index_size = boot->index_size < 0 ?
+ 1u << (-boot->index_size) :
+ (u32)boot->index_size << cluster_bits;
+
/* Check index record size. */
- if ((boot->index_size < 0 &&
- SECTOR_SIZE > (2U << (-boot->index_size))) ||
- (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
+ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) {
+ ntfs_err(sb, "Invalid bytes per index %u(%d).", sbi->index_size,
+ boot->index_size);
+ goto out;
+ }
+
+ if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) {
+ ntfs_err(sb, "Unsupported bytes per index %u.",
+ sbi->index_size);
goto out;
}
@@ -791,53 +824,36 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
if (boot_sector_size != sector_size) {
ntfs_warn(
sb,
- "Different NTFS' sector size (%u) and media sector size (%u)",
+ "Different NTFS sector size (%u) and media sector size (%u).",
boot_sector_size, sector_size);
dev_size += sector_size - 1;
}
- sbi->cluster_size = boot_sector_size * sct_per_clst;
- sbi->cluster_bits = blksize_bits(sbi->cluster_size);
-
- sbi->mft.lbo = mlcn << sbi->cluster_bits;
- sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
+ sbi->mft.lbo = mlcn << cluster_bits;
+ sbi->mft.lbo2 = mlcn2 << cluster_bits;
/* Compare boot's cluster and sector. */
- if (sbi->cluster_size < boot_sector_size)
+ if (sbi->cluster_size < boot_sector_size) {
+ ntfs_err(sb, "Invalid bytes per cluster (%u).",
+ sbi->cluster_size);
goto out;
+ }
/* Compare boot's cluster and media sector. */
if (sbi->cluster_size < sector_size) {
/* No way to use ntfs_get_block in this case. */
ntfs_err(
sb,
- "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)",
+ "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).",
sbi->cluster_size, sector_size);
goto out;
}
- sbi->cluster_mask = sbi->cluster_size - 1;
- sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
- sbi->record_size = record_size = boot->record_size < 0
- ? 1 << (-boot->record_size)
- : (u32)boot->record_size
- << sbi->cluster_bits;
-
- if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
- goto out;
-
- sbi->record_bits = blksize_bits(record_size);
- sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
-
sbi->max_bytes_per_attr =
record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) -
ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) -
ALIGN(sizeof(enum ATTR_TYPE), 8);
- sbi->index_size = boot->index_size < 0
- ? 1u << (-boot->index_size)
- : (u32)boot->index_size << sbi->cluster_bits;
-
sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
/* Warning if RAW volume. */
@@ -847,18 +863,18 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
gb0 = format_size_gb(dev_size, &mb0);
ntfs_warn(
sb,
- "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only",
+ "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.",
gb, mb, gb0, mb0);
sb->s_flags |= SB_RDONLY;
}
- clusters = sbi->volume.size >> sbi->cluster_bits;
+ clusters = sbi->volume.size >> cluster_bits;
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
/* 32 bits per cluster. */
if (clusters >> 32) {
ntfs_notice(
sb,
- "NTFS %u.%02u Gb is too big to use 32 bits per cluster",
+ "NTFS %u.%02u Gb is too big to use 32 bits per cluster.",
gb, mb);
goto out;
}
@@ -892,17 +908,17 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits;
/* Maximum size for normal files. */
- sbi->maxbytes = (clusters << sbi->cluster_bits) - 1;
+ sbi->maxbytes = (clusters << cluster_bits) - 1;
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
- if (clusters >= (1ull << (64 - sbi->cluster_bits)))
+ if (clusters >= (1ull << (64 - cluster_bits)))
sbi->maxbytes = -1;
sbi->maxbytes_sparse = -1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
#else
/* Maximum size for sparse file. */
- sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
- sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
+ sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1;
+ sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits;
#endif
/*
@@ -910,7 +926,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
* It would be nice if we are able to allocate 1/8 of
 * total clusters for MFT but not more than 512 MB.
*/
- sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
+ sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3);
err = 0;
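
As a worked example of the zone_max cap just above (illustrative numbers, not taken from the patch): with 4K clusters, cluster_bits is 12, so 0x20000000 >> 12 == 0x20000 clusters, i.e. exactly 512 MB of MFT zone. A 1 TiB volume has 0x10000000 clusters, so clusters >> 3 == 0x2000000 and the 512 MB cap wins; a 1 GiB volume has only 0x40000 clusters, so clusters >> 3 == 0x8000 (128 MB) and the 1/8 rule wins.
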
@@ -928,6 +944,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
+ struct ntfs_mount_options *options;
struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt, bad_len, bad_frags;
@@ -942,7 +959,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.high = 0;
sbi->sb = sb;
- sbi->options = fc->fs_private;
+ sbi->options = options = fc->fs_private;
fc->fs_private = NULL;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
@@ -950,12 +967,12 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ntfs_export_ops;
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
- sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL;
+ sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;
- sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
- if (IS_ERR(sbi->options->nls)) {
- sbi->options->nls = NULL;
- errorf(fc, "Cannot load nls %s", sbi->options->nls_name);
+ options->nls = ntfs_load_nls(options->nls_name);
+ if (IS_ERR(options->nls)) {
+ options->nls = NULL;
+ errorf(fc, "Cannot load nls %s", options->nls_name);
err = -EINVAL;
goto out;
}
@@ -980,8 +997,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_VOL);
inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $Volume.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $Volume (%d).", err);
goto out;
}
@@ -1007,13 +1024,9 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
}
attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
- if (!attr || is_attr_ext(attr)) {
- err = -EINVAL;
- goto put_inode_out;
- }
-
- info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
- if (!info) {
+ if (!attr || is_attr_ext(attr) ||
+ !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) {
+ ntfs_err(sb, "$Volume is corrupted.");
err = -EINVAL;
goto put_inode_out;
}
@@ -1028,13 +1041,13 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_MIRR);
inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $MFTMirr.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $MFTMirr (%d).", err);
goto out;
}
- sbi->mft.recs_mirr =
- ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits;
+ sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >>
+ sbi->record_bits;
iput(inode);
@@ -1043,8 +1056,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_LOG);
inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load \x24LogFile.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load \x24LogFile (%d).", err);
goto out;
}
@@ -1064,7 +1077,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto out;
}
} else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
- if (!sb_rdonly(sb) && !sbi->options->force) {
+ if (!sb_rdonly(sb) && !options->force) {
ntfs_warn(
sb,
"volume is dirty and \"force\" flag is not set!");
@@ -1079,8 +1092,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
inode = ntfs_iget5(sb, &ref, &NAME_MFT);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $MFT.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $MFT (%d).", err);
goto out;
}
@@ -1095,8 +1108,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto put_inode_out;
err = ni_load_all_mi(ni);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err);
goto put_inode_out;
+ }
sbi->mft.ni = ni;
@@ -1105,8 +1120,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_BITMAP);
inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $Bitmap.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $Bitmap (%d).", err);
goto out;
}
@@ -1120,22 +1135,25 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Check bitmap boundary. */
tt = sbi->used.bitmap.nbits;
if (inode->i_size < bitmap_size(tt)) {
+ ntfs_err(sb, "$Bitmap is corrupted.");
err = -EINVAL;
goto put_inode_out;
}
- /* Not necessary. */
- sbi->used.bitmap.set_tail = true;
err = wnd_init(&sbi->used.bitmap, sb, tt);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err);
goto put_inode_out;
+ }
iput(inode);
/* Compute the MFT zone. */
err = ntfs_refresh_zone(sbi);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to initialize MFT zone (%d).", err);
goto out;
+ }
/* Load $BadClus. */
ref.low = cpu_to_le32(MFT_REC_BADCLUST);
@@ -1180,15 +1198,23 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_ATTR);
inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $AttrDef (%d)", err);
goto out;
}
- if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
+ /*
+ * Typical $AttrDef contains up to 20 entries.
+ * Check for extremely large/small size.
+ */
+ if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) ||
+ inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) {
+ ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).",
+ inode->i_size);
err = -EINVAL;
goto put_inode_out;
}
+
bytes = inode->i_size;
sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
if (!t) {
@@ -1202,6 +1228,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (IS_ERR(page)) {
err = PTR_ERR(page);
+ ntfs_err(sb, "Failed to read $AttrDef (%d).", err);
goto put_inode_out;
}
memcpy(Add2Ptr(t, done), page_address(page),
@@ -1209,6 +1236,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ntfs_unmap_page(page);
if (!idx && ATTR_STD != t->type) {
+ ntfs_err(sb, "$AttrDef is corrupted.");
err = -EINVAL;
goto put_inode_out;
}
@@ -1243,13 +1271,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.seq = cpu_to_le16(MFT_REC_UPCASE);
inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $UpCase.");
err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $UpCase (%d).", err);
goto out;
}
if (inode->i_size != 0x10000 * sizeof(short)) {
err = -EINVAL;
+ ntfs_err(sb, "$UpCase is corrupted.");
goto put_inode_out;
}
@@ -1260,6 +1289,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (IS_ERR(page)) {
err = PTR_ERR(page);
+ ntfs_err(sb, "Failed to read $UpCase (%d).", err);
goto put_inode_out;
}
@@ -1285,23 +1315,31 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (is_ntfs3(sbi)) {
/* Load $Secure. */
err = ntfs_security_init(sbi);
- if (err)
+ if (err) {
+ ntfs_err(sb, "Failed to initialize $Secure (%d).", err);
goto out;
+ }
/* Load $Extend. */
err = ntfs_extend_init(sbi);
- if (err)
+ if (err) {
+ ntfs_warn(sb, "Failed to initialize $Extend.");
goto load_root;
+ }
- /* Load $Extend\$Reparse. */
+ /* Load $Extend/$Reparse. */
err = ntfs_reparse_init(sbi);
- if (err)
+ if (err) {
+ ntfs_warn(sb, "Failed to initialize $Extend/$Reparse.");
goto load_root;
+ }
- /* Load $Extend\$ObjId. */
+ /* Load $Extend/$ObjId. */
err = ntfs_objid_init(sbi);
- if (err)
+ if (err) {
+ ntfs_warn(sb, "Failed to initialize $Extend/$ObjId.");
goto load_root;
+ }
}
load_root:
@@ -1309,12 +1347,21 @@ load_root:
ref.low = cpu_to_le32(MFT_REC_ROOT);
ref.seq = cpu_to_le16(MFT_REC_ROOT);
inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
- if (IS_ERR(inode) || !inode->i_op) {
- ntfs_err(sb, "Failed to load root.");
- err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load root (%d).", err);
goto out;
}
+ /*
+	 * Final check. Looks like this case should never occur.
+ */
+ if (!inode->i_op) {
+ err = -EINVAL;
+ ntfs_err(sb, "Failed to load root (%d).", err);
+ goto put_inode_out;
+ }
+
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
err = -ENOMEM;
@@ -1434,7 +1481,7 @@ static const struct fs_context_operations ntfs_context_ops = {
};
/*
- * ntfs_init_fs_context - Initialize spi and opts
+ * ntfs_init_fs_context - Initialize sbi and opts
*
 * This will be called on mount/remount. We will first initialize
 * options so that on remount we can reuse them.
@@ -1507,7 +1554,8 @@ static int __init init_ntfs_fs(void)
if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
- pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
+ pr_notice(
+ "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
@@ -1550,7 +1598,9 @@ MODULE_DESCRIPTION("ntfs3 read/write filesystem");
MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support");
#endif
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
-MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this");
+MODULE_INFO(
+ cluster,
+ "Warning: Activated 64 bits per cluster. Windows does not support this");
#endif
#ifdef CONFIG_NTFS3_LZX_XPRESS
MODULE_INFO(compression, "Read-only lzx/xpress compression included");
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7324cf924932..c3de60a4543f 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -23,8 +23,8 @@
static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
{
- return ea->size ? le32_to_cpu(ea->size)
- : ALIGN(struct_size(ea, name,
+ return ea->size ? le32_to_cpu(ea->size) :
+ ALIGN(struct_size(ea, name,
1 + ea->name_len +
le16_to_cpu(ea->elength)),
4);
@@ -296,7 +296,8 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
- size_t val_size, int flags, bool locked)
+ size_t val_size, int flags, bool locked,
+ __le16 *ea_size)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -410,7 +411,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
/*
* 1. Check ea_info.size_pack for overflow.
- * 2. New attibute size must fit value from $AttrDef
+ * 2. New attribute size must fit value from $AttrDef
*/
if (new_pack > 0xffff || size > sbi->ea_max_size) {
ntfs_inode_warn(
@@ -504,6 +505,8 @@ update_ea:
if (ea_info.size_pack != size_pack)
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ if (ea_size)
+ *ea_size = ea_info.size_pack;
mark_inode_dirty(&ni->vfs_inode);
out:
@@ -517,9 +520,14 @@ out:
}
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
- int locked)
+
+/*
+ * ntfs_get_acl - inode_operations::get_acl
+ */
+struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, int type)
{
+ struct inode *inode = d_inode(dentry);
struct ntfs_inode *ni = ntfs_i(inode);
const char *name;
size_t name_len;
@@ -542,13 +550,11 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1;
}
- if (!locked)
- ni_lock(ni);
+ ni_lock(ni);
err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req);
- if (!locked)
- ni_unlock(ni);
+ ni_unlock(ni);
/* Translate extended attribute to acl. */
if (err >= 0) {
@@ -567,17 +573,6 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
return acl;
}
-/*
- * ntfs_get_acl - inode_operations::get_acl
- */
-struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
-{
- if (rcu)
- return ERR_PTR(-ECHILD);
-
- return ntfs_get_acl_ex(inode, type, 0);
-}
-
static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
struct inode *inode, struct posix_acl *acl,
int type, bool init_acl)
@@ -633,7 +628,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
flags = 0;
}
- err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0, NULL);
if (err == -ENODATA && !size)
		err = 0; /* Removing nonexistent xattr. */
if (!err) {
@@ -712,20 +707,6 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry)
}
/*
- * ntfs_permission - inode_operations::permission
- */
-int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode,
- int mask)
-{
- if (ntfs_sb(inode->i_sb)->options->noacsrules) {
- /* "No access rules" mode - Allow all changes. */
- return 0;
- }
-
- return generic_permission(idmap, inode, mask);
-}
-
-/*
* ntfs_listxattr - inode_operations::listxattr
*/
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -780,7 +761,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
err = sizeof(u32);
*(u32 *)buffer = le32_to_cpu(ni->std_fa);
if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
- *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer);
+ *(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer);
}
goto out;
}
@@ -857,7 +838,7 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler,
if (size != sizeof(u32))
goto out;
if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
- new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value));
+ new_fa = cpu_to_le32(be32_to_cpu(*(__be32 *)value));
else
new_fa = cpu_to_le32(*(u32 *)value);
@@ -937,7 +918,8 @@ set_new_fa:
}
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0,
+ NULL);
out:
inode->i_ctime = current_time(inode);
@@ -951,7 +933,7 @@ out:
*
* save uid/gid/mode in xattr
*/
-int ntfs_save_wsl_perm(struct inode *inode)
+int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size)
{
int err;
__le32 value;
@@ -960,26 +942,26 @@ int ntfs_save_wsl_perm(struct inode *inode)
ni_lock(ni);
value = cpu_to_le32(i_uid_read(inode));
err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
- sizeof(value), 0, true); /* true == already locked. */
+ sizeof(value), 0, true, ea_size);
if (err)
goto out;
value = cpu_to_le32(i_gid_read(inode));
err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
- sizeof(value), 0, true);
+ sizeof(value), 0, true, ea_size);
if (err)
goto out;
value = cpu_to_le32(inode->i_mode);
err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
- sizeof(value), 0, true);
+ sizeof(value), 0, true, ea_size);
if (err)
goto out;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
value = cpu_to_le32(inode->i_rdev);
err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
- sizeof(value), 0, true);
+ sizeof(value), 0, true, ea_size);
if (err)
goto out;
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 811a6ea374bb..b1550ba73f96 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -803,8 +803,8 @@ bail:
* a better backward&forward compatibility, since a small piece of
 * request will be less likely to be broken if the disk layout gets changed.
*/
-static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
- int compat_flag)
+static noinline_for_stack int
+ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag)
{
int i, status = 0;
u64 req_addr;
@@ -840,27 +840,26 @@ bail:
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- int new_clusters;
- int status;
- struct ocfs2_space_resv sr;
- struct ocfs2_new_group_input input;
- struct reflink_arguments args;
- const char __user *old_path;
- const char __user *new_path;
- bool preserve;
- struct ocfs2_info info;
void __user *argp = (void __user *)arg;
+ int status;
switch (cmd) {
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
case OCFS2_IOC_UNRESVSP64:
+ {
+ struct ocfs2_space_resv sr;
+
if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
return -EFAULT;
return ocfs2_change_file_space(filp, cmd, &sr);
+ }
case OCFS2_IOC_GROUP_EXTEND:
+ {
+ int new_clusters;
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -873,8 +872,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
status = ocfs2_group_extend(inode, new_clusters);
mnt_drop_write_file(filp);
return status;
+ }
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ {
+ struct ocfs2_new_group_input input;
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -887,7 +890,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
status = ocfs2_group_add(inode, &input);
mnt_drop_write_file(filp);
return status;
+ }
case OCFS2_IOC_REFLINK:
+ {
+ struct reflink_arguments args;
+ const char __user *old_path;
+ const char __user *new_path;
+ bool preserve;
+
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
old_path = (const char __user *)(unsigned long)args.old_path;
@@ -895,11 +905,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ }
case OCFS2_IOC_INFO:
+ {
+ struct ocfs2_info info;
+
if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
+ }
case FITRIM:
{
struct super_block *sb = inode->i_sb;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aefdf1d3be7c..9014bbcc8031 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac)
struct iov_iter iter;
struct inode *inode = rac->mapping->host;
struct xarray *i_pages;
- struct page *page;
+ struct folio *folio;
loff_t new_start = readahead_pos(rac);
int ret;
size_t new_len = 0;
@@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac)
ret = 0;
/* clean up. */
- while ((page = readahead_page(rac))) {
- page_endio(page, false, ret);
- put_page(page);
+ while ((folio = readahead_folio(rac))) {
+ if (!ret)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
}
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b0315d34c58..d35bbf35a874 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,6 +91,7 @@
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/kthread.h>
+#include <linux/mmu_context.h>
#include <asm/processor.h>
#include "internal.h"
@@ -219,6 +220,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
seq_putc(m, '\n');
+
+ seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
}
void render_sigset_t(struct seq_file *m, const char *header,
@@ -423,6 +426,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -438,6 +446,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_mem(m, mm);
task_core_dumping(m, task);
task_thp_status(m, mm);
+ task_untag_mask(m, mm);
mmput(mm);
}
task_sig(m, task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..05452c3b9872 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
+#include <linux/ksm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -699,7 +700,6 @@ int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return error;
setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
return 0;
}
@@ -3207,6 +3207,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+ seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8379593fa4bb..42ae38ff6e7e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -127,7 +127,6 @@ static int proc_notify_change(struct mnt_idmap *idmap,
return error;
setattr_copy(&nop_mnt_idmap, inode, iattr);
- mark_inode_dirty(inode);
proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- char *buf = file->private_data;
+ loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
size_t tsz;
int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
};
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
min(vmcoreinfo_size, notes_len - i));
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
*fpos += tsz;
}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
if (!m) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
switch (m->type) {
case KCORE_VMALLOC:
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
+ {
+ const char *src = (char *)start;
+ size_t read = 0, left = tsz;
+
+ /*
+ * vmalloc uses spinlocks, so we optimistically try to
+ * read memory. If this fails, fault pages in and try
+ * again until we are done.
+ */
+ while (true) {
+ read += vread_iter(iter, src, left);
+ if (read == tsz)
+ break;
+
+ src += read;
+ left -= read;
+
+ if (fault_in_iov_iter_writeable(iter, left)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
break;
+ }
case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
if (!page || PageOffline(page) ||
is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
+ * We use _copy_to_iter() to bypass usermode hardening
+ * which would otherwise prevent this operation.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+ ret = -EFAULT;
+ goto out;
}
break;
default:
pr_warn_once("Unhandled KCORE type: %d\n", m->type);
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
skip:
buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
- filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
-
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
-static int release_kcore(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
- .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
+#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
#include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
+#ifdef CONFIG_MEMTEST
+ if (early_memtest_done) {
+ unsigned long early_memtest_bad_size_kb;
+
+ early_memtest_bad_size_kb = early_memtest_bad_size>>10;
+ if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+ early_memtest_bad_size_kb = 1;
+ /* When 0 is reported, it means there actually was a successful test */
+ seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
+ }
+#endif
+
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5851eb5bc726..8038833ff5b0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -841,7 +841,6 @@ static int proc_sys_setattr(struct mnt_idmap *idmap,
return error;
setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
return 0;
}
@@ -1283,11 +1282,43 @@ out:
return err;
}
+/* Find the directory for the ctl_table. If one is not found create it. */
+static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
+{
+ const char *name, *nextname;
+
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ /*
+		 * namelen ensures that if name is "foo/bar/yay" only "foo" is
+		 * registered first. We traverse as if using mkdir -p and return
+		 * a ctl_dir for the last directory entry (see the standalone
+		 * sketch after this function).
+ */
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ break;
+ }
+ return dir;
+}
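
A standalone, userspace-style sketch of the component split used by sysctl_mkdir_p() above (hypothetical, not part of this patch; the kernel code hands each component to get_subdir() instead of printing it):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "dev/foo/bar", *name, *nextname;

	for (name = path; name; name = nextname) {
		int namelen;

		nextname = strchr(name, '/');
		if (nextname) {
			namelen = nextname - name;
			nextname++;
		} else {
			namelen = strlen(name);
		}
		if (namelen == 0)
			continue;
		/* sysctl_mkdir_p() calls get_subdir(dir, name, namelen) here. */
		printf("component: %.*s\n", namelen, name);
	}
	return 0;	/* prints: dev, foo, bar - one directory per iteration */
}
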
+
/**
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
+ * @table: the top-level table structure without any child. This table
+ *	must not be freed after registration, so it must not be allocated
+ *	on the stack. It can either be global or dynamically allocated by
+ *	the caller and freed after sysctl unregistration.
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
@@ -1308,9 +1339,12 @@ out:
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
+ * XXX: we should eventually modify these to use long min / max [0]
+ * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
+ * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
+ * sysctl_check_table() verifies this.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
@@ -1331,7 +1365,6 @@ struct ctl_table_header *__register_sysctl_table(
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
- const char *name, *nextname;
struct ctl_dir *dir;
struct ctl_table *entry;
struct ctl_node *node;
@@ -1352,28 +1385,13 @@ struct ctl_table_header *__register_sysctl_table(
spin_lock(&sysctl_lock);
dir = &set->dir;
- /* Reference moved down the diretory tree get_subdir */
+ /* Reference moved down the directory tree get_subdir */
dir->header.nreg++;
spin_unlock(&sysctl_lock);
- /* Find the directory for the ctl_table */
- for (name = path; name; name = nextname) {
- int namelen;
- nextname = strchr(name, '/');
- if (nextname) {
- namelen = nextname - name;
- nextname++;
- } else {
- namelen = strlen(name);
- }
- if (namelen == 0)
- continue;
-
- dir = get_subdir(dir, name, namelen);
- if (IS_ERR(dir))
- goto fail;
- }
-
+ dir = sysctl_mkdir_p(dir, path);
+ if (IS_ERR(dir))
+ goto fail;
spin_lock(&sysctl_lock);
if (insert_header(dir, header))
goto fail_put_dir_locked;
@@ -1394,8 +1412,15 @@ fail:
/**
* register_sysctl - register a sysctl table
- * @path: The path to the directory the sysctl table is in.
- * @table: the table structure
+ * @path: The path to the directory the sysctl table is in. If the path
+ * doesn't exist we will create it for you.
+ * @table: the table structure. The caller must ensure that @table stays
+ *	valid for the lifetime of the sysctl. It must not be freed until
+ *	unregister_sysctl_table() is called with the header returned by this
+ *	registration. If your code is non-modular then you don't need to call
+ *	unregister_sysctl_table() and can instead use something like
+ *	register_sysctl_init() which does not care about the result of the
+ *	sysctl registration.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
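
A hedged usage sketch tying together the two points made in the updated kernel-doc: the path is created on demand, and the table must outlive the registration, so it cannot live on the stack. All names below are made up for illustration and are not part of this patch.

#include <linux/sysctl.h>
#include <linux/module.h>

static int example_val;

/* Static, not stack-allocated: the table must stay valid while registered. */
static struct ctl_table example_table[] = {
	{
		.procname	= "example_val",
		.data		= &example_val,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* a completely zero-filled entry terminates the table */
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	/* "kernel/example" is created on demand if it does not exist yet. */
	example_header = register_sysctl("kernel/example", example_table);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_sysctl_init);
module_exit(example_sysctl_exit);
MODULE_LICENSE("GPL");
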
@@ -1411,8 +1436,11 @@ EXPORT_SYMBOL(register_sysctl);
/**
* __register_sysctl_init() - register sysctl table to path
- * @path: path name for sysctl base
- * @table: This is the sysctl table that needs to be registered to the path
+ * @path: path name for sysctl base. If that path doesn't exist we will create
+ * it for you.
+ * @table: This is the sysctl table that needs to be registered to the path.
+ *	The caller must ensure that @table stays valid for the lifetime of
+ *	the sysctl.
* @table_name: The name of sysctl table, only used for log printing when
* registration fails
*
@@ -1424,10 +1452,7 @@ EXPORT_SYMBOL(register_sysctl);
* register_sysctl() failing on init are extremely low, and so for both reasons
* this function does not return any error as it is used by initialization code.
*
- * Context: Can only be called after your respective sysctl base path has been
- * registered. So for instance, most base directories are registered early on
- * init before init levels are processed through proc_sys_init() and
- * sysctl_init_bases().
+ * Context: if your base directory does not exist it will be created for you.
*/
void __init __register_sysctl_init(const char *path, struct ctl_table *table,
const char *table_name)
@@ -1550,24 +1575,18 @@ out:
}
/**
- * __register_sysctl_paths - register a sysctl table hierarchy
- * @set: Sysctl tree to register on
- * @path: The path to the directory the sysctl table is in.
+ * register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_table for more details.
+ * We are slowly deprecating this call so avoid its use.
*/
-struct ctl_table_header *__register_sysctl_paths(
- struct ctl_table_set *set,
- const struct ctl_path *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
{
struct ctl_table *ctl_table_arg = table;
int nr_subheaders = count_subheaders(table);
struct ctl_table_header *header = NULL, **subheaders, **subheader;
- const struct ctl_path *component;
char *new_path, *pos;
pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -1575,11 +1594,6 @@ struct ctl_table_header *__register_sysctl_paths(
return NULL;
pos[0] = '\0';
- for (component = path; component->procname; component++) {
- pos = append_path(new_path, pos, component->procname);
- if (!pos)
- goto out;
- }
while (table->procname && table->child && !table[1].procname) {
pos = append_path(new_path, pos, table->procname);
if (!pos)
@@ -1587,7 +1601,7 @@ struct ctl_table_header *__register_sysctl_paths(
table = table->child;
}
if (nr_subheaders == 1) {
- header = __register_sysctl_table(set, new_path, table);
+ header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table);
if (header)
header->ctl_table_arg = ctl_table_arg;
} else {
@@ -1601,7 +1615,7 @@ struct ctl_table_header *__register_sysctl_paths(
header->ctl_table_arg = ctl_table_arg;
if (register_leaf_sysctl_tables(new_path, pos, &subheader,
- set, table))
+ &sysctl_table_root.default_set, table))
goto err_register_leaves;
}
@@ -1620,40 +1634,6 @@ err_register_leaves:
header = NULL;
goto out;
}
-
-/**
- * register_sysctl_paths - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return __register_sysctl_paths(&sysctl_table_root.default_set,
- path, table);
-}
-EXPORT_SYMBOL(register_sysctl_paths);
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
- static const struct ctl_path null_path[] = { {} };
-
- return register_sysctl_paths(null_path, table);
-}
EXPORT_SYMBOL(register_sysctl_table);
int __register_sysctl_base(struct ctl_table *base_table)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4fb8729a68d4..da60956b2915 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,30 +22,6 @@
#define arch_irq_stat() 0
#endif
-#ifdef arch_idle_time
-
-u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 idle;
-
- idle = kcs->cpustat[CPUTIME_IDLE];
- if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
- idle += arch_idle_time(cpu);
- return idle;
-}
-
-static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
-{
- u64 iowait;
-
- iowait = kcs->cpustat[CPUTIME_IOWAIT];
- if (cpu_online(cpu) && nr_iowait_cpu(cpu))
- iowait += arch_idle_time(cpu);
- return iowait;
-}
-
-#else
-
u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -78,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
return iowait;
}
-#endif
-
static void show_irq_gap(struct seq_file *p, unsigned int gap)
{
static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..420510f6a545 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
if (start >= vma->vm_end)
return;
-#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
ops = &smaps_shmem_walk_ops;
}
}
-#endif
+
/* mmap_lock is held in m_start */
if (!start)
walk_page_vma(vma, ops, mss);
@@ -1689,8 +1688,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* watch out for wraparound */
start_vaddr = end_vaddr;
- if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
- start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+ }
/* Ensure the address is inside the task */
if (start_vaddr > mm->task_size)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 12af614f33ce..03f5963914a1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -339,7 +339,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
return acc;
}
- /* Read Elf note segment */
+ /* Read ELF note segment */
if (*fpos < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
@@ -1109,7 +1109,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
ehdr_ptr = (Elf64_Ehdr *)elfptr;
phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1152,7 +1152,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
ehdr_ptr = (Elf32_Ehdr *)elfptr;
phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
@@ -1188,7 +1188,7 @@ static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
loff_t vmcore_off;
struct vmcore *m;
- /* Skip Elf header, program headers and Elf note segment. */
+ /* Skip ELF header, program headers and ELF note segment. */
vmcore_off = elfsz + elfnotes_sz;
list_for_each_entry(m, vc_list, list) {
@@ -1213,7 +1213,7 @@ static int __init parse_crash_elf64_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1269,7 +1269,7 @@ static int __init parse_crash_elf32_headers(void)
addr = elfcorehdr_addr;
- /* Read Elf header */
+ /* Read ELF header */
rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr);
if (rc < 0)
return rc;
@@ -1376,12 +1376,12 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
}
/**
- * vmcoredd_update_program_headers - Update all Elf program headers
+ * vmcoredd_update_program_headers - Update all ELF program headers
* @elfptr: Pointer to elf header
* @elfnotesz: Size of elf notes aligned to page size
* @vmcoreddsz: Size of device dumps to be added to elf note header
*
- * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Determine type of ELF header (Elf64 or Elf32) and update the elf note size.
* Also update the offsets of all the program headers after the elf note header.
*/
static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
@@ -1439,10 +1439,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
/**
* vmcoredd_update_size - Update the total size of the device dumps and update
- * Elf header
+ * ELF header
* @dump_size: Size of the current device dump to be added to total size
*
- * Update the total size of all the device dumps and update the Elf program
+ * Update the total size of all the device dumps and update the ELF program
* headers. Calculate the new offsets for the vmcore list and update the
* total vmcore size.
*/
@@ -1466,7 +1466,7 @@ static void vmcoredd_update_size(size_t dump_size)
* @data: dump info.
*
* Allocate a buffer and invoke the calling driver's dump collect routine.
- * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * Write ELF note at the beginning of the buffer to indicate vmcore device
* dump and add the dump to global list.
*/
int vmcore_add_device_dump(struct vmcoredd_data *data)
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index ab82e5f05346..55f139afa327 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -7,10 +7,9 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <linux/rtmutex.h>
#include "internal.h"
-static DEFINE_RT_MUTEX(pmsg_lock);
+static DEFINE_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
@@ -29,9 +28,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (!access_ok(buf, count))
return -EFAULT;
- rt_mutex_lock(&pmsg_lock);
+ mutex_lock(&pmsg_lock);
ret = psinfo->write_user(&record, buf);
- rt_mutex_unlock(&pmsg_lock);
+ mutex_unlock(&pmsg_lock);
return ret ? ret : count;
}
@@ -64,7 +63,7 @@ void pstore_register_pmsg(void)
goto err;
}
- pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+ pmsg_class = class_create(PMSG_NAME);
if (IS_ERR(pmsg_class)) {
pr_err("device class file already in use\n");
goto err_class;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2f67516bb9bf..9fbb9b5256f7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
- if (unlikely(order >= MAX_ORDER))
+ if (unlikely(order > MAX_ORDER))
return -EFBIG;
ret = inode_newsize_ok(inode, newsize);
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index ace133cf6072..3b43a51e6f7e 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -327,17 +327,18 @@ struct smb2_tree_connect_req {
#define SMB2_SHAREFLAG_NO_CACHING 0x00000030
#define SHI1005_FLAGS_DFS 0x00000001
#define SHI1005_FLAGS_DFS_ROOT 0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
+#define SMB2_SHAREFLAG_RESTRICT_EXCLUSIVE_OPENS 0x00000100
+#define SMB2_SHAREFLAG_FORCE_SHARED_DELETE 0x00000200
+#define SMB2_SHAREFLAG_ALLOW_NAMESPACE_CACHING 0x00000400
+#define SMB2_SHAREFLAG_ACCESS_BASED_DIRECTORY_ENUM 0x00000800
+#define SMB2_SHAREFLAG_FORCE_LEVELII_OPLOCK 0x00001000
+#define SMB2_SHAREFLAG_ENABLE_HASH_V1 0x00002000
+#define SMB2_SHAREFLAG_ENABLE_HASH_V2 0x00004000
#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL 0x0014FF33
+#define SMB2_SHAREFLAG_ISOLATED_TRANSPORT 0x00200000
+#define SHI1005_FLAGS_ALL 0x0034FF33
/* Possible share capabilities */
#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
@@ -1171,6 +1172,34 @@ struct create_posix {
__u32 Reserved;
} __packed;
+/* See MS-SMB2 2.2.13.2.3 and MS-SMB2 2.2.13.2.4 */
+struct create_durable {
+ struct create_context ccontext;
+ __u8 Name[8];
+ union {
+ __u8 Reserved[16];
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ } Data;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.5 */
+struct create_mxac_req {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le64 Timestamp;
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.5 */
+struct create_mxac_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le32 QueryStatus;
+ __le32 MaximalAccess;
+} __packed;
+
#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00)
#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01)
#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02)
@@ -1180,6 +1209,7 @@ struct create_posix {
#define SMB2_LEASE_KEY_SIZE 16
+/* See MS-SMB2 2.2.13.2.8 */
struct lease_context {
__u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
__le32 LeaseState;
@@ -1187,6 +1217,7 @@ struct lease_context {
__le64 LeaseDuration;
} __packed;
+/* See MS-SMB2 2.2.13.2.10 */
struct lease_context_v2 {
__u8 LeaseKey[SMB2_LEASE_KEY_SIZE];
__le32 LeaseState;
@@ -1210,6 +1241,15 @@ struct create_lease_v2 {
__u8 Pad[4];
} __packed;
+/* See MS-SMB2 2.2.14.2.9 */
+struct create_disk_id_rsp {
+ struct create_context ccontext;
+ __u8 Name[8];
+ __le64 DiskFileId;
+ __le64 VolumeId;
+ __u8 Reserved[16];
+} __packed;
+
/* See MS-SMB2 2.2.31 and 2.2.32 */
struct smb2_ioctl_req {
struct smb2_hdr hdr;
diff --git a/fs/super.c b/fs/super.c
index 04bc62ab7dfe..34afe411cf2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 3a92e6af69b2..75461777c466 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -217,7 +217,6 @@ static void compr_exit(struct ubifs_compressor *compr)
{
if (compr->capi_name)
crypto_free_comp(compr->cc);
- return;
}
/**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 1505539f6fe9..ef0499edc248 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -358,7 +358,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
umode_t mode = S_IFCHR | WHITEOUT_MODE;
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct fscrypt_name nm;
/*
* Create an inode('nlink = 1') for whiteout without updating journal,
@@ -369,10 +368,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
- if (err)
- return ERR_PTR(err);
-
inode = ubifs_new_inode(c, dir, mode, false);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -395,7 +390,6 @@ out_inode:
make_bad_inode(inode);
iput(inode);
out_free:
- fscrypt_free_filename(&nm);
ubifs_err(c, "cannot create whiteout file, error %d", err);
return ERR_PTR(err);
}
@@ -492,6 +486,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
return finish_open_simple(file, 0);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 2469f72eeaab..6b7d95b65f4b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -44,6 +44,33 @@ enum {
NOT_ON_MEDIA = 3,
};
+static void do_insert_old_idx(struct ubifs_info *c,
+ struct ubifs_old_idx *old_idx)
+{
+ struct ubifs_old_idx *o;
+ struct rb_node **p, *parent = NULL;
+
+ p = &c->old_idx.rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_old_idx, rb);
+ if (old_idx->lnum < o->lnum)
+ p = &(*p)->rb_left;
+ else if (old_idx->lnum > o->lnum)
+ p = &(*p)->rb_right;
+ else if (old_idx->offs < o->offs)
+ p = &(*p)->rb_left;
+ else if (old_idx->offs > o->offs)
+ p = &(*p)->rb_right;
+ else {
+ ubifs_err(c, "old idx added twice!");
+ kfree(old_idx);
+ return;
+ }
+ }
+ rb_link_node(&old_idx->rb, parent, p);
+ rb_insert_color(&old_idx->rb, &c->old_idx);
+}
+
/**
* insert_old_idx - record an index node obsoleted since the last commit start.
* @c: UBIFS file-system description object
@@ -69,35 +96,15 @@ enum {
*/
static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
{
- struct ubifs_old_idx *old_idx, *o;
- struct rb_node **p, *parent = NULL;
+ struct ubifs_old_idx *old_idx;
old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
if (unlikely(!old_idx))
return -ENOMEM;
old_idx->lnum = lnum;
old_idx->offs = offs;
+ do_insert_old_idx(c, old_idx);
- p = &c->old_idx.rb_node;
- while (*p) {
- parent = *p;
- o = rb_entry(parent, struct ubifs_old_idx, rb);
- if (lnum < o->lnum)
- p = &(*p)->rb_left;
- else if (lnum > o->lnum)
- p = &(*p)->rb_right;
- else if (offs < o->offs)
- p = &(*p)->rb_left;
- else if (offs > o->offs)
- p = &(*p)->rb_right;
- else {
- ubifs_err(c, "old idx added twice!");
- kfree(old_idx);
- return 0;
- }
- }
- rb_link_node(&old_idx->rb, parent, p);
- rb_insert_color(&old_idx->rb, &c->old_idx);
return 0;
}
@@ -199,23 +206,6 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
- ubifs_assert(c, !ubifs_zn_obsolete(znode));
- __set_bit(OBSOLETE_ZNODE, &znode->flags);
-
- if (znode->level != 0) {
- int i;
- const int n = zn->child_cnt;
-
- /* The children now have new parent */
- for (i = 0; i < n; i++) {
- struct ubifs_zbranch *zbr = &zn->zbranch[i];
-
- if (zbr->znode)
- zbr->znode->parent = zn;
- }
- }
-
- atomic_long_inc(&c->dirty_zn_cnt);
return zn;
}
@@ -234,6 +224,42 @@ static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
}
/**
+ * replace_znode - replace old znode with new znode.
+ * @c: UBIFS file-system description object
+ * @new_zn: new znode
+ * @old_zn: old znode
+ * @zbr: the branch of parent znode
+ *
+ * Replace old znode with new znode in TNC.
+ */
+static void replace_znode(struct ubifs_info *c, struct ubifs_znode *new_zn,
+ struct ubifs_znode *old_zn, struct ubifs_zbranch *zbr)
+{
+ ubifs_assert(c, !ubifs_zn_obsolete(old_zn));
+ __set_bit(OBSOLETE_ZNODE, &old_zn->flags);
+
+ if (old_zn->level != 0) {
+ int i;
+ const int n = new_zn->child_cnt;
+
+ /* The children now have new parent */
+ for (i = 0; i < n; i++) {
+ struct ubifs_zbranch *child = &new_zn->zbranch[i];
+
+ if (child->znode)
+ child->znode->parent = new_zn;
+ }
+ }
+
+ zbr->znode = new_zn;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+
+ atomic_long_inc(&c->dirty_zn_cnt);
+}
+
+/**
* dirty_cow_znode - ensure a znode is not being committed.
* @c: UBIFS file-system description object
* @zbr: branch of znode to check
@@ -265,28 +291,32 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
return zn;
if (zbr->len) {
- err = insert_old_idx(c, zbr->lnum, zbr->offs);
- if (unlikely(err))
- /*
- * Obsolete znodes will be freed by tnc_destroy_cnext()
- * or free_obsolete_znodes(), copied up znodes should
- * be added back to tnc and freed by
- * ubifs_destroy_tnc_subtree().
- */
+ struct ubifs_old_idx *old_idx;
+
+ old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+ if (unlikely(!old_idx)) {
+ err = -ENOMEM;
goto out;
+ }
+ old_idx->lnum = zbr->lnum;
+ old_idx->offs = zbr->offs;
+
err = add_idx_dirt(c, zbr->lnum, zbr->len);
- } else
- err = 0;
+ if (err) {
+ kfree(old_idx);
+ goto out;
+ }
-out:
- zbr->znode = zn;
- zbr->lnum = 0;
- zbr->offs = 0;
- zbr->len = 0;
+ do_insert_old_idx(c, old_idx);
+ }
+
+ replace_znode(c, zn, znode, zbr);
- if (unlikely(err))
- return ERR_PTR(err);
return zn;
+
+out:
+ kfree(zn);
+ return ERR_PTR(err);
}
/**
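The refactored do_insert_old_idx() above keeps obsolete index-node locations ordered first by LEB number (lnum) and then by offset (offs). For readers unfamiliar with that cascaded-comparison idiom, here is a minimal standalone sketch of the same lexicographic ordering; the struct and function names are illustrative only and are not part of UBIFS.

    #include <stdio.h>

    /* Illustrative stand-in for the (lnum, offs) key used by the old-index tree. */
    struct old_idx_key {
    	int lnum;	/* LEB number */
    	int offs;	/* offset within the LEB */
    };

    /*
     * Order keys the way do_insert_old_idx() walks the rbtree: first by lnum,
     * then by offs.  Returns <0, 0, >0 like memcmp().
     */
    static int old_idx_cmp(const struct old_idx_key *a, const struct old_idx_key *b)
    {
    	if (a->lnum != b->lnum)
    		return a->lnum < b->lnum ? -1 : 1;
    	if (a->offs != b->offs)
    		return a->offs < b->offs ? -1 : 1;
    	return 0;	/* duplicate key: the kernel logs an error and frees the node */
    }

    int main(void)
    {
    	struct old_idx_key a = { .lnum = 3, .offs = 128 };
    	struct old_idx_key b = { .lnum = 3, .offs = 256 };

    	printf("cmp = %d\n", old_idx_cmp(&a, &b));	/* prints -1 */
    	return 0;
    }

A comparison result of zero corresponds to the "old idx added twice!" error path above, where the duplicate node is freed instead of being linked into the tree.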
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 67aaadc3ab07..8395066341a4 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -214,4 +214,3 @@ void utf8_unload(struct unicode_map *um)
}
EXPORT_SYMBOL(utf8_unload);
-MODULE_LICENSE("GPL v2");
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 40f9e1a2ebdd..0fd96d6e39ce 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -32,7 +32,22 @@
#include <linux/swapops.h>
#include <linux/miscdevice.h>
-int sysctl_unprivileged_userfaultfd __read_mostly;
+static int sysctl_unprivileged_userfaultfd __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vm_userfaultfd_table[] = {
+ {
+ .procname = "unprivileged_userfaultfd",
+ .data = &sysctl_unprivileged_userfaultfd,
+ .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+#endif
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
@@ -108,6 +123,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ if (!ctx)
+ return false;
+
+ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t flags)
{
@@ -1629,7 +1659,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
- uffd_wp_range(mm, vma, start, vma_end - start, false);
+ uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
@@ -1714,6 +1744,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_copy = (struct uffdio_copy __user *) arg;
@@ -1740,10 +1771,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, &ctx->mmap_changing,
+ flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1793,9 +1826,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1875,6 +1908,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1899,13 +1933,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
uffdio_continue.range.start) {
goto out;
}
- if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len,
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1973,6 +2010,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
@@ -2180,6 +2218,9 @@ static int __init userfaultfd_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
init_once_userfaultfd_ctx);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", vm_userfaultfd_table);
+#endif
return 0;
}
__initcall(userfaultfd_init);
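The new UFFD_FEATURE_WP_UNPOPULATED bit is negotiated like any other userfaultfd feature, through the UFFDIO_API handshake. The snippet below is a hedged userspace sketch of that negotiation (error handling trimmed); it assumes only the standard userfaultfd(2) interface plus kernel headers that already define the new feature flag added by this series.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    int main(void)
    {
    	/* O_CLOEXEC/O_NONBLOCK are the usual flags for the userfaultfd fd. */
    	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    	if (uffd < 0) {
    		perror("userfaultfd");
    		return 1;
    	}

    	/*
    	 * Ask for write-protect faults on unpopulated anonymous memory.  As the
    	 * hunk above shows, the kernel clears this bit in the returned features
    	 * when CONFIG_PTE_MARKER_UFFD_WP is not set.
    	 */
    	struct uffdio_api api = {
    		.api = UFFD_API,
    		.features = UFFD_FEATURE_WP_UNPOPULATED,
    	};
    	if (ioctl(uffd, UFFDIO_API, &api) < 0) {
    		perror("UFFDIO_API");
    		return 1;
    	}

    	printf("WP_UNPOPULATED %s\n",
    	       (api.features & UFFD_FEATURE_WP_UNPOPULATED) ? "granted" : "unavailable");
    	close(uffd);
    	return 0;
    }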
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 9fac5ea8d0e4..52e1823241fb 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -47,6 +47,33 @@ config XFS_SUPPORT_V4
To continue supporting the old V4 format (crc=0), say Y.
To close off an attack surface, say N.
+config XFS_SUPPORT_ASCII_CI
+ bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
+ depends on XFS_FS
+ default y
+ help
+ The ASCII case insensitivity filesystem feature only works correctly
+ on systems that have been coerced into using ISO 8859-1, and it does
+ not work on extended attributes. The kernel has no visibility into
+ the locale settings in userspace, so it corrupts UTF-8 names.
+ Enabling this feature makes XFS vulnerable to mixed case sensitivity
+ attacks. Because of this, the feature is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect such a filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "ascii-ci=". If the string "ascii-ci=1" is found, the
+ filesystem is a case-insensitive filesystem. If no such string is
+ found, please upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ feature will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting case-insensitivity (ascii-ci=1), say Y.
+ To close off an attack surface, say N.
+
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
@@ -93,10 +120,15 @@ config XFS_RT
If unsure, say N.
+config XFS_DRAIN_INTENTS
+ bool
+ select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
+ select XFS_DRAIN_INTENTS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 92d88dc3c9f7..16e4eb431230 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y)
xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
endif
+xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
+
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -146,6 +148,7 @@ xfs-y += $(addprefix scrub/, \
agheader.o \
alloc.o \
attr.o \
+ bitmap.o \
bmap.o \
btree.o \
common.o \
@@ -156,6 +159,7 @@ xfs-y += $(addprefix scrub/, \
ialloc.o \
inode.o \
parent.o \
+ readdir.o \
refcount.o \
rmap.o \
scrub.o \
@@ -169,7 +173,6 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
- bitmap.o \
repair.o \
)
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 86696a1c6891..1b078bbbf225 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -81,6 +81,19 @@ xfs_perag_get_tag(
return pag;
}
+/* Get a passive reference to the given perag. */
+struct xfs_perag *
+xfs_perag_hold(
+ struct xfs_perag *pag)
+{
+ ASSERT(atomic_read(&pag->pag_ref) > 0 ||
+ atomic_read(&pag->pag_active_ref) > 0);
+
+ trace_xfs_perag_hold(pag, _RET_IP_);
+ atomic_inc(&pag->pag_ref);
+ return pag;
+}
+
void
xfs_perag_put(
struct xfs_perag *pag)
@@ -247,6 +260,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+ xfs_defer_drain_free(&pag->pag_intents_drain);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_buf_hash_destroy(pag);
@@ -372,6 +386,7 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_state_lock);
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ xfs_defer_drain_init(&pag->pag_intents_drain);
init_waitqueue_head(&pag->pagb_wait);
init_waitqueue_head(&pag->pag_active_wq);
pag->pagb_count = 0;
@@ -408,6 +423,7 @@ xfs_initialize_perag(
return 0;
out_remove_pag:
+ xfs_defer_drain_free(&pag->pag_intents_drain);
radix_tree_delete(&mp->m_perag_tree, index);
out_free_pag:
kmem_free(pag);
@@ -418,6 +434,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_defer_drain_free(&pag->pag_intents_drain);
kmem_free(pag);
}
return error;
@@ -1043,10 +1060,8 @@ xfs_ag_extend_space(
if (error)
return error;
- error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
- be32_to_cpu(agf->agf_length) - len),
- len, &XFS_RMAP_OINFO_SKIP_UPDATE,
- XFS_AG_RESV_NONE);
+ error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len,
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5e18536dfdce..2e0aef87d633 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -101,6 +101,14 @@ struct xfs_perag {
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
+ /*
+ * We use xfs_drain to track the number of deferred log intent items
+ * that have been queued (but not yet processed) so that waiters (e.g.
+ * scrub) will not lock resources when other threads are in the middle
+ * of processing a chain of intent items only to find momentary
+ * inconsistencies.
+ */
+ struct xfs_defer_drain pag_intents_drain;
#endif /* __KERNEL__ */
};
@@ -134,6 +142,7 @@ void xfs_free_perag(struct xfs_mount *mp);
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
unsigned int tag);
+struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
void xfs_perag_put(struct xfs_perag *pag);
/* Active AG references */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 203f16c48c19..fdfa08cbf4db 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -233,6 +233,52 @@ xfs_alloc_update(
return xfs_btree_update(cur, &rec);
}
+/* Convert the ondisk btree record to its incore representation. */
+void
+xfs_alloc_btrec_to_irec(
+ const union xfs_btree_rec *rec,
+ struct xfs_alloc_rec_incore *irec)
+{
+ irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+}
+
+/* Simple checks for free space records. */
+xfs_failaddr_t
+xfs_alloc_check_irec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *irec)
+{
+ struct xfs_perag *pag = cur->bc_ag.pag;
+
+ if (irec->ar_blockcount == 0)
+ return __this_address;
+
+ /* check for valid extent range, including overflow */
+ if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
+ return __this_address;
+
+ return NULL;
+}
+
+static inline int
+xfs_alloc_complain_bad_rec(
+ struct xfs_btree_cur *cur,
+ xfs_failaddr_t fa,
+ const struct xfs_alloc_rec_incore *irec)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+
+ xfs_warn(mp,
+ "%s Freespace BTree record corruption in AG %d detected at %pS!",
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ cur->bc_ag.pag->pag_agno, fa);
+ xfs_warn(mp,
+ "start block 0x%x block count 0x%x", irec->ar_startblock,
+ irec->ar_blockcount);
+ return -EFSCORRUPTED;
+}
+
/*
* Get the data from the pointed-to record.
*/
@@ -243,35 +289,23 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat) /* output: success/failure */
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_alloc_rec_incore irec;
union xfs_btree_rec *rec;
+ xfs_failaddr_t fa;
int error;
error = xfs_btree_get_rec(cur, &rec, stat);
if (error || !(*stat))
return error;
- *bno = be32_to_cpu(rec->alloc.ar_startblock);
- *len = be32_to_cpu(rec->alloc.ar_blockcount);
-
- if (*len == 0)
- goto out_bad_rec;
-
- /* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(pag, *bno, *len))
- goto out_bad_rec;
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ fa = xfs_alloc_check_irec(cur, &irec);
+ if (fa)
+ return xfs_alloc_complain_bad_rec(cur, fa, &irec);
+ *bno = irec.ar_startblock;
+ *len = irec.ar_blockcount;
return 0;
-
-out_bad_rec:
- xfs_warn(mp,
- "%s Freespace BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
- pag->pag_agno);
- xfs_warn(mp,
- "start block 0x%x block count 0x%x", *bno, *len);
- return -EFSCORRUPTED;
}
/*
@@ -2405,6 +2439,7 @@ xfs_defer_agfl_block(
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+ xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
}
@@ -2421,8 +2456,8 @@ __xfs_free_extent_later(
bool skip_discard)
{
struct xfs_extent_free_item *xefi;
-#ifdef DEBUG
struct xfs_mount *mp = tp->t_mountp;
+#ifdef DEBUG
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -2456,9 +2491,11 @@ __xfs_free_extent_later(
} else {
xefi->xefi_owner = XFS_RMAP_OWN_NULL;
}
- trace_xfs_bmap_free_defer(tp->t_mountp,
+ trace_xfs_bmap_free_defer(mp,
XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
+
+ xfs_extent_free_get_group(mp, xefi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
}
@@ -3596,7 +3633,8 @@ xfs_free_extent_fix_freelist(
int
__xfs_free_extent(
struct xfs_trans *tp,
- xfs_fsblock_t bno,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
@@ -3604,12 +3642,9 @@ __xfs_free_extent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
- struct xfs_perag *pag;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3618,10 +3653,9 @@ __xfs_free_extent(
XFS_ERRTAG_FREE_EXTENT))
return -EIO;
- pag = xfs_perag_get(mp, agno);
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error)
- goto err;
+ return error;
agf = agbp->b_addr;
if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
@@ -3635,20 +3669,18 @@ __xfs_free_extent(
goto err_release;
}
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
+ error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
+ type);
if (error)
goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
- xfs_perag_put(pag);
return 0;
err_release:
xfs_trans_brelse(tp, agbp);
-err:
- xfs_perag_put(pag);
return error;
}
@@ -3666,9 +3698,13 @@ xfs_alloc_query_range_helper(
{
struct xfs_alloc_query_range_info *query = priv;
struct xfs_alloc_rec_incore irec;
+ xfs_failaddr_t fa;
+
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ fa = xfs_alloc_check_irec(cur, &irec);
+ if (fa)
+ return xfs_alloc_complain_bad_rec(cur, fa, &irec);
- irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
- irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
return query->fn(cur, &irec, query->priv);
}
@@ -3709,13 +3745,16 @@ xfs_alloc_query_all(
return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the free space and tell us if the area has no
+ * records, is fully mapped by records, or is partially filled.
+ */
int
-xfs_alloc_has_record(
+xfs_alloc_has_records(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- bool *exists)
+ enum xbtree_recpacking *outcome)
{
union xfs_btree_irec low;
union xfs_btree_irec high;
@@ -3725,7 +3764,7 @@ xfs_alloc_has_record(
memset(&high, 0xFF, sizeof(high));
high.a.ar_startblock = bno + len - 1;
- return xfs_btree_has_record(cur, &low, &high, exists);
+ return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
/*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2b246d74c189..5dbb25546d0b 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -141,7 +141,8 @@ int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args,
int /* error */
__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len, /* length of extent */
const struct xfs_owner_info *oinfo, /* extent owner */
enum xfs_ag_resv_type type, /* block reservation type */
@@ -150,12 +151,13 @@ __xfs_free_extent(
static inline int
xfs_free_extent(
struct xfs_trans *tp,
- xfs_fsblock_t bno,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+ return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false);
}
int /* error */
@@ -179,6 +181,12 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
+union xfs_btree_rec;
+void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
+ struct xfs_alloc_rec_incore *irec);
+xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
+ const struct xfs_alloc_rec_incore *irec);
+
int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
struct xfs_buf **agfbpp);
int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
@@ -205,8 +213,8 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur,
int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
void *priv);
-int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, bool *exist);
+int xfs_alloc_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, enum xbtree_recpacking *outcome);
typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
void *priv);
@@ -235,9 +243,13 @@ struct xfs_extent_free_item {
uint64_t xefi_owner;
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
+ struct xfs_perag *xefi_pag;
unsigned int xefi_flags;
};
+void xfs_extent_free_get_group(struct xfs_mount *mp,
+ struct xfs_extent_free_item *xefi);
+
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 0f29c7b1b39f..c65228efed4a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -260,20 +260,27 @@ STATIC int64_t
xfs_bnobt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
+ ASSERT(!mask || mask->alloc.ar_startblock);
+
return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ be32_to_cpu(k2->alloc.ar_startblock);
}
STATIC int64_t
xfs_cntbt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
int64_t diff;
+ ASSERT(!mask || (mask->alloc.ar_blockcount &&
+ mask->alloc.ar_startblock));
+
diff = be32_to_cpu(k1->alloc.ar_blockcount) -
be32_to_cpu(k2->alloc.ar_blockcount);
if (diff)
@@ -423,6 +430,19 @@ xfs_cntbt_recs_inorder(
be32_to_cpu(r2->alloc.ar_startblock));
}
+STATIC enum xbtree_key_contig
+xfs_allocbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->alloc.ar_startblock);
+
+ return xbtree_key_contig(be32_to_cpu(key1->alloc.ar_startblock),
+ be32_to_cpu(key2->alloc.ar_startblock));
+}
+
static const struct xfs_btree_ops xfs_bnobt_ops = {
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
@@ -443,6 +463,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
+ .keys_contiguous = xfs_allocbt_keys_contiguous,
};
static const struct xfs_btree_ops xfs_cntbt_ops = {
@@ -465,6 +486,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
+ .keys_contiguous = NULL, /* not needed right now */
};
/* Allocate most of a new allocation btree cursor. */
@@ -492,9 +514,7 @@ xfs_allocbt_init_common(
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
}
- /* take a reference for the cursor */
- atomic_inc(&pag->pag_ref);
- cur->bc_ag.pag = pag;
+ cur->bc_ag.pag = xfs_perag_hold(pag);
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 34de6e6898c4..b512de0540d5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1083,6 +1083,34 @@ struct xfs_iread_state {
xfs_extnum_t loaded;
};
+int
+xfs_bmap_complain_bad_rec(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_failaddr_t fa,
+ const struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ const char *forkname;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK: forkname = "data"; break;
+ case XFS_ATTR_FORK: forkname = "attr"; break;
+ case XFS_COW_FORK: forkname = "CoW"; break;
+ default: forkname = "???"; break;
+ }
+
+ xfs_warn(mp,
+ "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!",
+ ip->i_ino, forkname, fa);
+ xfs_warn(mp,
+ "Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x",
+ irec->br_startoff, irec->br_startblock, irec->br_blockcount,
+ irec->br_state);
+
+ return -EFSCORRUPTED;
+}
+
/* Stuff every bmbt record from this block into the incore extent map. */
static int
xfs_iread_bmbt_block(
@@ -1125,7 +1153,8 @@ xfs_iread_bmbt_block(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iread_extents(2)", frp,
sizeof(*frp), fa);
- return -EFSCORRUPTED;
+ return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
+ &new);
}
xfs_iext_insert(ip, &ir->icur, &new,
xfs_bmap_fork_to_state(whichfork));
@@ -1171,6 +1200,12 @@ xfs_iread_extents(
goto out;
}
ASSERT(ir.loaded == xfs_iext_count(ifp));
+ /*
+ * Use release semantics so that we can use acquire semantics in
+ * xfs_need_iread_extents and be guaranteed to see a valid mapping tree
+ * after that load.
+ */
+ smp_store_release(&ifp->if_needextents, 0);
return 0;
out:
xfs_iext_destroy(ifp);
@@ -3505,7 +3540,6 @@ xfs_bmap_btalloc_at_eof(
* original non-aligned state so the caller can proceed on allocation
* failure as if this function was never called.
*/
- args->fsbno = ap->blkno;
args->alignment = 1;
return 0;
}
@@ -6075,6 +6109,7 @@ __xfs_bmap_add(
bi->bi_whichfork = whichfork;
bi->bi_bmap = *bmap;
+ xfs_bmap_update_get_group(tp->t_mountp, bi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index dd08361ca5a6..e33470e39728 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{ BMAP_COWFORK, "COW" }
/* Return true if the extent is an allocated extent, written or not. */
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec)
{
return irec->br_startblock != HOLESTARTBLOCK &&
irec->br_startblock != DELAYSTARTBLOCK &&
@@ -238,9 +238,13 @@ struct xfs_bmap_intent {
enum xfs_bmap_intent_type bi_type;
int bi_whichfork;
struct xfs_inode *bi_owner;
+ struct xfs_perag *bi_pag;
struct xfs_bmbt_irec bi_bmap;
};
+void xfs_bmap_update_get_group(struct xfs_mount *mp,
+ struct xfs_bmap_intent *bi);
+
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
@@ -261,6 +265,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
+int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
+ xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec);
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index b8ad95050c9b..1b40e5f8b1ec 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -382,11 +382,14 @@ STATIC int64_t
xfs_bmbt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+ ASSERT(!mask || mask->bmbt.br_startoff);
+
/*
* Note: This routine previously casted a and b to int64 and subtracted
* them to generate a result. This lead to problems if b was the
@@ -500,6 +503,19 @@ xfs_bmbt_recs_inorder(
xfs_bmbt_disk_get_startoff(&r2->bmbt);
}
+STATIC enum xbtree_key_contig
+xfs_bmbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->bmbt.br_startoff);
+
+ return xbtree_key_contig(be64_to_cpu(key1->bmbt.br_startoff),
+ be64_to_cpu(key2->bmbt.br_startoff));
+}
+
static const struct xfs_btree_ops xfs_bmbt_ops = {
.rec_len = sizeof(xfs_bmbt_rec_t),
.key_len = sizeof(xfs_bmbt_key_t),
@@ -520,6 +536,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.buf_ops = &xfs_bmbt_buf_ops,
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
+ .keys_contiguous = xfs_bmbt_keys_contiguous,
};
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c4649cc624e1..6a6503ab0cd7 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2067,8 +2067,7 @@ xfs_btree_get_leaf_keys(
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
rec = xfs_btree_rec_addr(cur, n, block);
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
- if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
- > 0)
+ if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
max_hkey = hkey;
}
@@ -2096,7 +2095,7 @@ xfs_btree_get_node_keys(
max_hkey = xfs_btree_high_key_addr(cur, 1, block);
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
hkey = xfs_btree_high_key_addr(cur, n, block);
- if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
+ if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
max_hkey = hkey;
}
@@ -2183,8 +2182,8 @@ __xfs_btree_updkeys(
nlkey = xfs_btree_key_addr(cur, ptr, block);
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
if (!force_all &&
- !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
- cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
+ xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
+ xfs_btree_keycmp_eq(cur, nhkey, hkey))
break;
xfs_btree_copy_keys(cur, nlkey, lkey, 1);
xfs_btree_log_keys(cur, bp, ptr, ptr);
@@ -4716,7 +4715,6 @@ xfs_btree_simple_query_range(
{
union xfs_btree_rec *recp;
union xfs_btree_key rec_key;
- int64_t diff;
int stat;
bool firstrec = true;
int error;
@@ -4746,20 +4744,17 @@ xfs_btree_simple_query_range(
if (error || !stat)
break;
- /* Skip if high_key(rec) < low_key. */
+ /* Skip if low_key > high_key(rec). */
if (firstrec) {
cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
firstrec = false;
- diff = cur->bc_ops->diff_two_keys(cur, low_key,
- &rec_key);
- if (diff > 0)
+ if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
goto advloop;
}
- /* Stop if high_key < low_key(rec). */
+ /* Stop if low_key(rec) > high_key. */
cur->bc_ops->init_key_from_rec(&rec_key, recp);
- diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
- if (diff > 0)
+ if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
break;
/* Callback */
@@ -4813,8 +4808,6 @@ xfs_btree_overlapped_query_range(
union xfs_btree_key *hkp;
union xfs_btree_rec *recp;
struct xfs_btree_block *block;
- int64_t ldiff;
- int64_t hdiff;
int level;
struct xfs_buf *bp;
int i;
@@ -4854,25 +4847,23 @@ pop_up:
block);
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
- ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
- low_key);
-
cur->bc_ops->init_key_from_rec(&rec_key, recp);
- hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
- &rec_key);
/*
+ * If (query's high key < record's low key), then there
+ * are no more interesting records in this block. Pop
+ * up to the leaf level to find more record blocks.
+ *
* If (record's high key >= query's low key) and
* (query's high key >= record's low key), then
* this record overlaps the query range; callback.
*/
- if (ldiff >= 0 && hdiff >= 0) {
+ if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
+ goto pop_up;
+ if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
error = fn(cur, recp, priv);
if (error)
break;
- } else if (hdiff < 0) {
- /* Record is larger than high key; pop. */
- goto pop_up;
}
cur->bc_levels[level].ptr++;
continue;
@@ -4884,15 +4875,18 @@ pop_up:
block);
pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
- ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
- hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
-
/*
+ * If (query's high key < pointer's low key), then there are no
+ * more interesting keys in this block. Pop up one leaf level
+ * to continue looking for records.
+ *
* If (pointer's high key >= query's low key) and
* (query's high key >= pointer's low key), then
* this record overlaps the query range; follow pointer.
*/
- if (ldiff >= 0 && hdiff >= 0) {
+ if (xfs_btree_keycmp_lt(cur, high_key, lkp))
+ goto pop_up;
+ if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
level--;
error = xfs_btree_lookup_get_block(cur, level, pp,
&block);
@@ -4907,9 +4901,6 @@ pop_up:
#endif
cur->bc_levels[level].ptr = 1;
continue;
- } else if (hdiff < 0) {
- /* The low key is larger than the upper range; pop. */
- goto pop_up;
}
cur->bc_levels[level].ptr++;
}
@@ -4937,6 +4928,19 @@ out:
return error;
}
+static inline void
+xfs_btree_key_from_irec(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ const union xfs_btree_irec *irec)
+{
+ union xfs_btree_rec rec;
+
+ cur->bc_rec = *irec;
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+ cur->bc_ops->init_key_from_rec(key, &rec);
+}
+
/*
* Query a btree for all records overlapping a given interval of keys. The
* supplied function will be called with each record found; return one of the
@@ -4951,21 +4955,15 @@ xfs_btree_query_range(
xfs_btree_query_range_fn fn,
void *priv)
{
- union xfs_btree_rec rec;
union xfs_btree_key low_key;
union xfs_btree_key high_key;
/* Find the keys of both ends of the interval. */
- cur->bc_rec = *high_rec;
- cur->bc_ops->init_rec_from_cur(cur, &rec);
- cur->bc_ops->init_key_from_rec(&high_key, &rec);
+ xfs_btree_key_from_irec(cur, &high_key, high_rec);
+ xfs_btree_key_from_irec(cur, &low_key, low_rec);
- cur->bc_rec = *low_rec;
- cur->bc_ops->init_rec_from_cur(cur, &rec);
- cur->bc_ops->init_key_from_rec(&low_key, &rec);
-
- /* Enforce low key < high key. */
- if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
+ /* Enforce low key <= high key. */
+ if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
return -EINVAL;
if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
@@ -5027,34 +5025,132 @@ xfs_btree_diff_two_ptrs(
return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}
-/* If there's an extent, we're done. */
+struct xfs_btree_has_records {
+ /* Keys for the start and end of the range we want to know about. */
+ union xfs_btree_key start_key;
+ union xfs_btree_key end_key;
+
+ /* Mask for key comparisons, if desired. */
+ const union xfs_btree_key *key_mask;
+
+ /* Highest record key we've seen so far. */
+ union xfs_btree_key high_key;
+
+ enum xbtree_recpacking outcome;
+};
+
STATIC int
-xfs_btree_has_record_helper(
+xfs_btree_has_records_helper(
struct xfs_btree_cur *cur,
const union xfs_btree_rec *rec,
void *priv)
{
- return -ECANCELED;
+ union xfs_btree_key rec_key;
+ union xfs_btree_key rec_high_key;
+ struct xfs_btree_has_records *info = priv;
+ enum xbtree_key_contig key_contig;
+
+ cur->bc_ops->init_key_from_rec(&rec_key, rec);
+
+ if (info->outcome == XBTREE_RECPACKING_EMPTY) {
+ info->outcome = XBTREE_RECPACKING_SPARSE;
+
+ /*
+ * If the first record we find does not overlap the start key,
+ * then there is a hole at the start of the search range.
+ * Classify this as sparse and stop immediately.
+ */
+ if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
+ info->key_mask))
+ return -ECANCELED;
+ } else {
+ /*
+ * If a subsequent record does not overlap with any record
+ * we've seen so far, there is a hole in the middle of the
+ * search range. Classify this as sparse and stop.
+ * If the keys overlap and this btree does not allow overlap,
+ * signal corruption.
+ */
+ key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
+ &rec_key, info->key_mask);
+ if (key_contig == XBTREE_KEY_OVERLAP &&
+ !(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return -EFSCORRUPTED;
+ if (key_contig == XBTREE_KEY_GAP)
+ return -ECANCELED;
+ }
+
+ /*
+ * If high_key(rec) is larger than any other high key we've seen,
+ * remember it for later.
+ */
+ cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
+ if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
+ info->key_mask))
+ info->high_key = rec_high_key; /* struct copy */
+
+ return 0;
}
-/* Is there a record covering a given range of keys? */
+/*
+ * Scan part of the keyspace of a btree and tell us if that keyspace does not
+ * map to any records; is fully mapped to records; or is partially mapped to
+ * records. This is the btree record equivalent to determining if a file is
+ * sparse.
+ *
+ * For most btree types, the record scan should use all available btree key
+ * fields to compare the keys encountered. These callers should pass NULL for
+ * @mask. However, some callers (e.g. scanning physical space in the rmapbt)
+ * want to ignore some part of the btree record keyspace when performing the
+ * comparison. These callers should pass in a union xfs_btree_key object with
+ * the fields that *should* be a part of the comparison set to any nonzero
+ * value, and the rest zeroed.
+ */
int
-xfs_btree_has_record(
+xfs_btree_has_records(
struct xfs_btree_cur *cur,
const union xfs_btree_irec *low,
const union xfs_btree_irec *high,
- bool *exists)
+ const union xfs_btree_key *mask,
+ enum xbtree_recpacking *outcome)
{
+ struct xfs_btree_has_records info = {
+ .outcome = XBTREE_RECPACKING_EMPTY,
+ .key_mask = mask,
+ };
int error;
- error = xfs_btree_query_range(cur, low, high,
- &xfs_btree_has_record_helper, NULL);
- if (error == -ECANCELED) {
- *exists = true;
- return 0;
+ /* Not all btrees support this operation. */
+ if (!cur->bc_ops->keys_contiguous) {
+ ASSERT(0);
+ return -EOPNOTSUPP;
}
- *exists = false;
- return error;
+
+ xfs_btree_key_from_irec(cur, &info.start_key, low);
+ xfs_btree_key_from_irec(cur, &info.end_key, high);
+
+ error = xfs_btree_query_range(cur, low, high,
+ xfs_btree_has_records_helper, &info);
+ if (error == -ECANCELED)
+ goto out;
+ if (error)
+ return error;
+
+ if (info.outcome == XBTREE_RECPACKING_EMPTY)
+ goto out;
+
+ /*
+ * If the largest high_key(rec) we saw during the walk is greater than or
+ * equal to the end of the search range, classify this as full. Otherwise,
+ * there is a hole at the end of the search range.
+ */
+ if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
+ mask))
+ info.outcome = XBTREE_RECPACKING_FULL;
+
+out:
+ *outcome = info.outcome;
+ return 0;
}
/* Are there more records in this btree? */
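Where the old xfs_btree_has_record() could only answer "is there at least one record in this range?", xfs_btree_has_records() distinguishes empty, sparse, and fully packed keyspaces. The following caller-side sketch (kernel context assumed, not compilable on its own) shows how a checker might use the allocation-btree wrapper introduced in this patch; xchk_ensure_free() is a hypothetical helper name, not part of the series.

    /* Sketch only: assumes an initialized by-block free space btree cursor. */
    static int xchk_ensure_free(struct xfs_btree_cur *cur, xfs_agblock_t bno,
    			    xfs_extlen_t len)
    {
    	enum xbtree_recpacking outcome;
    	int error;

    	error = xfs_alloc_has_records(cur, bno, len, &outcome);
    	if (error)
    		return error;

    	switch (outcome) {
    	case XBTREE_RECPACKING_FULL:
    		/* every block in [bno, bno + len) is covered by free-space records */
    		return 0;
    	case XBTREE_RECPACKING_SPARSE:
    	case XBTREE_RECPACKING_EMPTY:
    		/* some or all of the range is not recorded as free space */
    		return -EFSCORRUPTED;
    	}
    	return -EFSCORRUPTED;
    }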
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 29c4b4ccb909..a2aa36b23e25 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -90,6 +90,27 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+enum xbtree_key_contig {
+ XBTREE_KEY_GAP = 0,
+ XBTREE_KEY_CONTIGUOUS,
+ XBTREE_KEY_OVERLAP,
+};
+
+/*
+ * Decide if these two numeric btree key fields are contiguous, overlapping,
+ * or if there's a gap between them. @x should be the field from the high
+ * key and @y should be the field from the low key.
+ */
+static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
+{
+ x++;
+ if (x < y)
+ return XBTREE_KEY_GAP;
+ if (x == y)
+ return XBTREE_KEY_CONTIGUOUS;
+ return XBTREE_KEY_OVERLAP;
+}
+
struct xfs_btree_ops {
/* size of the key and record structures */
size_t key_len;
@@ -140,11 +161,14 @@ struct xfs_btree_ops {
/*
* Difference between key2 and key1 -- positive if key1 > key2,
- * negative if key1 < key2, and zero if equal.
+ * negative if key1 < key2, and zero if equal. If the @mask parameter
+ * is non NULL, each key field to be used in the comparison must
+ * contain a nonzero value.
*/
int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
const union xfs_btree_key *key1,
- const union xfs_btree_key *key2);
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask);
const struct xfs_buf_ops *buf_ops;
@@ -157,6 +181,22 @@ struct xfs_btree_ops {
int (*recs_inorder)(struct xfs_btree_cur *cur,
const union xfs_btree_rec *r1,
const union xfs_btree_rec *r2);
+
+ /*
+ * Are these two btree keys immediately adjacent?
+ *
+ * Given two btree keys @key1 and @key2, decide if it is impossible for
+ * there to be a third btree key K satisfying the relationship
+ * @key1 < K < @key2. To determine if two btree records are
+ * immediately adjacent, @key1 should be the high key of the first
+ * record and @key2 should be the low key of the second record.
+ * If the @mask parameter is non NULL, each key field to be used in the
+ * comparison must contain a nonzero value.
+ */
+ enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask);
};
/*
@@ -540,12 +580,105 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
-int xfs_btree_has_record(struct xfs_btree_cur *cur,
+typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2);
+
+int xfs_btree_has_records(struct xfs_btree_cur *cur,
const union xfs_btree_irec *low,
- const union xfs_btree_irec *high, bool *exists);
+ const union xfs_btree_irec *high,
+ const union xfs_btree_key *mask,
+ enum xbtree_recpacking *outcome);
+
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
+/* Key comparison helpers */
+static inline bool
+xfs_btree_keycmp_lt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
+}
+
+static inline bool
+xfs_btree_keycmp_gt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
+}
+
+static inline bool
+xfs_btree_keycmp_eq(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
+}
+
+static inline bool
+xfs_btree_keycmp_le(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return !xfs_btree_keycmp_gt(cur, key1, key2);
+}
+
+static inline bool
+xfs_btree_keycmp_ge(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return !xfs_btree_keycmp_lt(cur, key1, key2);
+}
+
+static inline bool
+xfs_btree_keycmp_ne(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2)
+{
+ return !xfs_btree_keycmp_eq(cur, key1, key2);
+}
+
+/* Masked key comparison helpers */
+static inline bool
+xfs_btree_masked_keycmp_lt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
+}
+
+static inline bool
+xfs_btree_masked_keycmp_gt(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
+}
+
+static inline bool
+xfs_btree_masked_keycmp_ge(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask);
+}
+
/* Does this cursor point to the last block in the given level? */
static inline bool
xfs_btree_islastblock(
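xbtree_key_contig() classifies two key values by whether high-key + 1 falls short of, lands exactly on, or passes the next low key. A standalone illustration of the same arithmetic follows; only the enum and helper are copied from the header above, and main() is merely a driver for the example values.

    #include <stdint.h>
    #include <stdio.h>

    enum xbtree_key_contig {
    	XBTREE_KEY_GAP = 0,
    	XBTREE_KEY_CONTIGUOUS,
    	XBTREE_KEY_OVERLAP,
    };

    /* x is the field from the high key, y is the field from the low key. */
    static enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
    {
    	x++;
    	if (x < y)
    		return XBTREE_KEY_GAP;
    	if (x == y)
    		return XBTREE_KEY_CONTIGUOUS;
    	return XBTREE_KEY_OVERLAP;
    }

    int main(void)
    {
    	/* Record A covers blocks [10, 19]; record B starts at 25, 20, or 15. */
    	printf("%d\n", xbtree_key_contig(19, 25));	/* 0: gap between the records */
    	printf("%d\n", xbtree_key_contig(19, 20));	/* 1: immediately adjacent */
    	printf("%d\n", xbtree_key_contig(19, 15));	/* 2: overlapping keyspace */
    	return 0;
    }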
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 5a321b783398..bcfb6a4203cd 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -397,6 +397,7 @@ xfs_defer_cancel_list(
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
@@ -476,6 +477,7 @@ xfs_defer_finish_one(
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
+ trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
int ret;
@@ -623,7 +625,7 @@ xfs_defer_add(
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops;
+ const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -636,7 +638,6 @@ xfs_defer_add(
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- ops = defer_op_types[dfp->dfp_type];
if (dfp->dfp_type != type ||
(ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
@@ -653,6 +654,7 @@ xfs_defer_add(
}
list_add_tail(li, &dfp->dfp_work);
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
dfp->dfp_count++;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 92bac3373f1f..f5462fd582d5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -64,7 +64,7 @@ xfs_ascii_ci_hashname(
int i;
for (i = 0, hash = 0; i < name->len; i++)
- hash = tolower(name->name[i]) ^ rol32(hash, 7);
+ hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7);
return hash;
}
@@ -85,7 +85,8 @@ xfs_ascii_ci_compname(
for (i = 0; i < len; i++) {
if (args->name[i] == name[i])
continue;
- if (tolower(args->name[i]) != tolower(name[i]))
+ if (xfs_ascii_ci_xfrm(args->name[i]) !=
+ xfs_ascii_ci_xfrm(name[i]))
return XFS_CMP_DIFFERENT;
result = XFS_CMP_CASE;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index dd39f17dd9a9..19af22a16c41 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
bool xfs_dir2_namecheck(const void *name, size_t length);
+/*
+ * The "ascii-ci" feature was created to speed up case-insensitive lookups for
+ * a Samba product. Because of the inherent problems with CI and UTF-8
+ * encoding, etc, it was decided that Samba would be configured to export
+ * latin1/iso 8859-1 encodings as that covered >90% of the target markets for
+ * the product. Hence the "ascii-ci" casefolding code could be encoded into
+ * the XFS directory operations and remove all the overhead of casefolding from
+ * Samba.
+ *
+ * To provide consistent hashing behavior between the userspace and kernel,
+ * these functions prepare names for hashing by transforming specific bytes
+ * to other bytes. Robustness with other encodings is not guaranteed.
+ */
+static inline bool xfs_ascii_ci_need_xfrm(unsigned char c)
+{
+ if (c >= 0x41 && c <= 0x5a) /* A-Z */
+ return true;
+ if (c >= 0xc0 && c <= 0xd6) /* latin A-O with accents */
+ return true;
+ if (c >= 0xd8 && c <= 0xde) /* latin O-Y with accents */
+ return true;
+ return false;
+}
+
+static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
+{
+ if (xfs_ascii_ci_need_xfrm(c))
+ c -= 'A' - 'a';
+ return c;
+}
+
#endif /* __XFS_DIR2_H__ */
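To see how the ascii-ci transform feeds the directory hash, here is a small userspace reproduction of the xfs_ascii_ci_hashname() loop shown earlier, built from the helpers above. rol32() is re-implemented locally, the byte ranges come straight from xfs_ascii_ci_need_xfrm(), and none of the names below are part of the kernel interface.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Local stand-in for the kernel's rol32(). */
    static uint32_t rol32(uint32_t word, unsigned int shift)
    {
    	return (word << shift) | (word >> (32 - shift));
    }

    /* Same byte ranges as xfs_ascii_ci_need_xfrm(): A-Z plus latin1 accented uppercase. */
    static int need_xfrm(unsigned char c)
    {
    	return (c >= 0x41 && c <= 0x5a) ||
    	       (c >= 0xc0 && c <= 0xd6) ||
    	       (c >= 0xd8 && c <= 0xde);
    }

    static unsigned char ci_xfrm(unsigned char c)
    {
    	return need_xfrm(c) ? c - ('A' - 'a') : c;
    }

    /* Mirrors xfs_ascii_ci_hashname(): fold each transformed byte into the hash. */
    static uint32_t ci_hashname(const unsigned char *name, size_t len)
    {
    	uint32_t hash = 0;
    	size_t i;

    	for (i = 0; i < len; i++)
    		hash = ci_xfrm(name[i]) ^ rol32(hash, 7);
    	return hash;
    }

    int main(void)
    {
    	const char *a = "Makefile", *b = "MAKEFILE";

    	/* Both spellings hash identically under the ascii-ci transform. */
    	printf("%08x %08x\n",
    	       ci_hashname((const unsigned char *)a, strlen(a)),
    	       ci_hashname((const unsigned char *)b, strlen(b)));
    	return 0;
    }

The two printed hashes match, which is exactly the property the directory code relies on when it reports XFS_CMP_CASE for names that differ only in case.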
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 7ee292aecbeb..a16d5de16933 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -95,33 +95,25 @@ xfs_inobt_btrec_to_irec(
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_inobt_get_rec(
- struct xfs_btree_cur *cur,
- struct xfs_inobt_rec_incore *irec,
- int *stat)
+/* Simple checks for inode records. */
+xfs_failaddr_t
+xfs_inobt_check_irec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_inobt_rec_incore *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
- union xfs_btree_rec *rec;
- int error;
uint64_t realfree;
- error = xfs_btree_get_rec(cur, &rec, stat);
- if (error || *stat == 0)
- return error;
-
- xfs_inobt_btrec_to_irec(mp, rec, irec);
-
+ /* Record has to be properly aligned within the AG. */
if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
- goto out_bad_rec;
+ return __this_address;
+ if (!xfs_verify_agino(cur->bc_ag.pag,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
+ return __this_address;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
irec->ir_count > XFS_INODES_PER_CHUNK)
- goto out_bad_rec;
+ return __this_address;
if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
- goto out_bad_rec;
+ return __this_address;
/* if there are no holes, return the first available offset */
if (!xfs_inobt_issparse(irec->ir_holemask))
@@ -129,15 +121,23 @@ xfs_inobt_get_rec(
else
realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
if (hweight64(realfree) != irec->ir_freecount)
- goto out_bad_rec;
+ return __this_address;
- return 0;
+ return NULL;
+}
+
+static inline int
+xfs_inobt_complain_bad_rec(
+ struct xfs_btree_cur *cur,
+ xfs_failaddr_t fa,
+ const struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = cur->bc_mp;
-out_bad_rec:
xfs_warn(mp,
- "%s Inode BTree record corruption in AG %d detected!",
+ "%s Inode BTree record corruption in AG %d detected at %pS!",
cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
- cur->bc_ag.pag->pag_agno);
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -146,6 +146,32 @@ out_bad_rec:
}
/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *irec,
+ int *stat)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ union xfs_btree_rec *rec;
+ xfs_failaddr_t fa;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_inobt_btrec_to_irec(mp, rec, irec);
+ fa = xfs_inobt_check_irec(cur, irec);
+ if (fa)
+ return xfs_inobt_complain_bad_rec(cur, fa, irec);
+
+ return 0;
+}
+
+/*
* Insert a single inobt record. Cursor must already point to desired location.
*/
int
@@ -1952,8 +1978,6 @@ xfs_difree_inobt(
*/
if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
- struct xfs_perag *pag = agbp->b_pag;
-
xic->deleted = true;
xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
rec.ir_startino);
@@ -2617,44 +2641,50 @@ xfs_ialloc_read_agi(
return 0;
}
-/* Is there an inode record covering a given range of inode numbers? */
-int
-xfs_ialloc_has_inode_record(
- struct xfs_btree_cur *cur,
- xfs_agino_t low,
- xfs_agino_t high,
- bool *exists)
+/* How many inodes are backed by inode clusters ondisk? */
+STATIC int
+xfs_ialloc_count_ondisk(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t low,
+ xfs_agino_t high,
+ unsigned int *allocated)
{
struct xfs_inobt_rec_incore irec;
- xfs_agino_t agino;
- uint16_t holemask;
- int has_record;
- int i;
- int error;
+ unsigned int ret = 0;
+ int has_record;
+ int error;
- *exists = false;
error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
- while (error == 0 && has_record) {
+ if (error)
+ return error;
+
+ while (has_record) {
+ unsigned int i, hole_idx;
+
error = xfs_inobt_get_rec(cur, &irec, &has_record);
- if (error || irec.ir_startino > high)
+ if (error)
+ return error;
+ if (irec.ir_startino > high)
break;
- agino = irec.ir_startino;
- holemask = irec.ir_holemask;
- for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
- i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
- if (holemask & 1)
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (irec.ir_startino + i < low)
continue;
- if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
- agino <= high) {
- *exists = true;
- return 0;
- }
+ if (irec.ir_startino + i > high)
+ break;
+
+ hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
+ if (!(irec.ir_holemask & (1U << hole_idx)))
+ ret++;
}
error = xfs_btree_increment(cur, 0, &has_record);
+ if (error)
+ return error;
}
- return error;
+
+ *allocated = ret;
+ return 0;
}
/* Is there an inode record covering a given extent? */
@@ -2663,15 +2693,27 @@ xfs_ialloc_has_inodes_at_extent(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- bool *exists)
+ enum xbtree_recpacking *outcome)
{
- xfs_agino_t low;
- xfs_agino_t high;
+ xfs_agino_t agino;
+ xfs_agino_t last_agino;
+ unsigned int allocated;
+ int error;
+
+ agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+ last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
- low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
- high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
+ error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
+ if (error)
+ return error;
- return xfs_ialloc_has_inode_record(cur, low, high, exists);
+ if (allocated == 0)
+ *outcome = XBTREE_RECPACKING_EMPTY;
+ else if (allocated == last_agino - agino + 1)
+ *outcome = XBTREE_RECPACKING_FULL;
+ else
+ *outcome = XBTREE_RECPACKING_SPARSE;
+ return 0;
}
struct xfs_ialloc_count_inodes {
@@ -2688,8 +2730,13 @@ xfs_ialloc_count_inodes_rec(
{
struct xfs_inobt_rec_incore irec;
struct xfs_ialloc_count_inodes *ci = priv;
+ xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
+ fa = xfs_inobt_check_irec(cur, &irec);
+ if (fa)
+ return xfs_inobt_complain_bad_rec(cur, fa, &irec);
+
ci->count += irec.ir_count;
ci->freecount += irec.ir_freecount;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index ab8c30b4ec22..fe824bb04a09 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -93,10 +93,11 @@ union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
+xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
+ const struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
- xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
-int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
- xfs_agino_t high, bool *exists);
+ xfs_agblock_t bno, xfs_extlen_t len,
+ enum xbtree_recpacking *outcome);
int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
xfs_agino_t *freecount);
int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b28211d5a4c..5a945ae21b5d 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -156,9 +156,12 @@ __xfs_inobt_free_block(
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_fsblock_t fsbno;
+
xfs_inobt_mod_blockcount(cur, -1);
- return xfs_free_extent(cur->bc_tp,
- XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1,
&XFS_RMAP_OINFO_INOBT, resv);
}
@@ -266,10 +269,13 @@ STATIC int64_t
xfs_inobt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
+ ASSERT(!mask || mask->inobt.ir_startino);
+
return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
- be32_to_cpu(k2->inobt.ir_startino);
+ be32_to_cpu(k2->inobt.ir_startino);
}
static xfs_failaddr_t
@@ -380,6 +386,19 @@ xfs_inobt_recs_inorder(
be32_to_cpu(r2->inobt.ir_startino);
}
+STATIC enum xbtree_key_contig
+xfs_inobt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->inobt.ir_startino);
+
+ return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino),
+ be32_to_cpu(key2->inobt.ir_startino));
+}
+
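xbtree_key_contig() reduces the contiguity check to comparing two scalar keys. A hedged sketch of the assumed semantics (y == x + 1 is contiguous, further apart is a gap, anything else overlaps), using local names rather than the kernel helper:

#include <stdint.h>
#include <stdio.h>

enum key_contig { KEY_GAP, KEY_CONTIGUOUS, KEY_OVERLAP };

/*
 * Classify how key y relates to key x: exactly one past x is contiguous,
 * further away is a gap, otherwise the keyspaces overlap.
 */
static enum key_contig key_contig(uint64_t x, uint64_t y)
{
	if (y == x + 1)
		return KEY_CONTIGUOUS;
	if (y > x + 1)
		return KEY_GAP;
	return KEY_OVERLAP;
}

int main(void)
{
	printf("%d %d %d\n",
	       key_contig(9, 10),	/* CONTIGUOUS */
	       key_contig(9, 12),	/* GAP */
	       key_contig(9, 9));	/* OVERLAP */
	return 0;
}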
static const struct xfs_btree_ops xfs_inobt_ops = {
.rec_len = sizeof(xfs_inobt_rec_t),
.key_len = sizeof(xfs_inobt_key_t),
@@ -399,6 +418,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
+ .keys_contiguous = xfs_inobt_keys_contiguous,
};
static const struct xfs_btree_ops xfs_finobt_ops = {
@@ -420,6 +440,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
+ .keys_contiguous = xfs_inobt_keys_contiguous,
};
/*
@@ -447,9 +468,7 @@ xfs_inobt_init_common(
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- /* take a reference for the cursor */
- atomic_inc(&pag->pag_ref);
- cur->bc_ag.pag = pag;
+ cur->bc_ag.pag = xfs_perag_hold(pag);
return cur;
}
@@ -607,7 +626,7 @@ xfs_iallocbt_maxlevels_ondisk(void)
*/
uint64_t
xfs_inobt_irec_to_allocmask(
- struct xfs_inobt_rec_incore *rec)
+ const struct xfs_inobt_rec_incore *rec)
{
uint64_t bitmap = 0;
uint64_t inodespbit;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index e859a6e05230..3262c3fe5ebe 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -53,7 +53,7 @@ struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
-uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
#if defined(DEBUG) || defined(XFS_WARN)
int xfs_inobt_rec_check_count(struct xfs_mount *,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 6b21760184d9..5a2e7ddfa76d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -140,7 +140,8 @@ xfs_iformat_extents(
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(2)",
dp, sizeof(*dp), fa);
- return -EFSCORRUPTED;
+ return xfs_bmap_complain_bad_rec(ip, whichfork,
+ fa, &new);
}
xfs_iext_insert(ip, &icur, &new, state);
@@ -226,10 +227,15 @@ xfs_iformat_data_fork(
/*
* Initialize the extent count early, as the per-format routines may
- * depend on it.
+ * depend on it. Use release semantics to set needextents /after/ we
+ * set the format. This ensures that we can use acquire semantics on
+ * needextents in xfs_need_iread_extents() and be guaranteed to see a
+ * valid format value after that load.
*/
ip->i_df.if_format = dip->di_format;
ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+ smp_store_release(&ip->i_df.if_needextents,
+ ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -282,8 +288,17 @@ xfs_ifork_init_attr(
enum xfs_dinode_fmt format,
xfs_extnum_t nextents)
{
+ /*
+ * Initialize the extent count early, as the per-format routines may
+ * depend on it. Use release semantics to set needextents /after/ we
+ * set the format. This ensures that we can use acquire semantics on
+ * needextents in xfs_need_iread_extents() and be guaranteed to see a
+ * valid format value after that load.
+ */
ip->i_af.if_format = format;
ip->i_af.if_nextents = nextents;
+ smp_store_release(&ip->i_af.if_needextents,
+ ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index d3943d6ad0b9..96d307784c85 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -24,6 +24,7 @@ struct xfs_ifork {
xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
+ uint8_t if_needextents; /* extents have not been read */
};
/*
@@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
-static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
{
- return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+ /* see xfs_iformat_{data,attr}_fork() for needextents semantics */
+ return smp_load_acquire(&ifp->if_needextents) != 0;
}
#endif /* __XFS_INODE_FORK_H__ */
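The comments above describe a release/acquire pairing: the format field is written before needextents is published, so a reader that observes needextents via an acquire load is guaranteed to see a valid format. A minimal C11 stdatomic sketch of the same pairing; the variables are local stand-ins, not the kernel fields:

#include <stdatomic.h>
#include <stdint.h>
#include <pthread.h>
#include <stdio.h>

/* Stand-ins for if_format / if_needextents. */
static int8_t format;
static _Atomic uint8_t needextents;

static void *writer(void *arg)
{
	format = 3;	/* e.g. a "btree" format value */
	/* Release: publishes 'format' to any acquire load that sees this. */
	atomic_store_explicit(&needextents, 1, memory_order_release);
	return NULL;
}

static void *reader(void *arg)
{
	/* Acquire: if this reads 1, the earlier write to 'format' is visible. */
	if (atomic_load_explicit(&needextents, memory_order_acquire))
		printf("format=%d\n", format);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}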
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index bcf46aa0d08b..c1c65774dcc2 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -120,45 +120,41 @@ xfs_refcount_btrec_to_irec(
irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
}
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_refcount_get_rec(
+/* Simple checks for refcount records. */
+xfs_failaddr_t
+xfs_refcount_check_irec(
struct xfs_btree_cur *cur,
- struct xfs_refcount_irec *irec,
- int *stat)
+ const struct xfs_refcount_irec *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- union xfs_btree_rec *rec;
- int error;
-
- error = xfs_btree_get_rec(cur, &rec, stat);
- if (error || !*stat)
- return error;
- xfs_refcount_btrec_to_irec(rec, irec);
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
- goto out_bad_rec;
+ return __this_address;
if (!xfs_refcount_check_domain(irec))
- goto out_bad_rec;
+ return __this_address;
/* check for valid extent range, including overflow */
if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount))
- goto out_bad_rec;
+ return __this_address;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
- goto out_bad_rec;
+ return __this_address;
- trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
- return 0;
+ return NULL;
+}
+
+static inline int
+xfs_refcount_complain_bad_rec(
+ struct xfs_btree_cur *cur,
+ xfs_failaddr_t fa,
+ const struct xfs_refcount_irec *irec)
+{
+ struct xfs_mount *mp = cur->bc_mp;
-out_bad_rec:
xfs_warn(mp,
- "Refcount BTree record corruption in AG %d detected!",
- pag->pag_agno);
+ "Refcount BTree record corruption in AG %d detected at %pS!",
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -166,6 +162,32 @@ out_bad_rec:
}
/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ xfs_failaddr_t fa;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ xfs_refcount_btrec_to_irec(rec, irec);
+ fa = xfs_refcount_check_irec(cur, irec);
+ if (fa)
+ return xfs_refcount_complain_bad_rec(cur, fa, irec);
+
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ return 0;
+}
+
+/*
* Update the record referred to by cur to the value given
* by [bno, len, refcount].
* This either works (return 0) or gets an EFSCORRUPTED error.
@@ -1332,26 +1354,22 @@ xfs_refcount_finish_one(
xfs_agblock_t bno;
unsigned long nr_ops = 0;
int shape_changes = 0;
- struct xfs_perag *pag;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
ri->ri_blockcount);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
- error = -EIO;
- goto out_drop;
- }
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ return -EIO;
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
nr_ops = rcur->bc_ag.refc.nr_ops;
shape_changes = rcur->bc_ag.refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
@@ -1359,12 +1377,12 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
- &agbp);
+ error = xfs_alloc_read_agf(ri->ri_pag, tp,
+ XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
- goto out_drop;
+ return error;
- rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
+ rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
rcur->bc_ag.refc.nr_ops = nr_ops;
rcur->bc_ag.refc.shape_changes = shape_changes;
}
@@ -1375,7 +1393,7 @@ xfs_refcount_finish_one(
error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
XFS_REFCOUNT_ADJUST_INCREASE);
if (error)
- goto out_drop;
+ return error;
if (ri->ri_blockcount > 0)
error = xfs_refcount_continue_op(rcur, ri, bno);
break;
@@ -1383,31 +1401,29 @@ xfs_refcount_finish_one(
error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
XFS_REFCOUNT_ADJUST_DECREASE);
if (error)
- goto out_drop;
+ return error;
if (ri->ri_blockcount > 0)
error = xfs_refcount_continue_op(rcur, ri, bno);
break;
case XFS_REFCOUNT_ALLOC_COW:
error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
if (error)
- goto out_drop;
+ return error;
ri->ri_blockcount = 0;
break;
case XFS_REFCOUNT_FREE_COW:
error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
if (error)
- goto out_drop;
+ return error;
ri->ri_blockcount = 0;
break;
default:
ASSERT(0);
- error = -EFSCORRUPTED;
+ return -EFSCORRUPTED;
}
if (!error && ri->ri_blockcount > 0)
- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno,
+ trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno,
ri->ri_type, bno, ri->ri_blockcount);
-out_drop:
- xfs_perag_put(pag);
return error;
}
@@ -1435,6 +1451,7 @@ __xfs_refcount_add(
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
+ xfs_refcount_update_get_group(tp->t_mountp, ri);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
}
@@ -1876,7 +1893,8 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (XFS_IS_CORRUPT(cur->bc_mp,
+ if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
+ XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
kfree(rr);
return -EFSCORRUPTED;
@@ -1980,14 +1998,17 @@ out_free:
return error;
}
-/* Is there a record covering a given extent? */
+/*
+ * Scan part of the keyspace of the refcount records and tell us if the area
+ * has no records, is fully mapped by records, or is partially filled.
+ */
int
-xfs_refcount_has_record(
+xfs_refcount_has_records(
struct xfs_btree_cur *cur,
enum xfs_refc_domain domain,
xfs_agblock_t bno,
xfs_extlen_t len,
- bool *exists)
+ enum xbtree_recpacking *outcome)
{
union xfs_btree_irec low;
union xfs_btree_irec high;
@@ -1998,7 +2019,7 @@ xfs_refcount_has_record(
high.rc.rc_startblock = bno + len - 1;
low.rc.rc_domain = high.rc.rc_domain = domain;
- return xfs_btree_has_record(cur, &low, &high, exists);
+ return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
int __init
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index c633477ce3ce..783cd89ca195 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -50,6 +50,7 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
+ struct xfs_perag *ri_pag;
enum xfs_refcount_intent_type ri_type;
xfs_extlen_t ri_blockcount;
xfs_fsblock_t ri_startblock;
@@ -67,6 +68,9 @@ xfs_refcount_check_domain(
return true;
}
+void xfs_refcount_update_get_group(struct xfs_mount *mp,
+ struct xfs_refcount_intent *ri);
+
void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -107,12 +111,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
-extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
+extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
enum xfs_refc_domain domain, xfs_agblock_t bno,
- xfs_extlen_t len, bool *exists);
+ xfs_extlen_t len, enum xbtree_recpacking *outcome);
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
+xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index f3b860970b26..d4afc5f4e6a5 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -112,8 +112,9 @@ xfs_refcountbt_free_block(
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
- error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
- XFS_AG_RESV_METADATA);
+ error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag,
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1,
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
if (error)
return error;
@@ -201,10 +202,13 @@ STATIC int64_t
xfs_refcountbt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ be32_to_cpu(k2->refc.rc_startblock);
}
STATIC xfs_failaddr_t
@@ -299,6 +303,19 @@ xfs_refcountbt_recs_inorder(
be32_to_cpu(r2->refc.rc_startblock);
}
+STATIC enum xbtree_key_contig
+xfs_refcountbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->refc.rc_startblock);
+
+ return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock),
+ be32_to_cpu(key2->refc.rc_startblock));
+}
+
static const struct xfs_btree_ops xfs_refcountbt_ops = {
.rec_len = sizeof(struct xfs_refcount_rec),
.key_len = sizeof(struct xfs_refcount_key),
@@ -318,6 +335,7 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
.diff_two_keys = xfs_refcountbt_diff_two_keys,
.keys_inorder = xfs_refcountbt_keys_inorder,
.recs_inorder = xfs_refcountbt_recs_inorder,
+ .keys_contiguous = xfs_refcountbt_keys_contiguous,
};
/*
@@ -339,10 +357,7 @@ xfs_refcountbt_init_common(
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- /* take a reference for the cursor */
- atomic_inc(&pag->pag_ref);
- cur->bc_ag.pag = pag;
-
+ cur->bc_ag.pag = xfs_perag_hold(pag);
cur->bc_ag.refc.nr_ops = 0;
cur->bc_ag.refc.shape_changes = 0;
cur->bc_ops = &xfs_refcountbt_ops;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index df720041cd3d..f4dc23b3b837 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -193,7 +193,7 @@ done:
}
/* Convert an internal btree record to an rmap record. */
-int
+xfs_failaddr_t
xfs_rmap_btrec_to_irec(
const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec)
@@ -205,51 +205,74 @@ xfs_rmap_btrec_to_irec(
irec);
}
-/*
- * Get the data from the pointed-to record.
- */
-int
-xfs_rmap_get_rec(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *irec,
- int *stat)
+/* Simple checks for rmap records. */
+xfs_failaddr_t
+xfs_rmap_check_irec(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
- union xfs_btree_rec *rec;
- int error;
-
- error = xfs_btree_get_rec(cur, &rec, stat);
- if (error || !*stat)
- return error;
-
- if (xfs_rmap_btrec_to_irec(rec, irec))
- goto out_bad_rec;
+ struct xfs_mount *mp = cur->bc_mp;
+ bool is_inode;
+ bool is_unwritten;
+ bool is_bmbt;
+ bool is_attr;
if (irec->rm_blockcount == 0)
- goto out_bad_rec;
+ return __this_address;
if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) {
if (irec->rm_owner != XFS_RMAP_OWN_FS)
- goto out_bad_rec;
+ return __this_address;
if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
- goto out_bad_rec;
+ return __this_address;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbext(pag, irec->rm_startblock,
- irec->rm_blockcount))
- goto out_bad_rec;
+ if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
+ irec->rm_blockcount))
+ return __this_address;
}
if (!(xfs_verify_ino(mp, irec->rm_owner) ||
(irec->rm_owner <= XFS_RMAP_OWN_FS &&
irec->rm_owner >= XFS_RMAP_OWN_MIN)))
- goto out_bad_rec;
+ return __this_address;
+
+ /* Check flags. */
+ is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
+ is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK;
+ is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK;
+ is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN;
+
+ if (is_bmbt && irec->rm_offset != 0)
+ return __this_address;
+
+ if (!is_inode && irec->rm_offset != 0)
+ return __this_address;
+
+ if (is_unwritten && (is_bmbt || !is_inode || is_attr))
+ return __this_address;
+
+ if (!is_inode && (is_bmbt || is_unwritten || is_attr))
+ return __this_address;
+
+ /* Check for a valid fork offset, if applicable. */
+ if (is_inode && !is_bmbt &&
+ !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount))
+ return __this_address;
+
+ return NULL;
+}
+
+static inline int
+xfs_rmap_complain_bad_rec(
+ struct xfs_btree_cur *cur,
+ xfs_failaddr_t fa,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_mount *mp = cur->bc_mp;
- return 0;
-out_bad_rec:
xfs_warn(mp,
- "Reverse Mapping BTree record corruption in AG %d detected!",
- pag->pag_agno);
+ "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+ cur->bc_ag.pag->pag_agno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -257,6 +280,32 @@ out_bad_rec:
return -EFSCORRUPTED;
}
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_rmap_get_rec(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *irec,
+ int *stat)
+{
+ union xfs_btree_rec *rec;
+ xfs_failaddr_t fa;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (error || !*stat)
+ return error;
+
+ fa = xfs_rmap_btrec_to_irec(rec, irec);
+ if (!fa)
+ fa = xfs_rmap_check_irec(cur, irec);
+ if (fa)
+ return xfs_rmap_complain_bad_rec(cur, fa, irec);
+
+ return 0;
+}
+
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
@@ -2320,11 +2369,14 @@ xfs_rmap_query_range_helper(
{
struct xfs_rmap_query_range_info *query = priv;
struct xfs_rmap_irec irec;
- int error;
+ xfs_failaddr_t fa;
+
+ fa = xfs_rmap_btrec_to_irec(rec, &irec);
+ if (!fa)
+ fa = xfs_rmap_check_irec(cur, &irec);
+ if (fa)
+ return xfs_rmap_complain_bad_rec(cur, fa, &irec);
- error = xfs_rmap_btrec_to_irec(rec, &irec);
- if (error)
- return error;
return query->fn(cur, &irec, query->priv);
}
@@ -2394,7 +2446,6 @@ xfs_rmap_finish_one(
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
struct xfs_btree_cur *rcur;
struct xfs_buf *agbp = NULL;
int error = 0;
@@ -2402,26 +2453,22 @@ xfs_rmap_finish_one(
xfs_agblock_t bno;
bool unwritten;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock));
bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
- trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno,
+ trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno,
ri->ri_owner, ri->ri_whichfork,
ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
ri->ri_bmap.br_state);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
- error = -EIO;
- goto out_drop;
- }
-
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+ return -EIO;
/*
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_ag.pag != pag) {
+ if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
xfs_rmap_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -2432,15 +2479,13 @@ xfs_rmap_finish_one(
* rmapbt, because a shape change could cause us to
* allocate blocks.
*/
- error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
+ error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
if (error)
- goto out_drop;
- if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
- error = -EFSCORRUPTED;
- goto out_drop;
- }
+ return error;
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+ return -EFSCORRUPTED;
- rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+ rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
*pcur = rcur;
@@ -2480,8 +2525,7 @@ xfs_rmap_finish_one(
ASSERT(0);
error = -EFSCORRUPTED;
}
-out_drop:
- xfs_perag_put(pag);
+
return error;
}
@@ -2526,6 +2570,7 @@ __xfs_rmap_add(
ri->ri_whichfork = whichfork;
ri->ri_bmap = *bmap;
+ xfs_rmap_update_get_group(tp->t_mountp, ri);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
}
@@ -2664,14 +2709,21 @@ xfs_rmap_compare(
return 0;
}
-/* Is there a record covering a given extent? */
+/*
+ * Scan the physical storage part of the keyspace of the reverse mapping index
+ * and tell us if the area has no records, is fully mapped by records, or is
+ * partially filled.
+ */
int
-xfs_rmap_has_record(
+xfs_rmap_has_records(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- bool *exists)
+ enum xbtree_recpacking *outcome)
{
+ union xfs_btree_key mask = {
+ .rmap.rm_startblock = cpu_to_be32(-1U),
+ };
union xfs_btree_irec low;
union xfs_btree_irec high;
@@ -2680,68 +2732,144 @@ xfs_rmap_has_record(
memset(&high, 0xFF, sizeof(high));
high.r.rm_startblock = bno + len - 1;
- return xfs_btree_has_record(cur, &low, &high, exists);
+ return xfs_btree_has_records(cur, &low, &high, &mask, outcome);
}
-/*
- * Is there a record for this owner completely covering a given physical
- * extent? If so, *has_rmap will be set to true. If there is no record
- * or the record only covers part of the range, we set *has_rmap to false.
- * This function doesn't perform range lookups or offset checks, so it is
- * not suitable for checking data fork blocks.
- */
-int
-xfs_rmap_record_exists(
- struct xfs_btree_cur *cur,
+struct xfs_rmap_ownercount {
+ /* Owner that we're looking for. */
+ struct xfs_rmap_irec good;
+
+ /* rmap search keys */
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+
+ struct xfs_rmap_matches *results;
+
+ /* Stop early if we find a nonmatch? */
+ bool stop_on_nonmatch;
+};
+
+/* Does this rmap represent space that can have multiple owners? */
+static inline bool
+xfs_rmap_shareable(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rmap)
+{
+ if (!xfs_has_reflink(mp))
+ return false;
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+ if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK |
+ XFS_RMAP_BMBT_BLOCK))
+ return false;
+ return true;
+}
+
+static inline void
+xfs_rmap_ownercount_init(
+ struct xfs_rmap_ownercount *roc,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
- bool *has_rmap)
+ struct xfs_rmap_matches *results)
{
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
- int has_record;
- struct xfs_rmap_irec irec;
- int error;
+ memset(roc, 0, sizeof(*roc));
+ roc->results = results;
+
+ roc->low.rm_startblock = bno;
+ memset(&roc->high, 0xFF, sizeof(roc->high));
+ roc->high.rm_startblock = bno + len - 1;
+
+ memset(results, 0, sizeof(*results));
+ roc->good.rm_startblock = bno;
+ roc->good.rm_blockcount = len;
+ roc->good.rm_owner = oinfo->oi_owner;
+ roc->good.rm_offset = oinfo->oi_offset;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+ roc->good.rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+ roc->good.rm_flags |= XFS_RMAP_BMBT_BLOCK;
+}
- xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
- ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
- (flags & XFS_RMAP_BMBT_BLOCK));
+/* Figure out if this is a match for the owner. */
+STATIC int
+xfs_rmap_count_owners_helper(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_rmap_ownercount *roc = priv;
+ struct xfs_rmap_irec check = *rec;
+ unsigned int keyflags;
+ bool filedata;
+ int64_t delta;
+
+ filedata = !XFS_RMAP_NON_INODE_OWNER(check.rm_owner) &&
+ !(check.rm_flags & XFS_RMAP_BMBT_BLOCK);
+
+ /* Trim the part of check that comes before the comparison range. */
+ delta = (int64_t)roc->good.rm_startblock - check.rm_startblock;
+ if (delta > 0) {
+ check.rm_startblock += delta;
+ check.rm_blockcount -= delta;
+ if (filedata)
+ check.rm_offset += delta;
+ }
- error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
- &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
+ /* Trim the part of check that comes after the comparison range. */
+ delta = (check.rm_startblock + check.rm_blockcount) -
+ (roc->good.rm_startblock + roc->good.rm_blockcount);
+ if (delta > 0)
+ check.rm_blockcount -= delta;
+
+ /* Don't care about unwritten status for establishing ownership. */
+ keyflags = check.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK);
+
+ if (check.rm_startblock == roc->good.rm_startblock &&
+ check.rm_blockcount == roc->good.rm_blockcount &&
+ check.rm_owner == roc->good.rm_owner &&
+ check.rm_offset == roc->good.rm_offset &&
+ keyflags == roc->good.rm_flags) {
+ roc->results->matches++;
+ } else {
+ roc->results->non_owner_matches++;
+ if (xfs_rmap_shareable(cur->bc_mp, &roc->good) ^
+ xfs_rmap_shareable(cur->bc_mp, &check))
+ roc->results->bad_non_owner_matches++;
}
- *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
- irec.rm_startblock + irec.rm_blockcount >= bno + len);
+ if (roc->results->non_owner_matches && roc->stop_on_nonmatch)
+ return -ECANCELED;
+
return 0;
}
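The helper above trims each rmap record to the query window before comparing it against the expected owner. A standalone sketch of that trimming step with a simplified extent struct (the field names are illustrative only):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct ext {
	uint32_t startblock;
	uint32_t blockcount;
	uint64_t offset;	/* only meaningful for file data mappings */
};

/* Clamp *e to the window [wstart, wstart + wlen); keep offset in sync. */
static void trim_to_window(struct ext *e, uint32_t wstart, uint32_t wlen,
			   bool filedata)
{
	int64_t delta;

	/* Trim the part that comes before the window. */
	delta = (int64_t)wstart - e->startblock;
	if (delta > 0) {
		e->startblock += delta;
		e->blockcount -= delta;
		if (filedata)
			e->offset += delta;
	}

	/* Trim the part that extends past the window. */
	delta = (int64_t)(e->startblock + e->blockcount) - (wstart + wlen);
	if (delta > 0)
		e->blockcount -= delta;
}

int main(void)
{
	struct ext e = { .startblock = 90, .blockcount = 30, .offset = 1000 };

	trim_to_window(&e, 100, 10, true);
	/* Expect: startblock=100 blockcount=10 offset=1010 */
	printf("%u %u %llu\n", e.startblock, e.blockcount,
	       (unsigned long long)e.offset);
	return 0;
}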
-struct xfs_rmap_key_state {
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
-};
-
-/* For each rmap given, figure out if it doesn't match the key we want. */
-STATIC int
-xfs_rmap_has_other_keys_helper(
+/* Count the number of owners and non-owners of this range of blocks. */
+int
+xfs_rmap_count_owners(
struct xfs_btree_cur *cur,
- const struct xfs_rmap_irec *rec,
- void *priv)
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ struct xfs_rmap_matches *results)
{
- struct xfs_rmap_key_state *rks = priv;
+ struct xfs_rmap_ownercount roc;
+ int error;
- if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
- ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
- return 0;
- return -ECANCELED;
+ xfs_rmap_ownercount_init(&roc, bno, len, oinfo, results);
+ error = xfs_rmap_query_range(cur, &roc.low, &roc.high,
+ xfs_rmap_count_owners_helper, &roc);
+ if (error)
+ return error;
+
+ /*
+ * There can't be any non-owner rmaps that conflict with the given
+ * owner if we didn't find any rmaps matching the owner.
+ */
+ if (!results->matches)
+ results->bad_non_owner_matches = 0;
+
+ return 0;
}
/*
@@ -2754,28 +2882,26 @@ xfs_rmap_has_other_keys(
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
- bool *has_rmap)
+ bool *has_other)
{
- struct xfs_rmap_irec low = {0};
- struct xfs_rmap_irec high;
- struct xfs_rmap_key_state rks;
+ struct xfs_rmap_matches res;
+ struct xfs_rmap_ownercount roc;
int error;
- xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
- *has_rmap = false;
-
- low.rm_startblock = bno;
- memset(&high, 0xFF, sizeof(high));
- high.rm_startblock = bno + len - 1;
+ xfs_rmap_ownercount_init(&roc, bno, len, oinfo, &res);
+ roc.stop_on_nonmatch = true;
- error = xfs_rmap_query_range(cur, &low, &high,
- xfs_rmap_has_other_keys_helper, &rks);
+ error = xfs_rmap_query_range(cur, &roc.low, &roc.high,
+ xfs_rmap_count_owners_helper, &roc);
if (error == -ECANCELED) {
- *has_rmap = true;
+ *has_other = true;
return 0;
}
+ if (error)
+ return error;
- return error;
+ *has_other = false;
+ return 0;
}
const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 2dac88cea28d..3c98d9d50afb 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -62,13 +62,14 @@ xfs_rmap_irec_offset_pack(
return x;
}
-static inline int
+static inline xfs_failaddr_t
xfs_rmap_irec_offset_unpack(
__u64 offset,
struct xfs_rmap_irec *irec)
{
if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
- return -EFSCORRUPTED;
+ return __this_address;
+
irec->rm_offset = XFS_RMAP_OFF(offset);
irec->rm_flags = 0;
if (offset & XFS_RMAP_OFF_ATTR_FORK)
@@ -77,7 +78,7 @@ xfs_rmap_irec_offset_unpack(
irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
if (offset & XFS_RMAP_OFF_UNWRITTEN)
irec->rm_flags |= XFS_RMAP_UNWRITTEN;
- return 0;
+ return NULL;
}
static inline void
@@ -162,8 +163,12 @@ struct xfs_rmap_intent {
int ri_whichfork;
uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
+ struct xfs_perag *ri_pag;
};
+void xfs_rmap_update_get_group(struct xfs_mount *mp,
+ struct xfs_rmap_intent *ri);
+
/* functions for updating the rmapbt based on bmbt map/unmap operations */
void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
@@ -188,16 +193,31 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_compare(const struct xfs_rmap_irec *a,
const struct xfs_rmap_irec *b);
union xfs_btree_rec;
-int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
+xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec);
-int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, bool *exists);
-int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec);
+
+int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, enum xbtree_recpacking *outcome);
+
+struct xfs_rmap_matches {
+ /* Number of owner matches. */
+ unsigned long long matches;
+
+ /* Number of non-owner matches. */
+ unsigned long long non_owner_matches;
+
+ /* Number of non-owner matches that conflict with the owner matches. */
+ unsigned long long bad_non_owner_matches;
+};
+
+int xfs_rmap_count_owners(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, const struct xfs_owner_info *oinfo,
- bool *has_rmap);
+ struct xfs_rmap_matches *rmatch);
int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, const struct xfs_owner_info *oinfo,
- bool *has_rmap);
+ bool *has_other);
int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index d3285684bb5e..6c81b20e97d2 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs(
return cur->bc_mp->m_rmap_mxr[level != 0];
}
+/*
+ * Convert the ondisk record's offset field into the ondisk key's offset field.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
+{
+ return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+}
+
STATIC void
xfs_rmapbt_init_key_from_rec(
union xfs_btree_key *key,
@@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec(
{
key->rmap.rm_startblock = rec->rmap.rm_startblock;
key->rmap.rm_owner = rec->rmap.rm_owner;
- key->rmap.rm_offset = rec->rmap.rm_offset;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}
/*
@@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec(
key->rmap.rm_startblock = rec->rmap.rm_startblock;
be32_add_cpu(&key->rmap.rm_startblock, adj);
key->rmap.rm_owner = rec->rmap.rm_owner;
- key->rmap.rm_offset = rec->rmap.rm_offset;
+ key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
return;
@@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur(
ptr->s = agf->agf_roots[cur->bc_btnum];
}
+/*
+ * Mask the appropriate parts of the ondisk key field for a key comparison.
+ * Fork and bmbt are significant parts of the rmap record key, but written
+ * status is merely a record attribute.
+ */
+static inline uint64_t offset_keymask(uint64_t offset)
+{
+ return offset & ~XFS_RMAP_OFF_UNWRITTEN;
+}
+
STATIC int64_t
xfs_rmapbt_key_diff(
struct xfs_btree_cur *cur,
@@ -240,8 +260,8 @@ xfs_rmapbt_key_diff(
else if (y > x)
return -1;
- x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
- y = rec->rm_offset;
+ x = offset_keymask(be64_to_cpu(kp->rm_offset));
+ y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
if (x > y)
return 1;
else if (y > x)
@@ -253,31 +273,43 @@ STATIC int64_t
xfs_rmapbt_diff_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
- const union xfs_btree_key *k2)
+ const union xfs_btree_key *k2,
+ const union xfs_btree_key *mask)
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
int64_t d;
__u64 x, y;
+ /* Doesn't make sense to mask off the physical space part */
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ be32_to_cpu(kp2->rm_startblock);
if (d)
return d;
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ if (!mask || mask->rmap.rm_owner) {
+ x = be64_to_cpu(kp1->rm_owner);
+ y = be64_to_cpu(kp2->rm_owner);
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
+
+ if (!mask || mask->rmap.rm_offset) {
+ /* Doesn't make sense to allow offset but not owner */
+ ASSERT(!mask || mask->rmap.rm_owner);
+
+ x = offset_keymask(be64_to_cpu(kp1->rm_offset));
+ y = offset_keymask(be64_to_cpu(kp2->rm_offset));
+ if (x > y)
+ return 1;
+ else if (y > x)
+ return -1;
+ }
- x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
- y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
return 0;
}
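The masked comparison lets callers restrict diff_two_keys to the key fields they care about (startblock always; owner and offset optionally), and the unwritten bit is masked out of the offset because it is a record attribute rather than key material. A small sketch of a masked three-field key comparison under those assumptions; the struct, flag bit, and helper names are local illustrations:

#include <stdint.h>
#include <stdio.h>

#define OFF_UNWRITTEN	(1ULL << 63)	/* assumed flag bit, for illustration */

struct key {
	uint32_t startblock;
	uint64_t owner;
	uint64_t offset;
};

/* Compare only the offset bits that are part of the key. */
static uint64_t offset_keymask(uint64_t offset)
{
	return offset & ~OFF_UNWRITTEN;
}

/*
 * Compare k1 and k2, restricted to the fields with a nonzero counterpart in
 * *mask (mask == NULL means compare everything).  Returns <0, 0, or >0.
 */
static int64_t diff_two_keys(const struct key *k1, const struct key *k2,
			     const struct key *mask)
{
	int64_t d = (int64_t)k1->startblock - k2->startblock;

	if (d)
		return d;

	if (!mask || mask->owner) {
		if (k1->owner != k2->owner)
			return k1->owner > k2->owner ? 1 : -1;
	}

	if (!mask || mask->offset) {
		uint64_t x = offset_keymask(k1->offset);
		uint64_t y = offset_keymask(k2->offset);

		if (x != y)
			return x > y ? 1 : -1;
	}

	return 0;
}

int main(void)
{
	struct key a = { 10, 5, 0 | OFF_UNWRITTEN };
	struct key b = { 10, 7, 0 };
	struct key only_startblock = { .startblock = 1 };	/* mask */

	printf("%lld %lld\n",
	       (long long)diff_two_keys(&a, &b, NULL),		 /* nonzero: owners differ */
	       (long long)diff_two_keys(&a, &b, &only_startblock)); /* 0: same startblock */
	return 0;
}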
@@ -387,8 +419,8 @@ xfs_rmapbt_keys_inorder(
return 1;
else if (a > b)
return 0;
- a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
- b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
+ a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
if (a <= b)
return 1;
return 0;
@@ -417,13 +449,33 @@ xfs_rmapbt_recs_inorder(
return 1;
else if (a > b)
return 0;
- a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
- b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
+ a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
+ b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
if (a <= b)
return 1;
return 0;
}
+STATIC enum xbtree_key_contig
+xfs_rmapbt_keys_contiguous(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask)
+{
+ ASSERT(!mask || mask->rmap.rm_startblock);
+
+ /*
+ * We only support checking contiguity of the physical space component.
+ * If any callers ever need more specificity than that, they'll have to
+ * implement it here.
+ */
+ ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
+
+ return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
+ be32_to_cpu(key2->rmap.rm_startblock));
+}
+
static const struct xfs_btree_ops xfs_rmapbt_ops = {
.rec_len = sizeof(struct xfs_rmap_rec),
.key_len = 2 * sizeof(struct xfs_rmap_key),
@@ -443,6 +495,7 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
.diff_two_keys = xfs_rmapbt_diff_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
+ .keys_contiguous = xfs_rmapbt_keys_contiguous,
};
static struct xfs_btree_cur *
@@ -460,10 +513,7 @@ xfs_rmapbt_init_common(
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_ops = &xfs_rmapbt_ops;
- /* take a reference for the cursor */
- atomic_inc(&pag->pag_ref);
- cur->bc_ag.pag = pag;
-
+ cur->bc_ag.pag = xfs_perag_hold(pag);
return cur;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 99cc03a298e2..ba0f17bc1dc0 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -72,7 +72,8 @@ xfs_sb_validate_v5_features(
}
/*
- * We support all XFS versions newer than a v4 superblock with V2 directories.
+ * We currently support XFS v5 formats with known features and v4 superblocks with
+ * at least V2 directories.
*/
bool
xfs_sb_good_version(
@@ -86,16 +87,16 @@ xfs_sb_good_version(
if (xfs_sb_is_v5(sbp))
return xfs_sb_validate_v5_features(sbp);
+ /* versions prior to v4 are not supported */
+ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4)
+ return false;
+
/* We must not have any unknown v4 feature bits set */
if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
(sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
return false;
- /* versions prior to v4 are not supported */
- if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
- return false;
-
/* V4 filesystems need v2 directories and unwritten extents */
if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 5ebdda7e1078..851220021484 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -204,6 +204,18 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_RMAPBT,
};
+/* Results of scanning a btree keyspace to check occupancy. */
+enum xbtree_recpacking {
+ /* None of the keyspace maps to records. */
+ XBTREE_RECPACKING_EMPTY = 0,
+
+ /* Some, but not all, of the keyspace maps to records. */
+ XBTREE_RECPACKING_SPARSE,
+
+ /* The entire keyspace maps to records. */
+ XBTREE_RECPACKING_FULL,
+};
+
/*
* Type verifier functions
*/
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 4dd52b15f09c..6c6e5eba42c8 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -18,6 +18,15 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
+int
+xchk_setup_agheader(
+ struct xfs_scrub *sc)
+{
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ return xchk_setup_fs(sc);
+}
+
/* Superblock */
/* Cross-reference with the other btrees. */
@@ -42,8 +51,9 @@ xchk_superblock_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
/* scrub teardown will take care of sc->sa for us */
}
@@ -505,9 +515,10 @@ xchk_agf_xref(
xchk_agf_xref_freeblks(sc);
xchk_agf_xref_cntbt(sc);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_agf_xref_btreeblks(sc);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
xchk_agf_xref_refcblks(sc);
/* scrub teardown will take care of sc->sa for us */
@@ -633,8 +644,9 @@ xchk_agfl_block_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
}
/* Scrub an AGFL block. */
@@ -689,8 +701,9 @@ xchk_agfl_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
/*
* Scrub teardown will take care of sc->sa for us. Leave sc->sa
@@ -844,8 +857,9 @@ xchk_agi_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
xchk_agi_xref_icounts(sc);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
xchk_agi_xref_fiblocks(sc);
/* scrub teardown will take care of sc->sa for us */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index c37e6d72760b..bbaa65422c4f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2018 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -487,10 +487,11 @@ xrep_agfl_walk_rmap(
/* Strike out the blocks that are cross-linked according to the rmapbt. */
STATIC int
xrep_agfl_check_extent(
- struct xrep_agfl *ra,
uint64_t start,
- uint64_t len)
+ uint64_t len,
+ void *priv)
{
+ struct xrep_agfl *ra = priv;
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
@@ -538,7 +539,6 @@ xrep_agfl_collect_blocks(
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
- struct xbitmap_range *br, *n;
int error;
ra.sc = sc;
@@ -579,11 +579,7 @@ xrep_agfl_collect_blocks(
/* Strike out the blocks that are cross-linked. */
ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
- for_each_xbitmap_extent(br, n, agfl_extents) {
- error = xrep_agfl_check_extent(&ra, br->start, br->len);
- if (error)
- break;
- }
+ error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
xfs_btree_del_cursor(ra.rmap_cur, error);
if (error)
goto out_bmp;
@@ -629,21 +625,58 @@ xrep_agfl_update_agf(
XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
}
+struct xrep_agfl_fill {
+ struct xbitmap used_extents;
+ struct xfs_scrub *sc;
+ __be32 *agfl_bno;
+ xfs_agblock_t flcount;
+ unsigned int fl_off;
+};
+
+/* Fill the AGFL with whatever blocks are in this extent. */
+static int
+xrep_agfl_fill(
+ uint64_t start,
+ uint64_t len,
+ void *priv)
+{
+ struct xrep_agfl_fill *af = priv;
+ struct xfs_scrub *sc = af->sc;
+ xfs_fsblock_t fsbno = start;
+ int error;
+
+ while (fsbno < start + len && af->fl_off < af->flcount)
+ af->agfl_bno[af->fl_off++] =
+ cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++));
+
+ trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno,
+ XFS_FSB_TO_AGBNO(sc->mp, start), len);
+
+ error = xbitmap_set(&af->used_extents, start, fsbno - 1);
+ if (error)
+ return error;
+
+ if (af->fl_off == af->flcount)
+ return -ECANCELED;
+
+ return 0;
+}
+
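xrep_agfl_fill() copies blocks into the new AGFL array until it is full and then returns -ECANCELED so the bitmap walk stops early. A toy sketch of that callback-and-cancel pattern, with a local walker standing in for xbitmap_walk():

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct fill {
	uint32_t *list;		/* output array (the "AGFL") */
	unsigned int count;	/* capacity */
	unsigned int off;	/* next free slot */
};

/* Copy blocks [start, start + len) into the list; cancel the walk when full. */
static int fill_fn(uint64_t start, uint64_t len, void *priv)
{
	struct fill *f = priv;
	uint64_t bno = start;

	while (bno < start + len && f->off < f->count)
		f->list[f->off++] = (uint32_t)bno++;

	return f->off == f->count ? -ECANCELED : 0;
}

/* Toy extent walker: -ECANCELED from the callback stops the walk cleanly. */
static int walk(const uint64_t (*extents)[2], unsigned int n,
		int (*fn)(uint64_t, uint64_t, void *), void *priv)
{
	unsigned int i;
	int error;

	for (i = 0; i < n; i++) {
		error = fn(extents[i][0], extents[i][1], priv);
		if (error)
			return error == -ECANCELED ? 0 : error;
	}
	return 0;
}

int main(void)
{
	uint32_t agfl[4];
	const uint64_t extents[][2] = { { 100, 3 }, { 200, 5 } };
	struct fill f = { .list = agfl, .count = 4 };

	walk(extents, 2, fill_fn, &f);
	printf("%u %u %u %u\n", agfl[0], agfl[1], agfl[2], agfl[3]);
	return 0;
}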
/* Write out a totally new AGFL. */
-STATIC void
+STATIC int
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
struct xbitmap *agfl_extents,
xfs_agblock_t flcount)
{
+ struct xrep_agfl_fill af = {
+ .sc = sc,
+ .flcount = flcount,
+ };
struct xfs_mount *mp = sc->mp;
- __be32 *agfl_bno;
- struct xbitmap_range *br;
- struct xbitmap_range *n;
struct xfs_agfl *agfl;
- xfs_agblock_t agbno;
- unsigned int fl_off;
+ int error;
ASSERT(flcount <= xfs_agfl_size(mp));
@@ -662,36 +695,18 @@ xrep_agfl_init_header(
* blocks than fit in the AGFL, they will be freed in a subsequent
* step.
*/
- fl_off = 0;
- agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
- for_each_xbitmap_extent(br, n, agfl_extents) {
- agbno = XFS_FSB_TO_AGBNO(mp, br->start);
-
- trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno,
- br->len);
-
- while (br->len > 0 && fl_off < flcount) {
- agfl_bno[fl_off] = cpu_to_be32(agbno);
- fl_off++;
- agbno++;
-
- /*
- * We've now used br->start by putting it in the AGFL,
- * so bump br so that we don't reap the block later.
- */
- br->start++;
- br->len--;
- }
-
- if (br->len)
- break;
- list_del(&br->list);
- kfree(br);
- }
+ xbitmap_init(&af.used_extents);
+	af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
+ xbitmap_walk(agfl_extents, xrep_agfl_fill, &af);
+ error = xbitmap_disunion(agfl_extents, &af.used_extents);
+ if (error)
+ return error;
/* Write new AGFL to disk. */
xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
+ xbitmap_destroy(&af.used_extents);
+ return 0;
}
/* Repair the AGFL. */
@@ -744,7 +759,9 @@ xrep_agfl(
* buffers until we know that part works.
*/
xrep_agfl_update_agf(sc, agf_bp, flcount);
- xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+ error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount);
+ if (error)
+ goto err;
/*
* Ok, the AGFL should be ready to go now. Roll the transaction to
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 3b38f4e2a537..279af72b1671 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -24,10 +24,19 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
return xchk_setup_ag_btree(sc, false);
}
/* Free space btree scrubber. */
+
+struct xchk_alloc {
+ /* Previous free space extent. */
+ struct xfs_alloc_rec_incore prev;
+};
+
/*
* Ensure there's a corresponding cntbt/bnobt record matching this
* bnobt/cntbt record, respectively.
@@ -75,9 +84,11 @@ xchk_allocbt_xref_other(
STATIC void
xchk_allocbt_xref(
struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- xfs_extlen_t len)
+ const struct xfs_alloc_rec_incore *irec)
{
+ xfs_agblock_t agbno = irec->ar_startblock;
+ xfs_extlen_t len = irec->ar_blockcount;
+
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
@@ -85,25 +96,44 @@ xchk_allocbt_xref(
xchk_xref_is_not_inode_chunk(sc, agbno, len);
xchk_xref_has_no_owner(sc, agbno, len);
xchk_xref_is_not_shared(sc, agbno, len);
+ xchk_xref_is_not_cow_staging(sc, agbno, len);
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_allocbt_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_alloc *ca,
+ const struct xfs_alloc_rec_incore *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (ca->prev.ar_blockcount > 0 &&
+ ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock &&
+ ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&ca->prev, irec, sizeof(*irec));
}
/* Scrub a bnobt/cntbt record. */
STATIC int
xchk_allocbt_rec(
- struct xchk_btree *bs,
- const union xfs_btree_rec *rec)
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
{
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
- xfs_agblock_t bno;
- xfs_extlen_t len;
+ struct xfs_alloc_rec_incore irec;
+ struct xchk_alloc *ca = bs->private;
- bno = be32_to_cpu(rec->alloc.ar_startblock);
- len = be32_to_cpu(rec->alloc.ar_blockcount);
-
- if (!xfs_verify_agbext(pag, bno, len))
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
- xchk_allocbt_xref(bs->sc, bno, len);
+ xchk_allocbt_mergeable(bs, ca, &irec);
+ xchk_allocbt_xref(bs->sc, &irec);
return 0;
}
@@ -114,10 +144,11 @@ xchk_allocbt(
struct xfs_scrub *sc,
xfs_btnum_t which)
{
+ struct xchk_alloc ca = { };
struct xfs_btree_cur *cur;
cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
int
@@ -141,15 +172,15 @@ xchk_xref_is_used_space(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- bool is_freesp;
+ enum xbtree_recpacking outcome;
int error;
if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
+ error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
return;
- if (is_freesp)
+ if (outcome != XBTREE_RECPACKING_EMPTY)
xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0);
}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 31529b9bf389..6c16d9530cca 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -15,11 +15,51 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_attr_sf.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/attr.h"
+/* Free the buffers linked from the xattr buffer. */
+static void
+xchk_xattr_buf_cleanup(
+ void *priv)
+{
+ struct xchk_xattr_buf *ab = priv;
+
+ kvfree(ab->freemap);
+ ab->freemap = NULL;
+ kvfree(ab->usedmap);
+ ab->usedmap = NULL;
+ kvfree(ab->value);
+ ab->value = NULL;
+ ab->value_sz = 0;
+}
+
+/*
+ * Allocate the free space bitmap if we're trying harder, if there are leaf
+ * blocks in the attr fork, or if we can't tell whether there are leaf blocks.
+ */
+static inline bool
+xchk_xattr_want_freemap(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+
+ if (sc->flags & XCHK_TRY_HARDER)
+ return true;
+
+ if (!sc->ip)
+ return true;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+ if (!ifp)
+ return false;
+
+ return xfs_ifork_has_extents(ifp);
+}
+
/*
* Allocate enough memory to hold an attr value and attr block bitmaps,
* reallocating the buffer if necessary. Buffer contents are not preserved
@@ -28,41 +68,49 @@
static int
xchk_setup_xattr_buf(
struct xfs_scrub *sc,
- size_t value_size,
- gfp_t flags)
+ size_t value_size)
{
- size_t sz;
+ size_t bmp_sz;
struct xchk_xattr_buf *ab = sc->buf;
+ void *new_val;
- /*
- * We need enough space to read an xattr value from the file or enough
- * space to hold three copies of the xattr free space bitmap. We don't
- * need the buffer space for both purposes at the same time.
- */
- sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
- sz = max_t(size_t, sz, value_size);
+ bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
- /*
- * If there's already a buffer, figure out if we need to reallocate it
- * to accommodate a larger size.
- */
- if (ab) {
- if (sz <= ab->sz)
- return 0;
- kvfree(ab);
- sc->buf = NULL;
- }
+ if (ab)
+ goto resize_value;
- /*
- * Don't zero the buffer upon allocation to avoid runtime overhead.
- * All users must be careful never to read uninitialized contents.
- */
- ab = kvmalloc(sizeof(*ab) + sz, flags);
+ ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS);
if (!ab)
return -ENOMEM;
-
- ab->sz = sz;
sc->buf = ab;
+ sc->buf_cleanup = xchk_xattr_buf_cleanup;
+
+ ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS);
+ if (!ab->usedmap)
+ return -ENOMEM;
+
+ if (xchk_xattr_want_freemap(sc)) {
+ ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS);
+ if (!ab->freemap)
+ return -ENOMEM;
+ }
+
+resize_value:
+ if (ab->value_sz >= value_size)
+ return 0;
+
+ if (ab->value) {
+ kvfree(ab->value);
+ ab->value = NULL;
+ ab->value_sz = 0;
+ }
+
+ new_val = kvmalloc(value_size, XCHK_GFP_FLAGS);
+ if (!new_val)
+ return -ENOMEM;
+
+ ab->value = new_val;
+ ab->value_sz = value_size;
return 0;
}
@@ -79,8 +127,7 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX,
- XCHK_GFP_FLAGS);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX);
if (error)
return error;
}
@@ -111,11 +158,24 @@ xchk_xattr_listent(
int namelen,
int valuelen)
{
+ struct xfs_da_args args = {
+ .op_flags = XFS_DA_OP_NOTIME,
+ .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK,
+ .geo = context->dp->i_mount->m_attr_geo,
+ .whichfork = XFS_ATTR_FORK,
+ .dp = context->dp,
+ .name = name,
+ .namelen = namelen,
+ .hashval = xfs_da_hashname(name, namelen),
+ .trans = context->tp,
+ .valuelen = valuelen,
+ };
+ struct xchk_xattr_buf *ab;
struct xchk_xattr *sx;
- struct xfs_da_args args = { NULL };
int error = 0;
sx = container_of(context, struct xchk_xattr, context);
+ ab = sx->sc->buf;
if (xchk_should_terminate(sx->sc, &error)) {
context->seen_enough = error;
@@ -128,18 +188,32 @@ xchk_xattr_listent(
return;
}
+ /* Only one namespace bit allowed. */
+ if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ goto fail_xref;
+ }
+
/* Does this name make sense? */
if (!xfs_attr_namecheck(name, namelen)) {
xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
- return;
+ goto fail_xref;
}
/*
+ * Local xattr values are stored in the attr leaf block, so we don't
+ * need to retrieve the value from a remote block to detect corruption
+ * problems.
+ */
+ if (flags & XFS_ATTR_LOCAL)
+ goto fail_xref;
+
+ /*
	 * Try to allocate enough memory to extract the attr value. If that
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -147,17 +221,7 @@ xchk_xattr_listent(
return;
}
- args.op_flags = XFS_DA_OP_NOTIME;
- args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
- args.geo = context->dp->i_mount->m_attr_geo;
- args.whichfork = XFS_ATTR_FORK;
- args.dp = context->dp;
- args.name = name;
- args.namelen = namelen;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.trans = context->tp;
- args.value = xchk_xattr_valuebuf(sx->sc);
- args.valuelen = valuelen;
+ args.value = ab->value;
error = xfs_attr_get_ilocked(&args);
/* ENODATA means the hash lookup failed and the attr is bad */
@@ -213,25 +277,23 @@ xchk_xattr_set_map(
STATIC bool
xchk_xattr_check_freemap(
struct xfs_scrub *sc,
- unsigned long *map,
struct xfs_attr3_icleaf_hdr *leafhdr)
{
- unsigned long *freemap = xchk_xattr_freemap(sc);
- unsigned long *dstmap = xchk_xattr_dstmap(sc);
+ struct xchk_xattr_buf *ab = sc->buf;
unsigned int mapsize = sc->mp->m_attr_geo->blksize;
int i;
/* Construct bitmap of freemap contents. */
- bitmap_zero(freemap, mapsize);
+ bitmap_zero(ab->freemap, mapsize);
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
- if (!xchk_xattr_set_map(sc, freemap,
+ if (!xchk_xattr_set_map(sc, ab->freemap,
leafhdr->freemap[i].base,
leafhdr->freemap[i].size))
return false;
}
/* Look for bits that are set in freemap and are marked in use. */
- return bitmap_and(dstmap, freemap, map, mapsize) == 0;
+ return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize);
}
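The rewritten check no longer needs a scratch dstmap because it only asks whether any bit is set in both the used map and the free map. A minimal userspace sketch of that overlap test, assuming plain 64-bit words instead of the kernel bitmap helpers:

#include <stdbool.h>
#include <stdio.h>

#define NBITS	128
#define NWORDS	(NBITS / 64)

/* Set bits [start, start + len) in a small word-array bitmap. */
static void set_range(unsigned long long *map, unsigned int start,
		      unsigned int len)
{
	for (unsigned int i = start; i < start + len; i++)
		map[i / 64] |= 1ULL << (i % 64);
}

/* Equivalent of !bitmap_intersects(): no bit may be set in both maps. */
static bool maps_disjoint(const unsigned long long *a,
			  const unsigned long long *b)
{
	for (unsigned int w = 0; w < NWORDS; w++)
		if (a[w] & b[w])
			return false;
	return true;
}

int main(void)
{
	unsigned long long used[NWORDS] = { 0 };
	unsigned long long freemap[NWORDS] = { 0 };

	set_range(used, 0, 40);		/* header and names marked in use */
	set_range(freemap, 64, 32);	/* free region from the leaf header */
	printf("freemap ok: %d\n", maps_disjoint(freemap, used));	/* 1 */

	set_range(freemap, 30, 8);	/* free region overlaps used space */
	printf("freemap ok: %d\n", maps_disjoint(freemap, used));	/* 0 */
	return 0;
}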
/*
@@ -251,7 +313,7 @@ xchk_xattr_entry(
__u32 *last_hashval)
{
struct xfs_mount *mp = ds->state->mp;
- unsigned long *usedmap = xchk_xattr_usedmap(ds->sc);
+ struct xchk_xattr_buf *ab = ds->sc->buf;
char *name_end;
struct xfs_attr_leaf_name_local *lentry;
struct xfs_attr_leaf_name_remote *rentry;
@@ -291,7 +353,7 @@ xchk_xattr_entry(
if (name_end > buf_end)
xchk_da_set_corrupt(ds, level);
- if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize))
+ if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize))
xchk_da_set_corrupt(ds, level);
if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
*usedbytes += namesize;
@@ -311,35 +373,26 @@ xchk_xattr_block(
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *ent;
struct xfs_attr_leaf_entry *entries;
- unsigned long *usedmap;
+ struct xchk_xattr_buf *ab = ds->sc->buf;
char *buf_end;
size_t off;
__u32 last_hashval = 0;
unsigned int usedbytes = 0;
unsigned int hdrsize;
int i;
- int error;
if (*last_checked == blk->blkno)
return 0;
- /* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS);
- if (error == -ENOMEM)
- return -EDEADLOCK;
- if (error)
- return error;
- usedmap = xchk_xattr_usedmap(ds->sc);
-
*last_checked = blk->blkno;
- bitmap_zero(usedmap, mp->m_attr_geo->blksize);
+ bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize);
/* Check all the padding. */
if (xfs_has_crc(ds->sc->mp)) {
- struct xfs_attr3_leafblock *leaf = bp->b_addr;
+ struct xfs_attr3_leafblock *leaf3 = bp->b_addr;
- if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 ||
- leaf->hdr.info.hdr.pad != 0)
+ if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 ||
+ leaf3->hdr.info.hdr.pad != 0)
xchk_da_set_corrupt(ds, level);
} else {
if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0)
@@ -356,7 +409,7 @@ xchk_xattr_block(
xchk_da_set_corrupt(ds, level);
if (leafhdr.firstused < hdrsize)
xchk_da_set_corrupt(ds, level);
- if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize))
+ if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize))
xchk_da_set_corrupt(ds, level);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -370,7 +423,7 @@ xchk_xattr_block(
for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) {
/* Mark the leaf entry itself. */
off = (char *)ent - (char *)leaf;
- if (!xchk_xattr_set_map(ds->sc, usedmap, off,
+ if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off,
sizeof(xfs_attr_leaf_entry_t))) {
xchk_da_set_corrupt(ds, level);
goto out;
@@ -384,7 +437,7 @@ xchk_xattr_block(
goto out;
}
- if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr))
+ if (!xchk_xattr_check_freemap(ds->sc, &leafhdr))
xchk_da_set_corrupt(ds, level);
if (leafhdr.usedbytes != usedbytes)
@@ -468,38 +521,115 @@ out:
return error;
}
+/* Check space usage of shortform attrs. */
+STATIC int
+xchk_xattr_check_sf(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_attr_sf_entry *next;
+ struct xfs_ifork *ifp;
+ unsigned char *end;
+ int i;
+ int error = 0;
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+
+ bitmap_zero(ab->usedmap, ifp->if_bytes);
+ sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data;
+ end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+
+ sfe = &sf->list[0];
+ if ((unsigned char *)sfe > end) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ return 0;
+ }
+
+ for (i = 0; i < sf->hdr.count; i++) {
+ unsigned char *name = sfe->nameval;
+ unsigned char *value = &sfe->nameval[sfe->namelen];
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ next = xfs_attr_sf_nextentry(sfe);
+ if ((unsigned char *)next > end) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)sfe - (char *)sf,
+ sizeof(struct xfs_attr_sf_entry))) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)name - (char *)sf,
+ sfe->namelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
+ if (!xchk_xattr_set_map(sc, ab->usedmap,
+ (char *)value - (char *)sf,
+ sfe->valuelen)) {
+ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+ break;
+ }
+
+ sfe = next;
+ }
+
+ return 0;
+}
+
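The walk above computes the next entry pointer before trusting anything inside the current entry, so a corrupt length cannot send the scan past the end of the fork data. A self-contained sketch of that bounds-checked walk over packed variable-length records, with a record layout invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented packed record: name bytes followed by value bytes. */
struct rec {
	uint8_t	namelen;
	uint8_t	valuelen;
	uint8_t	nameval[];
};

static void walk(const uint8_t *buf, size_t size)
{
	const uint8_t *end = buf + size;
	const uint8_t *p = buf;

	while (p + sizeof(struct rec) <= end) {
		const struct rec *r = (const struct rec *)p;
		const uint8_t *next = r->nameval + r->namelen + r->valuelen;

		/* Check the next pointer before using the current record. */
		if (next > end) {
			printf("corrupt: record overruns the buffer\n");
			return;
		}
		printf("name %.*s valuelen %u\n", r->namelen,
		       (const char *)r->nameval, r->valuelen);
		p = next;
	}
}

int main(void)
{
	uint8_t buf[16];

	buf[0] = 2;			/* namelen */
	buf[1] = 3;			/* valuelen */
	memcpy(&buf[2], "io", 2);
	memset(&buf[4], 0xaa, 3);

	walk(buf, 7);			/* one record, exactly filling 7 bytes */
	return 0;
}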
/* Scrub the extended attribute metadata. */
int
xchk_xattr(
struct xfs_scrub *sc)
{
- struct xchk_xattr sx;
+ struct xchk_xattr sx = {
+ .sc = sc,
+ .context = {
+ .dp = sc->ip,
+ .tp = sc->tp,
+ .resynch = 1,
+ .put_listent = xchk_xattr_listent,
+ .allow_incomplete = true,
+ },
+ };
xfs_dablk_t last_checked = -1U;
int error = 0;
if (!xfs_inode_hasattr(sc->ip))
return -ENOENT;
- memset(&sx, 0, sizeof(sx));
- /* Check attribute tree structure */
- error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec,
- &last_checked);
+ /* Allocate memory for xattr checking. */
+ error = xchk_setup_xattr_buf(sc, 0);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
if (error)
- goto out;
+ return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ /* Check the physical structure of the xattr. */
+ if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL)
+ error = xchk_xattr_check_sf(sc);
+ else
+ error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec,
+ &last_checked);
+ if (error)
+ return error;
- /* Check that every attr key can also be looked up by hash. */
- sx.context.dp = sc->ip;
- sx.context.resynch = 1;
- sx.context.put_listent = xchk_xattr_listent;
- sx.context.tp = sc->tp;
- sx.context.allow_incomplete = true;
- sx.sc = sc;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
/*
- * Look up every xattr in this file by name.
+ * Look up every xattr in this file by name and hash.
*
* Use the backend implementation of xfs_attr_list to call
* xchk_xattr_listent on every attribute key in this inode.
@@ -516,11 +646,11 @@ xchk_xattr(
*/
error = xfs_attr_list_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
- goto out;
+ return error;
/* Did our listent function try to return any errors? */
if (sx.context.seen_enough < 0)
- error = sx.context.seen_enough;
-out:
- return error;
+ return sx.context.seen_enough;
+
+ return 0;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 3590e10e3e62..48fd9402c432 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * Copyright (C) 2019 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_ATTR_H__
#define __XFS_SCRUB_ATTR_H__
@@ -10,59 +10,15 @@
* Temporary storage for online scrub and repair of extended attributes.
*/
struct xchk_xattr_buf {
- /* Size of @buf, in bytes. */
- size_t sz;
+ /* Bitmap of used space in xattr leaf blocks and shortform forks. */
+ unsigned long *usedmap;
- /*
- * Memory buffer -- either used for extracting attr values while
- * walking the attributes; or for computing attr block bitmaps when
- * checking the attribute tree.
- *
- * Each bitmap contains enough bits to track every byte in an attr
- * block (rounded up to the size of an unsigned long). The attr block
- * used space bitmap starts at the beginning of the buffer; the free
- * space bitmap follows immediately after; and we have a third buffer
- * for storing intermediate bitmap results.
- */
- uint8_t buf[];
-};
-
-/* A place to store attribute values. */
-static inline uint8_t *
-xchk_xattr_valuebuf(
- struct xfs_scrub *sc)
-{
- struct xchk_xattr_buf *ab = sc->buf;
-
- return ab->buf;
-}
-
-/* A bitmap of space usage computed by walking an attr leaf block. */
-static inline unsigned long *
-xchk_xattr_usedmap(
- struct xfs_scrub *sc)
-{
- struct xchk_xattr_buf *ab = sc->buf;
+ /* Bitmap of free space in xattr leaf blocks. */
+ unsigned long *freemap;
- return (unsigned long *)ab->buf;
-}
-
-/* A bitmap of free space computed by walking attr leaf block free info. */
-static inline unsigned long *
-xchk_xattr_freemap(
- struct xfs_scrub *sc)
-{
- return xchk_xattr_usedmap(sc) +
- BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
-}
-
-/* A bitmap used to hold temporary results. */
-static inline unsigned long *
-xchk_xattr_dstmap(
- struct xfs_scrub *sc)
-{
- return xchk_xattr_freemap(sc) +
- BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
-}
+ /* Memory buffer used to extract xattr values. */
+ void *value;
+ size_t value_sz;
+};
#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index a255f09e9f0a..0c959be396ea 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -1,11 +1,12 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2018 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
@@ -13,27 +14,160 @@
#include "scrub/scrub.h"
#include "scrub/bitmap.h"
+#include <linux/interval_tree_generic.h>
+
+struct xbitmap_node {
+ struct rb_node bn_rbnode;
+
+ /* First set bit of this interval and subtree. */
+ uint64_t bn_start;
+
+ /* Last set bit of this interval. */
+ uint64_t bn_last;
+
+ /* Last set bit of this subtree. Do not touch this. */
+ uint64_t __bn_subtree_last;
+};
+
+/* Define our own interval tree type with uint64_t parameters. */
+
+#define START(node) ((node)->bn_start)
+#define LAST(node) ((node)->bn_last)
+
/*
- * Set a range of this bitmap. Caller must ensure the range is not set.
- *
- * This is the logical equivalent of bitmap |= mask(start, len).
+ * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
+ * forward-declare them anyway for clarity.
*/
+static inline void
+xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
+
+static inline void
+xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
+
+static inline struct xbitmap_node *
+xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
+ uint64_t last);
+
+static inline struct xbitmap_node *
+xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
+ uint64_t last);
+
+INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap_tree)
+
+/* Iterate each interval of a bitmap. Do not change the bitmap. */
+#define for_each_xbitmap_extent(bn, bitmap) \
+ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
+ struct xbitmap_node, bn_rbnode); \
+ (bn) != NULL; \
+ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
+ struct xbitmap_node, bn_rbnode))
+
+/* Clear a range of this bitmap. */
+int
+xbitmap_clear(
+ struct xbitmap *bitmap,
+ uint64_t start,
+ uint64_t len)
+{
+ struct xbitmap_node *bn;
+ struct xbitmap_node *new_bn;
+ uint64_t last = start + len - 1;
+
+ while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
+ if (bn->bn_start < start && bn->bn_last > last) {
+ uint64_t old_last = bn->bn_last;
+
+ /* overlaps with the entire clearing range */
+ xbitmap_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap_tree_insert(bn, &bitmap->xb_root);
+
+ /* add an extent */
+ new_bn = kmalloc(sizeof(struct xbitmap_node),
+ XCHK_GFP_FLAGS);
+ if (!new_bn)
+ return -ENOMEM;
+ new_bn->bn_start = last + 1;
+ new_bn->bn_last = old_last;
+ xbitmap_tree_insert(new_bn, &bitmap->xb_root);
+ } else if (bn->bn_start < start) {
+ /* overlaps with the left side of the clearing range */
+ xbitmap_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap_tree_insert(bn, &bitmap->xb_root);
+ } else if (bn->bn_last > last) {
+ /* overlaps with the right side of the clearing range */
+ xbitmap_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_start = last + 1;
+ xbitmap_tree_insert(bn, &bitmap->xb_root);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ xbitmap_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+ }
+
+ return 0;
+}
+
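A userspace sketch of the four clearing cases handled above, reduced to a single set interval; the struct is illustrative only. Clearing the middle of an interval splits it, clearing either end trims it, and a fully covered interval disappears.

#include <stdint.h>
#include <stdio.h>

struct ival {
	uint64_t	start;
	uint64_t	last;
};

/* Clear [start, last] out of one set interval; return surviving pieces. */
static int clear_from_interval(struct ival in, uint64_t start, uint64_t last,
			       struct ival *out)
{
	int n = 0;

	if (in.last < start || in.start > last) {
		out[n++] = in;		/* no overlap at all */
		return n;
	}
	if (in.start < start)		/* piece left of the clearing range */
		out[n++] = (struct ival){ in.start, start - 1 };
	if (in.last > last)		/* piece right of the clearing range */
		out[n++] = (struct ival){ last + 1, in.last };
	return n;			/* 0 means the interval was deleted */
}

int main(void)
{
	struct ival out[2];
	int n = clear_from_interval((struct ival){ 10, 40 }, 20, 29, out);

	for (int i = 0; i < n; i++)
		printf("[%llu, %llu]\n", (unsigned long long)out[i].start,
		       (unsigned long long)out[i].last);
	/* prints [10, 19] and [30, 40]: the middle of the run was split out */
	return 0;
}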
+/* Set a range of this bitmap. */
int
xbitmap_set(
struct xbitmap *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_range *bmr;
+ struct xbitmap_node *left;
+ struct xbitmap_node *right;
+ uint64_t last = start + len - 1;
+ int error;
- bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS);
- if (!bmr)
- return -ENOMEM;
+ /* Is this whole range already set? */
+ left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ if (left && left->bn_start <= start && left->bn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ error = xbitmap_clear(bitmap, start, len);
+ if (error)
+ return error;
+
+ /* Do we have a left-adjacent extent? */
+ left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ ASSERT(!left || left->bn_last + 1 == start);
+
+ /* Do we have a right-adjacent extent? */
+ right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ ASSERT(!right || right->bn_start == last + 1);
- INIT_LIST_HEAD(&bmr->list);
- bmr->start = start;
- bmr->len = len;
- list_add_tail(&bmr->list, &bitmap->list);
+ if (left && right) {
+ /* combine left and right adjacent extent */
+ xbitmap_tree_remove(left, &bitmap->xb_root);
+ xbitmap_tree_remove(right, &bitmap->xb_root);
+ left->bn_last = right->bn_last;
+ xbitmap_tree_insert(left, &bitmap->xb_root);
+ kfree(right);
+ } else if (left) {
+ /* combine with left extent */
+ xbitmap_tree_remove(left, &bitmap->xb_root);
+ left->bn_last = last;
+ xbitmap_tree_insert(left, &bitmap->xb_root);
+ } else if (right) {
+ /* combine with right extent */
+ xbitmap_tree_remove(right, &bitmap->xb_root);
+ right->bn_start = start;
+ xbitmap_tree_insert(right, &bitmap->xb_root);
+ } else {
+ /* add an extent */
+ left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
+ if (!left)
+ return -ENOMEM;
+ left->bn_start = start;
+ left->bn_last = last;
+ xbitmap_tree_insert(left, &bitmap->xb_root);
+ }
return 0;
}
@@ -43,12 +177,11 @@ void
xbitmap_destroy(
struct xbitmap *bitmap)
{
- struct xbitmap_range *bmr;
- struct xbitmap_range *n;
+ struct xbitmap_node *bn;
- for_each_xbitmap_extent(bmr, n, bitmap) {
- list_del(&bmr->list);
- kfree(bmr);
+ while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
+ xbitmap_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
}
}
@@ -57,27 +190,7 @@ void
xbitmap_init(
struct xbitmap *bitmap)
{
- INIT_LIST_HEAD(&bitmap->list);
-}
-
-/* Compare two btree extents. */
-static int
-xbitmap_range_cmp(
- void *priv,
- const struct list_head *a,
- const struct list_head *b)
-{
- struct xbitmap_range *ap;
- struct xbitmap_range *bp;
-
- ap = container_of(a, struct xbitmap_range, list);
- bp = container_of(b, struct xbitmap_range, list);
-
- if (ap->start > bp->start)
- return 1;
- if (ap->start < bp->start)
- return -1;
- return 0;
+ bitmap->xb_root = RB_ROOT_CACHED;
}
/*
@@ -94,118 +207,26 @@ xbitmap_range_cmp(
*
* This is the logical equivalent of bitmap &= ~sub.
*/
-#define LEFT_ALIGNED (1 << 0)
-#define RIGHT_ALIGNED (1 << 1)
int
xbitmap_disunion(
struct xbitmap *bitmap,
struct xbitmap *sub)
{
- struct list_head *lp;
- struct xbitmap_range *br;
- struct xbitmap_range *new_br;
- struct xbitmap_range *sub_br;
- uint64_t sub_start;
- uint64_t sub_len;
- int state;
- int error = 0;
+ struct xbitmap_node *bn;
+ int error;
- if (list_empty(&bitmap->list) || list_empty(&sub->list))
+ if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
return 0;
- ASSERT(!list_empty(&sub->list));
-
- list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
- list_sort(NULL, &sub->list, xbitmap_range_cmp);
-
- /*
- * Now that we've sorted both lists, we iterate bitmap once, rolling
- * forward through sub and/or bitmap as necessary until we find an
- * overlap or reach the end of either list. We do not reset lp to the
- * head of bitmap nor do we reset sub_br to the head of sub. The
- * list traversal is similar to merge sort, but we're deleting
- * instead. In this manner we avoid O(n^2) operations.
- */
- sub_br = list_first_entry(&sub->list, struct xbitmap_range,
- list);
- lp = bitmap->list.next;
- while (lp != &bitmap->list) {
- br = list_entry(lp, struct xbitmap_range, list);
-
- /*
- * Advance sub_br and/or br until we find a pair that
- * intersect or we run out of extents.
- */
- while (sub_br->start + sub_br->len <= br->start) {
- if (list_is_last(&sub_br->list, &sub->list))
- goto out;
- sub_br = list_next_entry(sub_br, list);
- }
- if (sub_br->start >= br->start + br->len) {
- lp = lp->next;
- continue;
- }
- /* trim sub_br to fit the extent we have */
- sub_start = sub_br->start;
- sub_len = sub_br->len;
- if (sub_br->start < br->start) {
- sub_len -= br->start - sub_br->start;
- sub_start = br->start;
- }
- if (sub_len > br->len)
- sub_len = br->len;
-
- state = 0;
- if (sub_start == br->start)
- state |= LEFT_ALIGNED;
- if (sub_start + sub_len == br->start + br->len)
- state |= RIGHT_ALIGNED;
- switch (state) {
- case LEFT_ALIGNED:
- /* Coincides with only the left. */
- br->start += sub_len;
- br->len -= sub_len;
- break;
- case RIGHT_ALIGNED:
- /* Coincides with only the right. */
- br->len -= sub_len;
- lp = lp->next;
- break;
- case LEFT_ALIGNED | RIGHT_ALIGNED:
- /* Total overlap, just delete ex. */
- lp = lp->next;
- list_del(&br->list);
- kfree(br);
- break;
- case 0:
- /*
- * Deleting from the middle: add the new right extent
- * and then shrink the left extent.
- */
- new_br = kmalloc(sizeof(struct xbitmap_range),
- XCHK_GFP_FLAGS);
- if (!new_br) {
- error = -ENOMEM;
- goto out;
- }
- INIT_LIST_HEAD(&new_br->list);
- new_br->start = sub_start + sub_len;
- new_br->len = br->start + br->len - new_br->start;
- list_add(&new_br->list, &br->list);
- br->len = sub_start - br->start;
- lp = lp->next;
- break;
- default:
- ASSERT(0);
- break;
- }
+ for_each_xbitmap_extent(bn, sub) {
+ error = xbitmap_clear(bitmap, bn->bn_start,
+ bn->bn_last - bn->bn_start + 1);
+ if (error)
+ return error;
}
-out:
- return error;
+ return 0;
}
-#undef LEFT_ALIGNED
-#undef RIGHT_ALIGNED
/*
* Record all btree blocks seen while iterating all records of a btree.
@@ -242,6 +263,38 @@ out:
* For the 300th record we just exit, with the list being [1, 4, 2, 3].
*/
+/* Mark a btree block to the agblock bitmap. */
+STATIC int
+xagb_bitmap_visit_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+
+ return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+int
+xagb_bitmap_set_btblocks(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
/*
* Record all the buffers pointed to by the btree cursor. Callers already
* engaged in a btree walk should call this function to capture the list of
@@ -304,12 +357,97 @@ uint64_t
xbitmap_hweight(
struct xbitmap *bitmap)
{
- struct xbitmap_range *bmr;
- struct xbitmap_range *n;
+ struct xbitmap_node *bn;
uint64_t ret = 0;
- for_each_xbitmap_extent(bmr, n, bitmap)
- ret += bmr->len;
+ for_each_xbitmap_extent(bn, bitmap)
+ ret += bn->bn_last - bn->bn_start + 1;
return ret;
}
+
+/* Call a function for every run of set bits in this bitmap. */
+int
+xbitmap_walk(
+ struct xbitmap *bitmap,
+ xbitmap_walk_fn fn,
+ void *priv)
+{
+ struct xbitmap_node *bn;
+ int error = 0;
+
+ for_each_xbitmap_extent(bn, bitmap) {
+ error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+struct xbitmap_walk_bits {
+ xbitmap_walk_bits_fn fn;
+ void *priv;
+};
+
+/* Walk all the bits in a run. */
+static int
+xbitmap_walk_bits_in_run(
+ uint64_t start,
+ uint64_t len,
+ void *priv)
+{
+ struct xbitmap_walk_bits *wb = priv;
+ uint64_t i;
+ int error = 0;
+
+ for (i = start; i < start + len; i++) {
+ error = wb->fn(i, wb->priv);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/* Call a function for every set bit in this bitmap. */
+int
+xbitmap_walk_bits(
+ struct xbitmap *bitmap,
+ xbitmap_walk_bits_fn fn,
+ void *priv)
+{
+ struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv};
+
+ return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb);
+}
+
+/* Does this bitmap have no bits set at all? */
+bool
+xbitmap_empty(
+ struct xbitmap *bitmap)
+{
+ return bitmap->xb_root.rb_root.rb_node == NULL;
+}
+
+/* Is the start of the range set or clear? And for how long? */
+bool
+xbitmap_test(
+ struct xbitmap *bitmap,
+ uint64_t start,
+ uint64_t *len)
+{
+ struct xbitmap_node *bn;
+ uint64_t last = start + *len - 1;
+
+ bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ if (!bn)
+ return false;
+ if (bn->bn_start <= start) {
+ if (bn->bn_last < last)
+ *len = bn->bn_last - start + 1;
+ return true;
+ }
+ *len = bn->bn_start - start;
+ return false;
+}
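A userspace model of the xbitmap_test() contract against a sorted, non-overlapping interval array (illustrative types; the kernel version walks the interval tree): report whether the start of the queried range is set, and trim *len to the length of that run or of the gap before the next one.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ival {
	uint64_t	start;
	uint64_t	last;
};

static bool range_test(const struct ival *iv, int nr, uint64_t start,
		       uint64_t *len)
{
	uint64_t last = start + *len - 1;

	for (int i = 0; i < nr; i++) {
		if (iv[i].last < start)		/* entirely before the range */
			continue;
		if (iv[i].start > last)		/* entirely after the range */
			break;
		if (iv[i].start <= start) {	/* start is inside a run */
			if (iv[i].last < last)
				*len = iv[i].last - start + 1;
			return true;
		}
		*len = iv[i].start - start;	/* start is in a gap */
		return false;
	}
	return false;
}

int main(void)
{
	struct ival iv[] = { { 10, 19 }, { 30, 39 } };
	uint64_t len = 100;
	bool set = range_test(iv, 2, 15, &len);

	printf("set=%d len=%llu\n", set, (unsigned long long)len);
	/* set=1 len=5: bits 15..19 are set, then a gap begins at 20 */
	return 0;
}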
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 900646b72de1..84981724ecaf 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -1,31 +1,19 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2018 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xbitmap_range {
- struct list_head list;
- uint64_t start;
- uint64_t len;
-};
-
struct xbitmap {
- struct list_head list;
+ struct rb_root_cached xb_root;
};
void xbitmap_init(struct xbitmap *bitmap);
void xbitmap_destroy(struct xbitmap *bitmap);
-#define for_each_xbitmap_extent(bex, n, bitmap) \
- list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
-
-#define for_each_xbitmap_block(b, bex, n, bitmap) \
- list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
- for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
-
+int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
int xbitmap_set_btcur_path(struct xbitmap *bitmap,
@@ -34,4 +22,93 @@ int xbitmap_set_btblocks(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
uint64_t xbitmap_hweight(struct xbitmap *bitmap);
+/*
+ * Return codes for the bitmap iterator functions are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iteration caller. The special value -ECANCELED can be used to stop
+ * iteration, because neither bitmap iterator ever generates that error code on
+ * its own. Callers must not modify the bitmap while walking it.
+ */
+typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
+int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
+ void *priv);
+
+typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv);
+int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
+ void *priv);
+
+bool xbitmap_empty(struct xbitmap *bitmap);
+bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
+
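A small userspace model of the return-code convention described above: the walker stops as soon as a callback returns nonzero, and -ECANCELED conventionally means "stop, no error". The run array and walker below stand in for the xbitmap and are not the kernel implementation.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef int (*walk_fn)(uint64_t start, uint64_t len, void *priv);

struct run {
	uint64_t	start;
	uint64_t	len;
};

/* Call @fn for each run; any nonzero return ends the walk and is passed up. */
static int walk_runs(const struct run *runs, int nr, walk_fn fn, void *priv)
{
	for (int i = 0; i < nr; i++) {
		int ret = fn(runs[i].start, runs[i].len, priv);

		if (ret)
			return ret;
	}
	return 0;
}

static int find_bit(uint64_t start, uint64_t len, void *priv)
{
	uint64_t target = *(uint64_t *)priv;

	if (target >= start && target < start + len)
		return -ECANCELED;	/* found it, stop iterating */
	return 0;
}

int main(void)
{
	struct run runs[] = { { 5, 3 }, { 100, 10 }, { 500, 1 } };
	uint64_t target = 104;
	int ret = walk_runs(runs, 3, find_bit, &target);

	printf("found: %s\n", ret == -ECANCELED ? "yes" : "no");	/* yes */
	return 0;
}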
+/* Bitmaps, but type-checked for xfs_agblock_t */
+
+struct xagb_bitmap {
+ struct xbitmap agbitmap;
+};
+
+static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
+{
+ xbitmap_init(&bitmap->agbitmap);
+}
+
+static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
+{
+ xbitmap_destroy(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap_clear(&bitmap->agbitmap, start, len);
+}
+static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap_set(&bitmap->agbitmap, start, len);
+}
+
+static inline bool
+xagb_bitmap_test(
+ struct xagb_bitmap *bitmap,
+ xfs_agblock_t start,
+ xfs_extlen_t *len)
+{
+ uint64_t biglen = *len;
+ bool ret;
+
+ ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
+
+ if (start + biglen >= UINT_MAX) {
+ ASSERT(0);
+ biglen = UINT_MAX - start;
+ }
+
+ *len = biglen;
+ return ret;
+}
+
+static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
+ struct xagb_bitmap *sub)
+{
+ return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
+}
+
+static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
+{
+ return xbitmap_hweight(&bitmap->agbitmap);
+}
+static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
+{
+ return xbitmap_empty(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
+ xbitmap_walk_fn fn, void *priv)
+{
+ return xbitmap_walk(&bitmap->agbitmap, fn, priv);
+}
+
+int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index dbbc7037074c..87ab9f95a487 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -31,12 +31,15 @@ xchk_setup_inode_bmap(
{
int error;
- error = xchk_get_inode(sc);
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ error = xchk_iget_for_scrubbing(sc);
if (error)
goto out;
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
/*
* We don't want any ephemeral data fork updates sitting around
@@ -47,6 +50,9 @@ xchk_setup_inode_bmap(
sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+ sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+
inode_dio_wait(VFS_I(sc->ip));
/*
@@ -90,11 +96,23 @@ out:
struct xchk_bmap_info {
struct xfs_scrub *sc;
+
+ /* Incore extent tree cursor */
struct xfs_iext_cursor icur;
- xfs_fileoff_t lastoff;
+
+ /* Previous fork mapping that we examined */
+ struct xfs_bmbt_irec prev_rec;
+
+ /* Is this a realtime fork? */
bool is_rt;
+
+ /* May mappings point to shared space? */
bool is_shared;
+
+ /* Was the incore extent tree loaded? */
bool was_loaded;
+
+ /* Which inode fork are we checking? */
int whichfork;
};
@@ -147,49 +165,7 @@ xchk_bmap_get_rmap(
return has_rmap;
}
-static inline bool
-xchk_bmap_has_prev(
- struct xchk_bmap_info *info,
- struct xfs_bmbt_irec *irec)
-{
- struct xfs_bmbt_irec got;
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
-
- if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
- return false;
- if (got.br_startoff + got.br_blockcount != irec->br_startoff)
- return false;
- if (got.br_startblock + got.br_blockcount != irec->br_startblock)
- return false;
- if (got.br_state != irec->br_state)
- return false;
- return true;
-}
-
-static inline bool
-xchk_bmap_has_next(
- struct xchk_bmap_info *info,
- struct xfs_bmbt_irec *irec)
-{
- struct xfs_bmbt_irec got;
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
-
- if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
- return false;
- if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
- return false;
- if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
- return false;
- if (got.br_state != irec->br_state)
- return false;
- return true;
-}
-
-/* Make sure that we have rmapbt records for this extent. */
+/* Make sure that we have rmapbt records for this data/attr fork extent. */
STATIC void
xchk_bmap_xref_rmap(
struct xchk_bmap_info *info,
@@ -198,41 +174,39 @@ xchk_bmap_xref_rmap(
{
struct xfs_rmap_irec rmap;
unsigned long long rmap_end;
- uint64_t owner;
+ uint64_t owner = info->sc->ip->i_ino;
if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
return;
- if (info->whichfork == XFS_COW_FORK)
- owner = XFS_RMAP_OWN_COW;
- else
- owner = info->sc->ip->i_ino;
-
/* Find the rmap record for this irec. */
if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
return;
- /* Check the rmap. */
+ /*
+ * The rmap must be an exact match for this incore file mapping record,
+ * which may have arisen from multiple ondisk records.
+ */
+ if (rmap.rm_startblock != agbno)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (rmap.rm_startblock > agbno ||
- agbno + irec->br_blockcount > rmap_end)
+ if (rmap_end != agbno + irec->br_blockcount)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- /*
- * Check the logical offsets if applicable. CoW staging extents
- * don't track logical offsets since the mappings only exist in
- * memory.
- */
- if (info->whichfork != XFS_COW_FORK) {
- rmap_end = (unsigned long long)rmap.rm_offset +
- rmap.rm_blockcount;
- if (rmap.rm_offset > irec->br_startoff ||
- irec->br_startoff + irec->br_blockcount > rmap_end)
- xchk_fblock_xref_set_corrupt(info->sc,
- info->whichfork, irec->br_startoff);
- }
+ /* Check the logical offsets. */
+ if (rmap.rm_offset != irec->br_startoff)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount;
+ if (rmap_end != irec->br_startoff + irec->br_blockcount)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Check the owner */
if (rmap.rm_owner != owner)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -244,8 +218,7 @@ xchk_bmap_xref_rmap(
* records because the blocks are owned (on-disk) by the refcountbt,
* which doesn't track unwritten state.
*/
- if (owner != XFS_RMAP_OWN_COW &&
- !!(irec->br_state == XFS_EXT_UNWRITTEN) !=
+ if (!!(irec->br_state == XFS_EXT_UNWRITTEN) !=
!!(rmap.rm_flags & XFS_RMAP_UNWRITTEN))
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -257,34 +230,60 @@ xchk_bmap_xref_rmap(
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+}
+
+/* Make sure that we have rmapbt records for this COW fork extent. */
+STATIC void
+xchk_bmap_xref_rmap_cow(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec,
+ xfs_agblock_t agbno)
+{
+ struct xfs_rmap_irec rmap;
+ unsigned long long rmap_end;
+ uint64_t owner = XFS_RMAP_OWN_COW;
+
+ if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm))
+ return;
+
+ /* Find the rmap record for this irec. */
+ if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap))
+ return;
/*
- * If the rmap starts before this bmbt record, make sure there's a bmbt
- * record for the previous offset that is contiguous with this mapping.
- * Skip this for CoW fork extents because the refcount btree (and not
- * the inode) is the ondisk owner for those extents.
+ * CoW staging extents are owned by the refcount btree, so the rmap
+ * can start before and end after the physical space allocated to this
+ * mapping. There are no offsets to check.
*/
- if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
- !xchk_bmap_has_prev(info, irec)) {
+ if (rmap.rm_startblock > agbno)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (rmap_end < agbno + irec->br_blockcount)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Check the owner */
+ if (rmap.rm_owner != owner)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- return;
- }
/*
- * If the rmap ends after this bmbt record, make sure there's a bmbt
- * record for the next offset that is contiguous with this mapping.
- * Skip this for CoW fork extents because the refcount btree (and not
- * the inode) is the ondisk owner for those extents.
+ * No flags allowed. Note that the (in-memory) CoW fork distinguishes
+ * between unwritten and written extents, but we don't track that in
+ * the rmap records because the blocks are owned (on-disk) by the
+ * refcountbt, which doesn't track unwritten state.
*/
- rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
- if (info->whichfork != XFS_COW_FORK &&
- rmap_end > agbno + irec->br_blockcount &&
- !xchk_bmap_has_next(info, irec)) {
+ if (rmap.rm_flags & XFS_RMAP_ATTR_FORK)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ if (rmap.rm_flags & XFS_RMAP_UNWRITTEN)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- return;
- }
}
/* Cross-reference a single rtdev extent record. */
@@ -305,6 +304,7 @@ xchk_bmap_iextent_xref(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
+ struct xfs_owner_info oinfo;
struct xfs_mount *mp = info->sc->mp;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -322,17 +322,35 @@ xchk_bmap_iextent_xref(
xchk_xref_is_used_space(info->sc, agbno, len);
xchk_xref_is_not_inode_chunk(info->sc, agbno, len);
- xchk_bmap_xref_rmap(info, irec, agbno);
switch (info->whichfork) {
case XFS_DATA_FORK:
- if (xfs_is_reflink_inode(info->sc->ip))
- break;
- fallthrough;
+ xchk_bmap_xref_rmap(info, irec, agbno);
+ if (!xfs_is_reflink_inode(info->sc->ip)) {
+ xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+ info->whichfork, irec->br_startoff);
+ xchk_xref_is_only_owned_by(info->sc, agbno,
+ irec->br_blockcount, &oinfo);
+ xchk_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
+ }
+ xchk_xref_is_not_cow_staging(info->sc, agbno,
+ irec->br_blockcount);
+ break;
case XFS_ATTR_FORK:
+ xchk_bmap_xref_rmap(info, irec, agbno);
+ xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino,
+ info->whichfork, irec->br_startoff);
+ xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
+ &oinfo);
xchk_xref_is_not_shared(info->sc, agbno,
irec->br_blockcount);
+ xchk_xref_is_not_cow_staging(info->sc, agbno,
+ irec->br_blockcount);
break;
case XFS_COW_FORK:
+ xchk_bmap_xref_rmap_cow(info, irec, agbno);
+ xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount,
+ &XFS_RMAP_OINFO_COW);
xchk_xref_is_cow_staging(info->sc, agbno,
irec->br_blockcount);
xchk_xref_is_not_shared(info->sc, agbno,
@@ -382,7 +400,8 @@ xchk_bmap_iextent(
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
*/
- if (irec->br_startoff < info->lastoff)
+ if (irec->br_startoff < info->prev_rec.br_startoff +
+ info->prev_rec.br_blockcount)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -392,15 +411,7 @@ xchk_bmap_iextent(
xchk_bmap_dirattr_extent(ip, info, irec);
- /* There should never be a "hole" extent in either extent list. */
- if (irec->br_startblock == HOLESTARTBLOCK)
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
-
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
if (info->is_rt &&
!xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -468,6 +479,12 @@ xchk_bmapbt_rec(
return 0;
xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) {
+ xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+ irec.br_startoff);
+ return 0;
+ }
+
if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
&iext_irec) ||
irec.br_startoff != iext_irec.br_startoff ||
@@ -618,45 +635,57 @@ xchk_bmap_check_ag_rmaps(
return error;
}
-/* Make sure each rmap has a corresponding bmbt entry. */
-STATIC int
-xchk_bmap_check_rmaps(
- struct xfs_scrub *sc,
- int whichfork)
+/*
+ * Decide if we want to walk every rmap btree in the fs to make sure that each
+ * rmap for this file fork has corresponding bmbt entries.
+ */
+static bool
+xchk_bmap_want_check_rmaps(
+ struct xchk_bmap_info *info)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
- bool zero_size;
- int error;
+ struct xfs_scrub *sc = info->sc;
+ struct xfs_ifork *ifp;
- if (!xfs_has_rmapbt(sc->mp) ||
- whichfork == XFS_COW_FORK ||
- (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- return 0;
+ if (!xfs_has_rmapbt(sc->mp))
+ return false;
+ if (info->whichfork == XFS_COW_FORK)
+ return false;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return false;
/* Don't support realtime rmap checks yet. */
- if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
- return 0;
-
- ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
+ if (info->is_rt)
+ return false;
/*
- * Only do this for complex maps that are in btree format, or for
- * situations where we would seem to have a size but zero extents.
- * The inode repair code can zap broken iforks, which means we have
- * to flag this bmap as corrupt if there are rmaps that need to be
- * reattached.
+ * The inode repair code zaps broken inode forks by resetting them back
+ * to EXTENTS format and zero extent records. If we encounter a fork
+ * in this state along with evidence that the fork isn't supposed to be
+ * empty, we need to scan the reverse mappings to decide if we're going
+ * to rebuild the fork. Data forks with nonzero file size are scanned.
+ * xattr forks are never empty of content, so they are always scanned.
*/
+ ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
+ if (info->whichfork == XFS_DATA_FORK &&
+ i_size_read(VFS_I(sc->ip)) == 0)
+ return false;
- if (whichfork == XFS_DATA_FORK)
- zero_size = i_size_read(VFS_I(sc->ip)) == 0;
- else
- zero_size = false;
+ return true;
+ }
- if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
- (zero_size || ifp->if_nextents > 0))
- return 0;
+ return false;
+}
+
+/* Make sure each rmap has a corresponding bmbt entry. */
+STATIC int
+xchk_bmap_check_rmaps(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error;
for_each_perag(sc->mp, agno, pag) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
@@ -683,7 +712,8 @@ xchk_bmap_iextent_delalloc(
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
*/
- if (irec->br_startoff < info->lastoff)
+ if (irec->br_startoff < info->prev_rec.br_startoff +
+ info->prev_rec.br_blockcount)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -697,6 +727,101 @@ xchk_bmap_iextent_delalloc(
irec->br_startoff);
}
+/* Decide if this individual fork mapping is ok. */
+static bool
+xchk_bmap_iext_mapping(
+ struct xchk_bmap_info *info,
+ const struct xfs_bmbt_irec *irec)
+{
+ /* There should never be a "hole" extent in either extent list. */
+ if (irec->br_startblock == HOLESTARTBLOCK)
+ return false;
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+ return false;
+ return true;
+}
+
+/* Are these two mappings contiguous with each other? */
+static inline bool
+xchk_are_bmaps_contiguous(
+ const struct xfs_bmbt_irec *b1,
+ const struct xfs_bmbt_irec *b2)
+{
+ /* Don't try to combine unallocated mappings. */
+ if (!xfs_bmap_is_real_extent(b1))
+ return false;
+ if (!xfs_bmap_is_real_extent(b2))
+ return false;
+
+ /* Does b2 come right after b1 in the logical and physical range? */
+ if (b1->br_startoff + b1->br_blockcount != b2->br_startoff)
+ return false;
+ if (b1->br_startblock + b1->br_blockcount != b2->br_startblock)
+ return false;
+ if (b1->br_state != b2->br_state)
+ return false;
+ return true;
+}
+
+/*
+ * Walk the incore extent records, accumulating consecutive contiguous records
+ * into a single incore mapping. Returns true if @irec has been set to a
+ * mapping or false if there are no more mappings. Caller must ensure that
+ * @info.icur is zeroed before the first call.
+ */
+static bool
+xchk_bmap_iext_iter(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+ xfs_filblks_t prev_len;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ /* Advance to the next iextent record and check the mapping. */
+ xfs_iext_next(ifp, &info->icur);
+ if (!xfs_iext_get_extent(ifp, &info->icur, irec))
+ return false;
+
+ if (!xchk_bmap_iext_mapping(info, irec)) {
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return false;
+ }
+
+ /*
+ * Iterate subsequent iextent records and merge them with the one
+ * that we just read, if possible.
+ */
+ prev_len = irec->br_blockcount;
+ while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
+ if (!xchk_are_bmaps_contiguous(irec, &got))
+ break;
+
+ if (!xchk_bmap_iext_mapping(info, &got)) {
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ got.br_startoff);
+ return false;
+ }
+
+ /*
+ * Notify the user of mergeable records in the data or attr
+ * forks. CoW forks only exist in memory so we ignore them.
+ */
+ if (info->whichfork != XFS_COW_FORK &&
+ prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
+ xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
+
+ irec->br_blockcount += got.br_blockcount;
+ prev_len = got.br_blockcount;
+ xfs_iext_next(ifp, &info->icur);
+ }
+
+ return true;
+}
+
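A userspace sketch of the coalescing behaviour: consecutive records that are contiguous in both logical and physical space, and share the same state, are reported to the caller as one combined mapping. The field names are invented for the example and do not match struct xfs_bmbt_irec.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mapping {
	uint64_t	off;	/* logical offset */
	uint64_t	blk;	/* physical block */
	uint64_t	count;	/* length */
	int		state;	/* written/unwritten */
};

static bool contiguous(const struct mapping *a, const struct mapping *b)
{
	return a->off + a->count == b->off &&
	       a->blk + a->count == b->blk &&
	       a->state == b->state;
}

int main(void)
{
	struct mapping m[] = {
		{  0, 100, 16, 0 },
		{ 16, 116, 16, 0 },	/* merges with the previous record */
		{ 32, 500,  8, 0 },	/* physically discontiguous: new run */
	};
	int nr = 3;

	for (int i = 0; i < nr; ) {
		struct mapping run = m[i++];

		while (i < nr && contiguous(&run, &m[i]))
			run.count += m[i++].count;

		printf("off %llu blk %llu count %llu\n",
		       (unsigned long long)run.off,
		       (unsigned long long)run.blk,
		       (unsigned long long)run.count);
	}
	/* prints two runs: off 0 count 32, then off 32 count 8 */
	return 0;
}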
/*
* Scrub an inode fork's block mappings.
*
@@ -776,10 +901,15 @@ xchk_bmap(
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
goto out;
- /* Scrub extent records. */
- info.lastoff = 0;
- ifp = xfs_ifork_ptr(ip, whichfork);
- for_each_xfs_iext(ifp, &info.icur, &irec) {
+ /*
+ * Scrub extent records. We use a special iterator function here that
+ * combines adjacent mappings if they are logically and physically
+ * contiguous. For large allocations that require multiple bmbt
+ * records, this reduces the number of cross-referencing calls, which
+ * reduces runtime. Cross referencing with the rmap is simpler because
+ * the rmap must match the combined mapping exactly.
+ */
+ while (xchk_bmap_iext_iter(&info, &irec)) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
goto out;
@@ -794,12 +924,14 @@ xchk_bmap(
xchk_bmap_iextent_delalloc(ip, &info, &irec);
else
xchk_bmap_iextent(ip, &info, &irec);
- info.lastoff = irec.br_startoff + irec.br_blockcount;
+ memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec));
}
- error = xchk_bmap_check_rmaps(sc, whichfork);
- if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
- goto out;
+ if (xchk_bmap_want_check_rmaps(&info)) {
+ error = xchk_bmap_check_rmaps(sc, whichfork);
+ if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
out:
return error;
}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 0fd36d5b4646..1935b9ce1885 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -36,6 +36,7 @@ __xchk_btree_process_error(
switch (*error) {
case -EDEADLOCK:
+ case -ECHRNG:
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
@@ -118,6 +119,16 @@ xchk_btree_xref_set_corrupt(
__return_address);
}
+void
+xchk_btree_set_preen(
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
+ __return_address);
+}
+
/*
* Make sure this record is in order and doesn't stray outside of the parent
* keys.
@@ -140,29 +151,30 @@ xchk_btree_rec(
trace_xchk_btree_rec(bs->sc, cur, 0);
- /* If this isn't the first record, are they in order? */
- if (cur->bc_levels[0].ptr > 1 &&
+ /* Are all records across all record blocks in order? */
+ if (bs->lastrec_valid &&
!cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
xchk_btree_set_corrupt(bs->sc, cur, 0);
memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
+ bs->lastrec_valid = true;
if (cur->bc_nlevels == 1)
return;
- /* Is this at least as large as the parent low key? */
+ /* Is low_key(rec) at least as large as the parent low key? */
cur->bc_ops->init_key_from_rec(&key, rec);
keyblock = xfs_btree_get_block(cur, 1, &bp);
keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
- if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
+ if (xfs_btree_keycmp_lt(cur, &key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, 1);
if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
return;
- /* Is this no larger than the parent high key? */
+ /* Is high_key(rec) no larger than the parent high key? */
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
- if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
+ if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
@@ -187,29 +199,30 @@ xchk_btree_key(
trace_xchk_btree_key(bs->sc, cur, level);
- /* If this isn't the first key, are they in order? */
- if (cur->bc_levels[level].ptr > 1 &&
- !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
+ /* Are all low keys across all node blocks in order? */
+ if (bs->lastkey[level - 1].valid &&
+ !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
xchk_btree_set_corrupt(bs->sc, cur, level);
- memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
+ memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
+ bs->lastkey[level - 1].valid = true;
if (level + 1 >= cur->bc_nlevels)
return;
- /* Is this at least as large as the parent low key? */
+ /* Is this block's low key at least as large as the parent low key? */
keyblock = xfs_btree_get_block(cur, level + 1, &bp);
keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
- if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
+ if (xfs_btree_keycmp_lt(cur, key, keyp))
xchk_btree_set_corrupt(bs->sc, cur, level);
if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
return;
- /* Is this no larger than the parent high key? */
+ /* Is this block's high key no larger than the parent high key? */
key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
keyblock);
- if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
+ if (xfs_btree_keycmp_lt(cur, keyp, key))
xchk_btree_set_corrupt(bs->sc, cur, level);
}
@@ -389,7 +402,7 @@ xchk_btree_check_block_owner(
if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
bs->cur = NULL;
- xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
+ xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
bs->cur = NULL;
@@ -519,6 +532,48 @@ xchk_btree_check_minrecs(
}
/*
+ * If this btree block has a parent, make sure that the parent's keys capture
+ * the keyspace contained in this block.
+ */
+STATIC void
+xchk_btree_block_check_keys(
+ struct xchk_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ union xfs_btree_key block_key;
+ union xfs_btree_key *block_high_key;
+ union xfs_btree_key *parent_low_key, *parent_high_key;
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *parent_block;
+ struct xfs_buf *bp;
+
+ if (level == cur->bc_nlevels - 1)
+ return;
+
+ xfs_btree_get_keys(cur, block, &block_key);
+
+ /* Make sure the low key of this block matches the parent. */
+ parent_block = xfs_btree_get_block(cur, level + 1, &bp);
+ parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
+ parent_block);
+ if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+ return;
+ }
+
+ if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+ return;
+
+ /* Make sure the high key of this block matches the parent. */
+ parent_high_key = xfs_btree_high_key_addr(cur,
+ cur->bc_levels[level + 1].ptr, parent_block);
+ block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
+ if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+}
+
+/*
* Grab and scrub a btree block given a btree pointer. Returns block
* and buffer pointers (if applicable) if they're ok to use.
*/
@@ -569,7 +624,12 @@ xchk_btree_get_block(
* Check the block's siblings; this function absorbs error codes
* for us.
*/
- return xchk_btree_block_check_siblings(bs, *pblock);
+ error = xchk_btree_block_check_siblings(bs, *pblock);
+ if (error)
+ return error;
+
+ xchk_btree_block_check_keys(bs, level, *pblock);
+ return 0;
}
/*
@@ -601,7 +661,7 @@ xchk_btree_block_keys(
parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
- if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
+ if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
xchk_btree_set_corrupt(bs->sc, cur, 1);
if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
@@ -612,7 +672,7 @@ xchk_btree_block_keys(
high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
parent_block);
- if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
+ if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
xchk_btree_set_corrupt(bs->sc, cur, 1);
}
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index da61a53a0b61..9d7b9ee8bef4 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_BTREE_H__
#define __XFS_SCRUB_BTREE_H__
@@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc,
/* Check for btree corruption. */
void xchk_btree_set_corrupt(struct xfs_scrub *sc,
struct xfs_btree_cur *cur, int level);
+void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
+ int level);
/* Check for btree xref discrepancies. */
void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,
@@ -29,6 +31,11 @@ typedef int (*xchk_btree_rec_fn)(
struct xchk_btree *bs,
const union xfs_btree_rec *rec);
+struct xchk_btree_key {
+ union xfs_btree_key key;
+ bool valid;
+};
+
struct xchk_btree {
/* caller-provided scrub state */
struct xfs_scrub *sc;
@@ -38,11 +45,12 @@ struct xchk_btree {
void *private;
/* internal scrub state */
+ bool lastrec_valid;
union xfs_btree_rec lastrec;
struct list_head to_check;
/* this element must come last! */
- union xfs_btree_key lastkey[];
+ struct xchk_btree_key lastkey[];
};
/*
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 848a8e32e56f..9aa79665c608 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -75,6 +75,7 @@ __xchk_process_error(
case 0:
return true;
case -EDEADLOCK:
+ case -ECHRNG:
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
@@ -130,6 +131,7 @@ __xchk_fblock_process_error(
case 0:
return true;
case -EDEADLOCK:
+ case -ECHRNG:
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
@@ -396,26 +398,19 @@ want_ag_read_header_failure(
}
/*
- * Grab the perag structure and all the headers for an AG.
+ * Grab the AG header buffers for the attached perag structure.
*
* The headers should be released by xchk_ag_free, but as a fail safe we attach
* all the buffers we grab to the scrub transaction so they'll all be freed
- * when we cancel it. Returns ENOENT if we can't grab the perag structure.
+ * when we cancel it.
*/
-int
-xchk_ag_read_headers(
+static inline int
+xchk_perag_read_headers(
struct xfs_scrub *sc,
- xfs_agnumber_t agno,
struct xchk_ag *sa)
{
- struct xfs_mount *mp = sc->mp;
int error;
- ASSERT(!sa->pag);
- sa->pag = xfs_perag_get(mp, agno);
- if (!sa->pag)
- return -ENOENT;
-
error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
@@ -427,6 +422,104 @@ xchk_ag_read_headers(
return 0;
}
+/*
+ * Grab the AG headers for the attached perag structure and wait for pending
+ * intents to drain.
+ */
+static int
+xchk_perag_drain_and_lock(
+ struct xfs_scrub *sc)
+{
+ struct xchk_ag *sa = &sc->sa;
+ int error = 0;
+
+ ASSERT(sa->pag != NULL);
+ ASSERT(sa->agi_bp == NULL);
+ ASSERT(sa->agf_bp == NULL);
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xchk_perag_read_headers(sc, sa);
+ if (error)
+ return error;
+
+ /*
+ * If we've grabbed an inode for scrubbing then we assume that
+ * holding its ILOCK will suffice to coordinate with any intent
+ * chains involving this inode.
+ */
+ if (sc->ip)
+ return 0;
+
+ /*
+ * Decide if this AG is quiet enough for all metadata to be
+ * consistent with each other. XFS allows the AG header buffer
+ * locks to cycle across transaction rolls while processing
+ * chains of deferred ops, which means that there could be
+ * other threads in the middle of processing a chain of
+ * deferred ops. For regular operations we are careful about
+ * ordering operations to prevent collisions between threads
+ * (which is why we don't need a per-AG lock), but scrub and
+ * repair have to serialize against chained operations.
+ *
+	 * We just locked all the AG header buffers; now take a look
+ * to see if there are any intents in progress. If there are,
+ * drop the AG headers and wait for the intents to drain.
+ * Since we hold all the AG header locks for the duration of
+ * the scrub, this is the only time we have to sample the
+ * intents counter; any threads increasing it after this point
+ * can't possibly be in the middle of a chain of AG metadata
+ * updates.
+ *
+ * Obviously, this should be slanted against scrub and in favor
+ * of runtime threads.
+ */
+ if (!xfs_perag_intent_busy(sa->pag))
+ return 0;
+
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+
+ if (!(sc->flags & XCHK_FSGATES_DRAIN))
+ return -ECHRNG;
+ error = xfs_perag_intent_drain(sa->pag);
+ if (error == -ERESTARTSYS)
+ error = -EINTR;
+ } while (!error);
+
+ return error;
+}
+
+/*
+ * Grab the per-AG structure, grab all AG header buffers, and wait until there
+ * aren't any pending intents. Returns -ENOENT if we can't grab the perag
+ * structure.
+ */
+int
+xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ ASSERT(!sa->pag);
+ sa->pag = xfs_perag_get(mp, agno);
+ if (!sa->pag)
+ return -ENOENT;
+
+ return xchk_perag_drain_and_lock(sc);
+}
+
/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
@@ -550,6 +643,14 @@ xchk_ag_init(
/* Per-scrubber setup functions */
+void
+xchk_trans_cancel(
+ struct xfs_scrub *sc)
+{
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -625,80 +726,273 @@ xchk_checkpoint_log(
return 0;
}
+/* Verify that an inode is allocated ondisk, then return its cached inode. */
+int
+xchk_iget(
+ struct xfs_scrub *sc,
+ xfs_ino_t inum,
+ struct xfs_inode **ipp)
+{
+ return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+}
+
/*
- * Given an inode and the scrub control structure, grab either the
- * inode referenced in the control structure or the inode passed in.
- * The inode is not locked.
+ * Try to grab an inode in a manner that avoids races with physical inode
+ * allocation. If we can't, return the locked AGI buffer so that the caller
+ * can single-step the loading process to see where things went wrong.
+ * Callers must have a valid scrub transaction.
+ *
+ * If the iget succeeds, return 0, a NULL AGI, and the inode.
+ *
+ * If the iget fails, return the error, the locked AGI, and a NULL inode. This
+ * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
+ * no longer allocated; or any other corruption or runtime error.
+ *
+ * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
+ *
+ * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
*/
int
-xchk_get_inode(
+xchk_iget_agi(
+ struct xfs_scrub *sc,
+ xfs_ino_t inum,
+ struct xfs_buf **agi_bpp,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_perag *pag;
+ int error;
+
+ ASSERT(sc->tp != NULL);
+
+again:
+ *agi_bpp = NULL;
+ *ipp = NULL;
+ error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Attach the AGI buffer to the scrub transaction to avoid deadlocks
+ * in the iget cache miss path.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ error = xfs_iget(mp, tp, inum,
+ XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+ if (error == -EAGAIN) {
+ /*
+ * The inode may be in core but temporarily unavailable and may
+ * require the AGI buffer before it can be returned. Drop the
+ * AGI buffer and retry the lookup.
+ *
+ * Incore lookup will fail with EAGAIN on a cache hit if the
+ * inode is queued to the inactivation list. The inactivation
+ * worker may remove the inode from the unlinked list and hence
+ * needs the AGI.
+ *
+ * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
+ * to allow inodegc to make progress and move the inode to
+ * IRECLAIMABLE state where xfs_iget will be able to return it
+ * again if it can lock the inode.
+ */
+ xfs_trans_brelse(tp, *agi_bpp);
+ delay(1);
+ goto again;
+ }
+ if (error)
+ return error;
+
+ /* We got the inode, so we can release the AGI. */
+ ASSERT(*ipp != NULL);
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ return 0;
+}
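[Editor's note -- illustration, not part of the patch.]  A hypothetical caller
sketch for the (error, *agi_bpp, *ipp) contract documented above; the real
caller added by this patch is xchk_iget_for_scrubbing() below, and `sc' and
`ino' are assumed to be in scope:

	struct xfs_buf		*agi_bp = NULL;
	struct xfs_inode	*ip = NULL;
	int			error;

	error = xchk_iget_agi(sc, ino, &agi_bp, &ip);
	if (error == 0) {
		/* Got the inode; the AGI buffer has already been released. */
		xchk_trans_cancel(sc);
		return xchk_install_handle_inode(sc, ip);
	}
	if (error == -ENOENT)
		return -ENOENT;			/* nothing left to check */

	/* Otherwise agi_bp, if non-NULL, is still held for single-stepping. */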
+
+/* Install an inode that we opened by handle for scrubbing. */
+int
+xchk_install_handle_inode(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+ xchk_irele(sc, ip);
+ return -ENOENT;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/*
+ * In preparation for scrubbing metadata structures that hang off of an inode,
+ * grab either the inode referenced in the scrub control structure or the
+ * inode passed in. If the inumber does not reference an allocated inode
+ * record, the function returns ENOENT to end the scrub early. The inode
+ * is not locked.
+ */
+int
+xchk_iget_for_scrubbing(
struct xfs_scrub *sc)
{
struct xfs_imap imap;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag;
+ struct xfs_buf *agi_bp;
struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
struct xfs_inode *ip = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
int error;
+ ASSERT(sc->tp == NULL);
+
/* We want to scan the inode we already had opened. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
sc->ip = ip_in;
return 0;
}
- /* Look up the inode, see if the generation number matches. */
+ /* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
return -ENOENT;
- error = xfs_iget(mp, NULL, sc->sm->sm_ino,
- XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
- switch (error) {
- case -ENOENT:
- /* Inode doesn't exist, just bail out. */
+ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+ return -ENOENT;
+
+ /* Try a regular untrusted iget. */
+ error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ if (!error)
+ return xchk_install_handle_inode(sc, ip);
+ if (error == -ENOENT)
return error;
- case 0:
- /* Got an inode, continue. */
- break;
- case -EINVAL:
+ if (error != -EINVAL)
+ goto out_error;
+
+ /*
+ * EINVAL with IGET_UNTRUSTED probably means one of several things:
+ * userspace gave us an inode number that doesn't correspond to fs
+ * space; the inode btree lacks a record for this inode; or there is a
+ * record, and it says this inode is free.
+ *
+ * We want to look up this inode in the inobt to distinguish two
+ * scenarios: (1) the inobt says the inode is free, in which case
+ * there's nothing to do; and (2) the inobt says the inode is
+ * allocated, but loading it failed due to corruption.
+ *
+ * Allocate a transaction and grab the AGI to prevent inobt activity
+ * in this AG. Retry the iget in case someone allocated a new inode
+ * after the first iget failed.
+ */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_error;
+
+ error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+ if (error == 0) {
+ /* Actually got the inode, so install it. */
+ xchk_trans_cancel(sc);
+ return xchk_install_handle_inode(sc, ip);
+ }
+ if (error == -ENOENT)
+ goto out_gone;
+ if (error != -EINVAL)
+ goto out_cancel;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ /*
+ * Untrusted iget failed a second time. Let's try an inobt lookup.
+	 * If the inobt says this inode neither exists inside the filesystem
+	 * nor is allocated, return ENOENT to signal that the check can be
+	 * skipped.
+ *
+ * If the lookup returns corruption, we'll mark this inode corrupt and
+ * exit to userspace. There's little chance of fixing anything until
+ * the inobt is straightened out, but there's nothing we can do here.
+ *
+ * If the lookup encounters any other error, exit to userspace.
+ *
+ * If the lookup succeeds, something else must be very wrong in the fs
+ * such that setting up the incore inode failed in some strange way.
+ * Treat those as corruptions.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+ if (!pag) {
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED);
+ xfs_perag_put(pag);
+ if (error == -EINVAL || error == -ENOENT)
+ goto out_gone;
+ if (!error)
+ error = -EFSCORRUPTED;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_error:
+ trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
+ return error;
+out_gone:
+ /* The file is gone, so there's nothing to check. */
+ xchk_trans_cancel(sc);
+ return -ENOENT;
+}
+
+/* Release an inode, possibly dropping it in the process. */
+void
+xchk_irele(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (current->journal_info != NULL) {
+ ASSERT(current->journal_info == sc->tp);
+
/*
- * -EINVAL with IGET_UNTRUSTED could mean one of several
- * things: userspace gave us an inode number that doesn't
- * correspond to fs space, or doesn't have an inobt entry;
- * or it could simply mean that the inode buffer failed the
- * read verifiers.
+ * If we are in a transaction, we /cannot/ drop the inode
+ * ourselves, because the VFS will trigger writeback, which
+ * can require a transaction. Clear DONTCACHE to force the
+ * inode to the LRU, where someone else can take care of
+ * dropping it.
*
- * Try just the inode mapping lookup -- if it succeeds, then
- * the inode buffer verifier failed and something needs fixing.
- * Otherwise, we really couldn't find it so tell userspace
- * that it no longer exists.
+ * Note that when we grabbed our reference to the inode, it
+ * could have had an active ref and DONTCACHE set if a sysadmin
+ * is trying to coerce a change in file access mode. icache
+ * hits do not clear DONTCACHE, so we must do it here.
*/
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
- if (pag) {
- error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
- XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
- xfs_perag_put(pag);
- if (error)
- return -ENOENT;
- }
- error = -EFSCORRUPTED;
- fallthrough;
- default:
- trace_xchk_op_error(sc,
- XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
- XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
- error, __return_address);
- return error;
- }
- if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
- xfs_irele(ip);
- return -ENOENT;
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+ } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
+ /*
+ * If this is the last reference to the inode and the caller
+ * permits it, set DONTCACHE to avoid thrashing.
+ */
+ d_mark_dontcache(VFS_I(ip));
}
- sc->ip = ip;
- return 0;
+ xfs_irele(ip);
}
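[Editor's note -- illustration, not part of the patch.]  The intended pairing,
assuming a scrub transaction is held (this mirrors the directory scrubber
changes later in this patch); `sc', `ino', `ip' and `error' are assumed to be
in scope:

	error = xchk_iget(sc, ino, &ip);
	if (error)
		return error;

	/* ... examine VFS_I(ip)->i_mode, i_generation, and so on ... */

	/* Never drops the inode while in transaction context. */
	xchk_irele(sc, ip);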
-/* Set us up to scrub a file's contents. */
+/*
+ * Set us up to scrub metadata mapped by a file's fork. Callers must not use
+ * this to operate on user-accessible regular file data because the MMAPLOCK is
+ * not taken.
+ */
int
xchk_setup_inode_contents(
struct xfs_scrub *sc,
@@ -706,13 +1000,14 @@ xchk_setup_inode_contents(
{
int error;
- error = xchk_get_inode(sc);
+ error = xchk_iget_for_scrubbing(sc);
if (error)
return error;
- /* Got the inode, lock it and we're ready to go. */
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ /* Lock the inode so the VFS cannot touch this file. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
+
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
@@ -869,28 +1164,6 @@ xchk_metadata_inode_forks(
return 0;
}
-/*
- * Try to lock an inode in violation of the usual locking order rules. For
- * example, trying to get the IOLOCK while in transaction context, or just
- * plain breaking AG-order or inode-order inode locking rules. Either way,
- * the only way to avoid an ABBA deadlock is to use trylock and back off if
- * we can't.
- */
-int
-xchk_ilock_inverted(
- struct xfs_inode *ip,
- uint lock_mode)
-{
- int i;
-
- for (i = 0; i < 20; i++) {
- if (xfs_ilock_nowait(ip, lock_mode))
- return 0;
- delay(1);
- }
- return -EDEADLOCK;
-}
-
/* Pause background reaping of resources. */
void
xchk_stop_reaping(
@@ -916,3 +1189,25 @@ xchk_start_reaping(
}
sc->flags &= ~XCHK_REAPING_DISABLED;
}
+
+/*
+ * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
+ * operation. Callers must not hold any locks that intersect with the CPU
+ * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
+ * to change kernel code.
+ */
+void
+xchk_fsgates_enable(
+ struct xfs_scrub *sc,
+ unsigned int scrub_fsgates)
+{
+ ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
+ ASSERT(!(sc->flags & scrub_fsgates));
+
+ trace_xchk_fsgates_enable(sc, scrub_fsgates);
+
+ if (scrub_fsgates & XCHK_FSGATES_DRAIN)
+ xfs_drain_wait_enable();
+
+ sc->flags |= scrub_fsgates;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index b73648d81d23..18b5f2b62f13 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_COMMON_H__
#define __XFS_SCRUB_COMMON_H__
@@ -32,6 +32,8 @@ xchk_should_terminate(
}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+void xchk_trans_cancel(struct xfs_scrub *sc);
+
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
@@ -72,6 +74,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
struct xfs_btree_cur **curpp);
/* Setup functions */
+int xchk_setup_agheader(struct xfs_scrub *sc);
int xchk_setup_fs(struct xfs_scrub *sc);
int xchk_setup_ag_allocbt(struct xfs_scrub *sc);
int xchk_setup_ag_iallocbt(struct xfs_scrub *sc);
@@ -132,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
-int xchk_get_inode(struct xfs_scrub *sc);
+int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
+int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
+ struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
+void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
/*
* Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies.
@@ -147,8 +156,21 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
}
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
-int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
void xchk_stop_reaping(struct xfs_scrub *sc);
void xchk_start_reaping(struct xfs_scrub *sc);
+/*
+ * Setting up a hook to wait for intents to drain is costly -- we have to take
+ * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it
+ * up, and again to tear it down. These costs add up quickly, so we only want
+ * to enable the drain waiter if the drain actually detected a conflict with
+ * running intent chains.
+ */
+static inline bool xchk_need_intent_drain(struct xfs_scrub *sc)
+{
+ return sc->flags & XCHK_NEED_DRAIN;
+}
+
+void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index d17cee177085..82b150d3b8b7 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -39,6 +39,7 @@ xchk_da_process_error(
switch (*error) {
case -EDEADLOCK:
+ case -ECHRNG:
/* Used to restart an op with deadlock avoidance. */
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
break;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index 1f3515c6d5a8..4f8c2138a1ec 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_DABTREE_H__
#define __XFS_SCRUB_DABTREE_H__
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d1b0f23c2c59..0b491784b759 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -18,6 +18,7 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
+#include "scrub/readdir.h"
/* Set us up to scrub directories. */
int
@@ -31,168 +32,114 @@ xchk_setup_directory(
/* Scrub a directory entry. */
-struct xchk_dir_ctx {
- /* VFS fill-directory iterator */
- struct dir_context dir_iter;
-
- struct xfs_scrub *sc;
-};
-
-/* Check that an inode's mode matches a given DT_ type. */
-STATIC int
+/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
+STATIC void
xchk_dir_check_ftype(
- struct xchk_dir_ctx *sdc,
+ struct xfs_scrub *sc,
xfs_fileoff_t offset,
- xfs_ino_t inum,
- int dtype)
+ struct xfs_inode *ip,
+ int ftype)
{
- struct xfs_mount *mp = sdc->sc->mp;
- struct xfs_inode *ip;
- int ino_dtype;
- int error = 0;
+ struct xfs_mount *mp = sc->mp;
if (!xfs_has_ftype(mp)) {
- if (dtype != DT_UNKNOWN && dtype != DT_DIR)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
- offset);
- goto out;
- }
-
- /*
- * Grab the inode pointed to by the dirent. We release the
- * inode before we cancel the scrub transaction. Since we're
- * don't know a priori that releasing the inode won't trigger
- * eofblocks cleanup (which allocates what would be a nested
- * transaction), we can't use DONTCACHE here because DONTCACHE
- * inodes can trigger immediate inactive cleanup of the inode.
- *
- * If _iget returns -EINVAL or -ENOENT then the child inode number is
- * garbage and the directory is corrupt. If the _iget returns
- * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
- * cross referencing error. Any other error is an operational error.
- */
- error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip);
- if (error == -EINVAL || error == -ENOENT) {
- error = -EFSCORRUPTED;
- xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error);
- goto out;
+ if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return;
}
- if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset,
- &error))
- goto out;
- /* Convert mode to the DT_* values that dir_emit uses. */
- ino_dtype = xfs_dir3_get_dtype(mp,
- xfs_mode_to_ftype(VFS_I(ip)->i_mode));
- if (ino_dtype != dtype)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
- xfs_irele(ip);
-out:
- return error;
+ if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
}
/*
* Scrub a single directory entry.
*
- * We use the VFS directory iterator (i.e. readdir) to call this
- * function for every directory entry in a directory. Once we're here,
- * we check the inode number to make sure it's sane, then we check that
- * we can look up this filename. Finally, we check the ftype.
+ * Check the inode number to make sure it's sane, then check that we can
+ * look up this filename.  Finally, check the ftype.
*/
-STATIC bool
+STATIC int
xchk_dir_actor(
- struct dir_context *dir_iter,
- const char *name,
- int namelen,
- loff_t pos,
- u64 ino,
- unsigned type)
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
{
- struct xfs_mount *mp;
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip;
- struct xchk_dir_ctx *sdc;
- struct xfs_name xname;
xfs_ino_t lookup_ino;
xfs_dablk_t offset;
- bool checked_ftype = false;
int error = 0;
- sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter);
- ip = sdc->sc->ip;
- mp = ip->i_mount;
offset = xfs_dir2_db_to_da(mp->m_dir_geo,
- xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+ xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos));
- if (xchk_should_terminate(sdc->sc, &error))
- return !error;
+ if (xchk_should_terminate(sc, &error))
+ return error;
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
- goto out;
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return -ECANCELED;
}
/* Does this name make sense? */
- if (!xfs_dir2_namecheck(name, namelen)) {
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
- goto out;
+ if (!xfs_dir2_namecheck(name->name, name->len)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return -ECANCELED;
}
- if (!strncmp(".", name, namelen)) {
+ if (!strncmp(".", name->name, name->len)) {
/* If this is "." then check that the inum matches the dir. */
- if (xfs_has_ftype(mp) && type != DT_DIR)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
- offset);
- checked_ftype = true;
- if (ino != ip->i_ino)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
- offset);
- } else if (!strncmp("..", name, namelen)) {
+ if (ino != dp->i_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ } else if (!strncmp("..", name->name, name->len)) {
/*
* If this is ".." in the root inode, check that the inum
* matches this dir.
*/
- if (xfs_has_ftype(mp) && type != DT_DIR)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
- offset);
- checked_ftype = true;
- if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino)
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK,
- offset);
+ if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
}
/* Verify that we can look up this name by hash. */
- xname.name = name;
- xname.len = namelen;
- xname.type = XFS_DIR3_FT_UNKNOWN;
-
- error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
+ error = xchk_dir_lookup(sc, dp, name, &lookup_ino);
/* ENOENT means the hash lookup failed and the dir is corrupt */
if (error == -ENOENT)
error = -EFSCORRUPTED;
- if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
- &error))
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
goto out;
if (lookup_ino != ino) {
- xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
- goto out;
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return -ECANCELED;
}
- /* Verify the file type. This function absorbs error codes. */
- if (!checked_ftype) {
- error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type);
- if (error)
- goto out;
- }
-out:
/*
- * A negative error code returned here is supposed to cause the
- * dir_emit caller (xfs_readdir) to abort the directory iteration
- * and return zero to xchk_directory.
+ * Grab the inode pointed to by the dirent. We release the inode
+ * before we cancel the scrub transaction.
+ *
+ * If _iget returns -EINVAL or -ENOENT then the child inode number is
+ * garbage and the directory is corrupt. If the _iget returns
+ * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
+ * cross referencing error. Any other error is an operational error.
*/
- if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return false;
- return !error;
+ error = xchk_iget(sc, ino, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ error = -EFSCORRUPTED;
+ xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
+ goto out;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error))
+ goto out;
+
+ xchk_dir_check_ftype(sc, offset, ip, name->type);
+ xchk_irele(sc, ip);
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -ECANCELED;
+ return error;
}
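[Editor's note -- illustration, not part of the patch.]  The new directory
walk callback uses -ECANCELED to mean "corruption already recorded, stop
iterating"; the top-level xchk_directory() change below squashes that value so
an early stop is never reported to userspace as a failure:

	error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
	if (error == -ECANCELED)
		error = 0;		/* early stop, not an operational error */
	return error;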
/* Scrub a directory btree record. */
@@ -201,6 +148,7 @@ xchk_dir_rec(
struct xchk_da_btree *ds,
int level)
{
+ struct xfs_name dname = { };
struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
struct xfs_mount *mp = ds->state->mp;
struct xfs_inode *dp = ds->dargs.dp;
@@ -297,7 +245,11 @@ xchk_dir_rec(
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
goto out_relse;
}
- calc_hash = xfs_da_hashname(dent->name, dent->namelen);
+
+ /* Does the directory hash match? */
+ dname.name = dent->name;
+ dname.len = dent->namelen;
+ calc_hash = xfs_dir2_hashname(mp, &dname);
if (calc_hash != hash)
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
@@ -803,14 +755,7 @@ int
xchk_directory(
struct xfs_scrub *sc)
{
- struct xchk_dir_ctx sdc = {
- .dir_iter.actor = xchk_dir_actor,
- .dir_iter.pos = 0,
- .sc = sc,
- };
- size_t bufsize;
- loff_t oldpos;
- int error = 0;
+ int error;
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
return -ENOENT;
@@ -818,7 +763,7 @@ xchk_directory(
/* Plausible size? */
if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out;
+ return 0;
}
/* Check directory tree structure */
@@ -827,7 +772,7 @@ xchk_directory(
return error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
+ return 0;
/* Check the freespace. */
error = xchk_directory_blocks(sc);
@@ -835,44 +780,11 @@ xchk_directory(
return error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
-
- /*
- * Check that every dirent we see can also be looked up by hash.
- * Userspace usually asks for a 32k buffer, so we will too.
- */
- bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
- sc->ip->i_disk_size);
-
- /*
- * Look up every name in this directory by hash.
- *
- * Use the xfs_readdir function to call xchk_dir_actor on
- * every directory entry in this directory. In _actor, we check
- * the name, inode number, and ftype (if applicable) of the
- * entry. xfs_readdir uses the VFS filldir functions to provide
- * iteration context.
- *
- * The VFS grabs a read or write lock via i_rwsem before it reads
- * or writes to a directory. If we've gotten this far we've
- * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
- * getting a write lock on i_rwsem. Therefore, it is safe for us
- * to drop the ILOCK here in order to reuse the _readdir and
- * _dir_lookup routines, which do their own ILOCK locking.
- */
- oldpos = 0;
- sc->ilock_flags &= ~XFS_ILOCK_EXCL;
- xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
- while (true) {
- error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize);
- if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
- &error))
- goto out;
- if (oldpos == sdc.dir_iter.pos)
- break;
- oldpos = sdc.dir_iter.pos;
- }
+ return 0;
-out:
+ /* Look up every name in this directory by hash. */
+ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
+ if (error == -ECANCELED)
+ error = 0;
return error;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index f0c7f41897b9..faa315be7978 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Copyright (C) 2019 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -130,6 +130,13 @@ xchk_setup_fscounters(
struct xchk_fscounters *fsc;
int error;
+ /*
+ * If the AGF doesn't track btreeblks, we have to lock the AGF to count
+ * btree block usage by walking the actual btrees.
+ */
+ if (!xfs_has_lazysbcount(sc->mp))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index aa65ec88a0c0..d2b2a1cb6533 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -1,12 +1,14 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2019 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index d0b938d3d028..66a273f8585b 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2019 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_HEALTH_H__
#define __XFS_SCRUB_HEALTH_H__
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index e312be7cd375..575f22a02ebe 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -32,6 +32,8 @@ int
xchk_setup_ag_iallocbt(
struct xfs_scrub *sc)
{
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER);
}
@@ -49,83 +51,237 @@ struct xchk_iallocbt {
};
/*
- * If we're checking the finobt, cross-reference with the inobt.
- * Otherwise we're checking the inobt; if there is an finobt, make sure
- * we have a record or not depending on freecount.
+ * Does the finobt have a record for this inode with the same hole/free state?
+ * This is a bit complicated because of the following:
+ *
+ * - The finobt need not have a record if all inodes in the inobt record are
+ * allocated.
+ * - The finobt need not have a record if all inodes in the inobt record are
+ * free.
+ * - The finobt need not have a record if the inobt record says this is a hole.
+ * This likely doesn't happen in practice.
*/
-static inline void
-xchk_iallocbt_chunk_xref_other(
+STATIC int
+xchk_inobt_xref_finobt(
+ struct xfs_scrub *sc,
+ struct xfs_inobt_rec_incore *irec,
+ xfs_agino_t agino,
+ bool free,
+ bool hole)
+{
+ struct xfs_inobt_rec_incore frec;
+ struct xfs_btree_cur *cur = sc->sa.fino_cur;
+ bool ffree, fhole;
+ unsigned int frec_idx, fhole_idx;
+ int has_record;
+ int error;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
+ if (error)
+ return error;
+ if (!has_record)
+ goto no_record;
+
+ error = xfs_inobt_get_rec(cur, &frec, &has_record);
+ if (!has_record)
+ return -EFSCORRUPTED;
+
+ if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ goto no_record;
+
+ /* There's a finobt record; free and hole status must match. */
+ frec_idx = agino - frec.ir_startino;
+ ffree = frec.ir_free & (1ULL << frec_idx);
+ fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+ fhole = frec.ir_holemask & (1U << fhole_idx);
+
+ if (ffree != free)
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ if (fhole != hole)
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ return 0;
+
+no_record:
+ /* inobt record is fully allocated */
+ if (irec->ir_free == 0)
+ return 0;
+
+ /* inobt record is totally unallocated */
+ if (irec->ir_free == XFS_INOBT_ALL_FREE)
+ return 0;
+
+ /* inobt record says this is a hole */
+ if (hole)
+ return 0;
+
+ /* finobt doesn't care about allocated inodes */
+ if (!free)
+ return 0;
+
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ return 0;
+}
+
+/*
+ * Make sure that each inode of this part of an inobt record has the same
+ * sparse and free status as the finobt.
+ */
+STATIC void
+xchk_inobt_chunk_xref_finobt(
struct xfs_scrub *sc,
struct xfs_inobt_rec_incore *irec,
- xfs_agino_t agino)
+ xfs_agino_t agino,
+ unsigned int nr_inodes)
{
- struct xfs_btree_cur **pcur;
- bool has_irec;
+ xfs_agino_t i;
+ unsigned int rec_idx;
int error;
- if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
- pcur = &sc->sa.ino_cur;
- else
- pcur = &sc->sa.fino_cur;
- if (!(*pcur))
- return;
- error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec);
- if (!xchk_should_check_xref(sc, &error, pcur))
+ ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT);
+
+ if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm))
return;
- if (((irec->ir_freecount > 0 && !has_irec) ||
- (irec->ir_freecount == 0 && has_irec)))
- xchk_btree_xref_set_corrupt(sc, *pcur, 0);
+
+ for (i = agino, rec_idx = agino - irec->ir_startino;
+ i < agino + nr_inodes;
+ i++, rec_idx++) {
+ bool free, hole;
+ unsigned int hole_idx;
+
+ free = irec->ir_free & (1ULL << rec_idx);
+ hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+ hole = irec->ir_holemask & (1U << hole_idx);
+
+ error = xchk_inobt_xref_finobt(sc, irec, i, free, hole);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ }
+}
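[Editor's note -- illustration, not part of the patch.]  A worked example of
the free/hole bit decoding used in the loop above, assuming the usual 64-inode
chunk geometry where XFS_INODES_PER_HOLEMASK_BIT is 4.  For an inode at
agino == ir_startino + 37:

	unsigned int	rec_idx  = 37;		/* agino - irec->ir_startino */
	bool		free     = irec->ir_free & (1ULL << rec_idx);
	unsigned int	hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; /* 9 */
	bool		hole     = irec->ir_holemask & (1U << hole_idx);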
+
+/*
+ * Does the inobt have a record for this inode with the same hole/free state?
+ * The inobt must always have a record if there's a finobt record.
+ */
+STATIC int
+xchk_finobt_xref_inobt(
+ struct xfs_scrub *sc,
+ struct xfs_inobt_rec_incore *frec,
+ xfs_agino_t agino,
+ bool ffree,
+ bool fhole)
+{
+ struct xfs_inobt_rec_incore irec;
+ struct xfs_btree_cur *cur = sc->sa.ino_cur;
+ bool free, hole;
+ unsigned int rec_idx, hole_idx;
+ int has_record;
+ int error;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
+ if (error)
+ return error;
+ if (!has_record)
+ goto no_record;
+
+ error = xfs_inobt_get_rec(cur, &irec, &has_record);
+ if (!has_record)
+ return -EFSCORRUPTED;
+
+ if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ goto no_record;
+
+ /* There's an inobt record; free and hole status must match. */
+ rec_idx = agino - irec.ir_startino;
+ free = irec.ir_free & (1ULL << rec_idx);
+ hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+ hole = irec.ir_holemask & (1U << hole_idx);
+
+ if (ffree != free)
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ if (fhole != hole)
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ return 0;
+
+no_record:
+ /* finobt should never have a record for which the inobt does not */
+ xchk_btree_xref_set_corrupt(sc, cur, 0);
+ return 0;
}
-/* Cross-reference with the other btrees. */
+/*
+ * Make sure that each inode of this part of a finobt record has the same
+ * sparse and free status as the inobt.
+ */
STATIC void
-xchk_iallocbt_chunk_xref(
+xchk_finobt_chunk_xref_inobt(
struct xfs_scrub *sc,
- struct xfs_inobt_rec_incore *irec,
+ struct xfs_inobt_rec_incore *frec,
xfs_agino_t agino,
- xfs_agblock_t agbno,
- xfs_extlen_t len)
+ unsigned int nr_inodes)
{
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ xfs_agino_t i;
+ unsigned int rec_idx;
+ int error;
+
+ ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT);
+
+ if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm))
return;
- xchk_xref_is_used_space(sc, agbno, len);
- xchk_iallocbt_chunk_xref_other(sc, irec, agino);
- xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
- xchk_xref_is_not_shared(sc, agbno, len);
+ for (i = agino, rec_idx = agino - frec->ir_startino;
+ i < agino + nr_inodes;
+ i++, rec_idx++) {
+ bool ffree, fhole;
+ unsigned int hole_idx;
+
+ ffree = frec->ir_free & (1ULL << rec_idx);
+ hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT;
+ fhole = frec->ir_holemask & (1U << hole_idx);
+
+ error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ }
}
-/* Is this chunk worth checking? */
+/* Is this chunk worth checking and cross-referencing? */
STATIC bool
xchk_iallocbt_chunk(
struct xchk_btree *bs,
struct xfs_inobt_rec_incore *irec,
xfs_agino_t agino,
- xfs_extlen_t len)
+ unsigned int nr_inodes)
{
+ struct xfs_scrub *sc = bs->sc;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_perag *pag = bs->cur->bc_ag.pag;
- xfs_agblock_t bno;
+ xfs_agblock_t agbno;
+ xfs_extlen_t len;
- bno = XFS_AGINO_TO_AGBNO(mp, agino);
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize);
- if (!xfs_verify_agbext(pag, bno, len))
+ if (!xfs_verify_agbext(pag, agbno, len))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return false;
+ xchk_xref_is_used_space(sc, agbno, len);
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes);
+ else
+ xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes);
+ xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
+ xchk_xref_is_not_shared(sc, agbno, len);
+ xchk_xref_is_not_cow_staging(sc, agbno, len);
return true;
}
-/* Count the number of free inodes. */
-static unsigned int
-xchk_iallocbt_freecount(
- xfs_inofree_t freemask)
-{
- BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64));
- return hweight64(freemask);
-}
-
/*
* Check that an inode's allocation status matches ir_free in the inobt
* record. First we try querying the in-core inode state, and if the inode
@@ -272,7 +428,7 @@ xchk_iallocbt_check_cluster(
return 0;
}
- xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+ xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
&XFS_RMAP_OINFO_INODES);
/* Grab the inode cluster buffer. */
@@ -420,36 +576,22 @@ xchk_iallocbt_rec(
const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
xfs_agino_t agino;
- xfs_extlen_t len;
int holecount;
int i;
int error = 0;
- unsigned int real_freecount;
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
-
- if (irec.ir_count > XFS_INODES_PER_CHUNK ||
- irec.ir_freecount > XFS_INODES_PER_CHUNK)
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-
- real_freecount = irec.ir_freecount +
- (XFS_INODES_PER_CHUNK - irec.ir_count);
- if (real_freecount != xchk_iallocbt_freecount(irec.ir_free))
+ if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
agino = irec.ir_startino;
- /* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(pag, agino) ||
- !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- goto out;
- }
xchk_iallocbt_rec_alignment(bs, &irec);
if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
@@ -459,12 +601,11 @@ xchk_iallocbt_rec(
/* Handle non-sparse inodes */
if (!xfs_inobt_issparse(irec.ir_holemask)) {
- len = XFS_B_TO_FSB(mp,
- XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
if (irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
+ if (!xchk_iallocbt_chunk(bs, &irec, agino,
+ XFS_INODES_PER_CHUNK))
goto out;
goto check_clusters;
}
@@ -472,8 +613,6 @@ xchk_iallocbt_rec(
/* Check each chunk of a sparse inode cluster. */
holemask = irec.ir_holemask;
holecount = 0;
- len = XFS_B_TO_FSB(mp,
- XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
holes = ~xfs_inobt_irec_to_allocmask(&irec);
if ((holes & irec.ir_free) != holes ||
irec.ir_freecount > irec.ir_count)
@@ -482,8 +621,9 @@ xchk_iallocbt_rec(
for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
if (holemask & 1)
holecount += XFS_INODES_PER_HOLEMASK_BIT;
- else if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
- break;
+ else if (!xchk_iallocbt_chunk(bs, &irec, agino,
+ XFS_INODES_PER_HOLEMASK_BIT))
+ goto out;
holemask >>= 1;
agino += XFS_INODES_PER_HOLEMASK_BIT;
}
@@ -493,6 +633,9 @@ xchk_iallocbt_rec(
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
check_clusters:
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -622,18 +765,18 @@ xchk_xref_inode_check(
xfs_agblock_t agbno,
xfs_extlen_t len,
struct xfs_btree_cur **icur,
- bool should_have_inodes)
+ enum xbtree_recpacking expected)
{
- bool has_inodes;
+ enum xbtree_recpacking outcome;
int error;
if (!(*icur) || xchk_skip_xref(sc->sm))
return;
- error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
+ error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome);
if (!xchk_should_check_xref(sc, &error, icur))
return;
- if (has_inodes != should_have_inodes)
+ if (outcome != expected)
xchk_btree_xref_set_corrupt(sc, *icur, 0);
}
@@ -644,8 +787,10 @@ xchk_xref_is_not_inode_chunk(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false);
- xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false);
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur,
+ XBTREE_RECPACKING_EMPTY);
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur,
+ XBTREE_RECPACKING_EMPTY);
}
/* xref check that the extent is covered by inodes */
@@ -655,5 +800,6 @@ xchk_xref_is_inode_chunk(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true);
+ xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur,
+ XBTREE_RECPACKING_FULL);
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 7a2f38e5202c..3e1e02e340a6 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -11,8 +11,11 @@
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
+#include "xfs_icache.h"
#include "xfs_da_format.h"
#include "xfs_reflink.h"
#include "xfs_rmap.h"
@@ -20,45 +23,176 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Prepare the attached inode for scrubbing. */
+static inline int
+xchk_prepare_iscrub(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Install this scrub-by-handle inode and prepare it for scrubbing. */
+static inline int
+xchk_install_handle_iscrub(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = xchk_install_handle_inode(sc, ip);
+ if (error)
+ return error;
+
+ return xchk_prepare_iscrub(sc);
+}
/*
- * Grab total control of the inode metadata. It doesn't matter here if
- * the file data is still changing; exclusive access to the metadata is
- * the goal.
+ * Grab total control of the inode metadata. In the best case, we grab the
+ * incore inode and take all locks on it. If the incore inode cannot be
+ * constructed due to corruption problems, lock the AGI so that we can single
+ * step the loading process to fix everything that can go wrong.
*/
int
xchk_setup_inode(
struct xfs_scrub *sc)
{
+ struct xfs_imap imap;
+ struct xfs_inode *ip;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
int error;
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ /* We want to scan the opened inode, so lock it and exit. */
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+ sc->ip = ip_in;
+ return xchk_prepare_iscrub(sc);
+ }
+
+ /* Reject internal metadata files and obviously bad inode numbers. */
+ if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ return -ENOENT;
+ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+ return -ENOENT;
+
+ /* Try a regular untrusted iget. */
+ error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ if (!error)
+ return xchk_install_handle_iscrub(sc, ip);
+ if (error == -ENOENT)
+ return error;
+ if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+ goto out_error;
+
/*
- * Try to get the inode. If the verifiers fail, we try again
- * in raw mode.
+ * EINVAL with IGET_UNTRUSTED probably means one of several things:
+ * userspace gave us an inode number that doesn't correspond to fs
+ * space; the inode btree lacks a record for this inode; or there is
+ * a record, and it says this inode is free.
+ *
+ * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but
+ * some other metadata corruption (e.g. inode forks) prevented
+ * instantiation of the incore inode. Or it could mean the inobt is
+ * corrupt.
+ *
+ * We want to look up this inode in the inobt directly to distinguish
+ * three different scenarios: (1) the inobt says the inode is free,
+ * in which case there's nothing to do; (2) the inobt is corrupt so we
+ * should flag the corruption and exit to userspace to let it fix the
+ * inobt; and (3) the inobt says the inode is allocated, but loading it
+ * failed due to corruption.
+ *
+ * Allocate a transaction and grab the AGI to prevent inobt activity in
+ * this AG. Retry the iget in case someone allocated a new inode after
+ * the first iget failed.
*/
- error = xchk_get_inode(sc);
- switch (error) {
- case 0:
- break;
- case -EFSCORRUPTED:
- case -EFSBADCRC:
- return xchk_trans_alloc(sc, 0);
- default:
- return error;
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_error;
+
+ error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+ if (error == 0) {
+ /* Actually got the incore inode, so install it and proceed. */
+ xchk_trans_cancel(sc);
+ return xchk_install_handle_iscrub(sc, ip);
+ }
+ if (error == -ENOENT)
+ goto out_gone;
+ if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+ goto out_cancel;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ error = -ECANCELED;
+ goto out_cancel;
}
- /* Got the inode, lock it and we're ready to go. */
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
- error = xchk_trans_alloc(sc, 0);
+ /*
+ * Untrusted iget failed a second time. Let's try an inobt lookup.
+ * If the inobt doesn't think this is an allocated inode then we'll
+ * return ENOENT to signal that the check can be skipped.
+ *
+ * If the lookup signals corruption, we'll mark this inode corrupt and
+ * exit to userspace. There's little chance of fixing anything until
+ * the inobt is straightened out, but there's nothing we can do here.
+ *
+ * If the lookup encounters a runtime error, exit to userspace.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+ if (!pag) {
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED);
+ xfs_perag_put(pag);
+ if (error == -EINVAL || error == -ENOENT)
+ goto out_gone;
if (error)
- goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ goto out_cancel;
-out:
- /* scrub teardown will unlock and release the inode for us */
+ /*
+ * The lookup succeeded. Chances are the ondisk inode is corrupt and
+ * preventing iget from reading it. Retain the scrub transaction and
+ * the AGI buffer to prevent anyone from allocating or freeing inodes.
+ * This ensures that we preserve the inconsistency between the inobt
+ * saying the inode is allocated and the icache being unable to load
+ * the inode until we can flag the corruption in xchk_inode. The
+ * scrub function has to note the corruption, since we're not really
+ * supposed to do that from the setup function.
+ */
+ return 0;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_error:
+ trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
return error;
+out_gone:
+ /* The file is gone, so there's nothing to check. */
+ xchk_trans_cancel(sc);
+ return -ENOENT;
}
/* Inode core */
@@ -553,8 +687,9 @@ xchk_inode_xref(
xchk_xref_is_used_space(sc, agbno, 1);
xchk_inode_xref_finobt(sc, ino);
- xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
+ xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_xref_is_not_cow_staging(sc, agbno, 1);
xchk_inode_xref_bmap(sc, dip);
out_free:
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index d8dff3fd8053..58d5dfb7ea21 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -16,6 +16,7 @@
#include "xfs_dir2_priv.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/readdir.h"
/* Set us up to scrub parents. */
int
@@ -30,122 +31,93 @@ xchk_setup_parent(
/* Look for an entry in a parent pointing to this inode. */
struct xchk_parent_ctx {
- struct dir_context dc;
struct xfs_scrub *sc;
- xfs_ino_t ino;
xfs_nlink_t nlink;
- bool cancelled;
};
/* Look for a single entry in a directory pointing to an inode. */
-STATIC bool
+STATIC int
xchk_parent_actor(
- struct dir_context *dc,
- const char *name,
- int namelen,
- loff_t pos,
- u64 ino,
- unsigned type)
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos,
+ const struct xfs_name *name,
+ xfs_ino_t ino,
+ void *priv)
{
- struct xchk_parent_ctx *spc;
+ struct xchk_parent_ctx *spc = priv;
int error = 0;
- spc = container_of(dc, struct xchk_parent_ctx, dc);
- if (spc->ino == ino)
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name->name, name->len))
+ error = -EFSCORRUPTED;
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+
+ if (sc->ip->i_ino == ino)
spc->nlink++;
- /*
- * If we're facing a fatal signal, bail out. Store the cancellation
- * status separately because the VFS readdir code squashes error codes
- * into short directory reads.
- */
if (xchk_should_terminate(spc->sc, &error))
- spc->cancelled = true;
+ return error;
- return !error;
+ return 0;
}
-/* Count the number of dentries in the parent dir that point to this inode. */
-STATIC int
-xchk_parent_count_parent_dentries(
- struct xfs_scrub *sc,
- struct xfs_inode *parent,
- xfs_nlink_t *nlink)
+/*
+ * Try to lock a parent directory for checking dirents. Returns the inode
+ * lock flags we now hold, or zero if we failed.
+ */
+STATIC unsigned int
+xchk_parent_ilock_dir(
+ struct xfs_inode *dp)
{
- struct xchk_parent_ctx spc = {
- .dc.actor = xchk_parent_actor,
- .ino = sc->ip->i_ino,
- .sc = sc,
- };
- size_t bufsize;
- loff_t oldpos;
- uint lock_mode;
- int error = 0;
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED))
+ return 0;
- /*
- * If there are any blocks, read-ahead block 0 as we're almost
- * certain to have the next operation be a read there. This is
- * how we guarantee that the parent's extent map has been loaded,
- * if there is one.
- */
- lock_mode = xfs_ilock_data_map_shared(parent);
- if (parent->i_df.if_nextents > 0)
- error = xfs_dir3_data_readahead(parent, 0, 0);
- xfs_iunlock(parent, lock_mode);
- if (error)
- return error;
+ if (!xfs_need_iread_extents(&dp->i_df))
+ return XFS_ILOCK_SHARED;
- /*
- * Iterate the parent dir to confirm that there is
- * exactly one entry pointing back to the inode being
- * scanned.
- */
- bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE,
- parent->i_disk_size);
- oldpos = 0;
- while (true) {
- error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
- if (error)
- goto out;
- if (spc.cancelled) {
- error = -EAGAIN;
- goto out;
- }
- if (oldpos == spc.dc.pos)
- break;
- oldpos = spc.dc.pos;
- }
- *nlink = spc.nlink;
-out:
- return error;
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL))
+ return 0;
+
+ return XFS_ILOCK_EXCL;
}
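[Editor's note -- illustration, not part of the patch.]  Intended use of the
trylock helper above, mirroring xchk_parent_validate() below: a zero return
means "could not take the parent ILOCK without risking deadlock", so the
caller cycles its own ILOCK and retries the '..' lookup:

	lock_mode = xchk_parent_ilock_dir(dp);
	if (!lock_mode) {
		xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
		xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
		error = -EAGAIN;
		goto out_rele;
	}

	/* ... walk the parent directory ... */

	xfs_iunlock(dp, lock_mode);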
/*
- * Given the inode number of the alleged parent of the inode being
- * scrubbed, try to validate that the parent has exactly one directory
- * entry pointing back to the inode being scrubbed.
+ * Given the inode number of the alleged parent of the inode being scrubbed,
+ * try to validate that the parent has exactly one directory entry pointing
+ * back to the inode being scrubbed. Returns -EAGAIN if we need to revalidate
+ * the dotdot entry.
*/
STATIC int
xchk_parent_validate(
struct xfs_scrub *sc,
- xfs_ino_t dnum,
- bool *try_again)
+ xfs_ino_t parent_ino)
{
+ struct xchk_parent_ctx spc = {
+ .sc = sc,
+ .nlink = 0,
+ };
struct xfs_mount *mp = sc->mp;
struct xfs_inode *dp = NULL;
xfs_nlink_t expected_nlink;
- xfs_nlink_t nlink;
+ unsigned int lock_mode;
int error = 0;
- *try_again = false;
-
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ /* Is this the root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_rootip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
+ sc->ip->i_ino != parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
/* '..' must not point to ourselves. */
- if (sc->ip->i_ino == dnum) {
+ if (sc->ip->i_ino == parent_ino) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/*
@@ -155,106 +127,51 @@ xchk_parent_validate(
expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
/*
- * Grab this parent inode. We release the inode before we
- * cancel the scrub transaction. Since we're don't know a
- * priori that releasing the inode won't trigger eofblocks
- * cleanup (which allocates what would be a nested transaction)
- * if the parent pointer erroneously points to a file, we
- * can't use DONTCACHE here because DONTCACHE inodes can trigger
- * immediate inactive cleanup of the inode.
+ * Grab the parent directory inode. This must be released before we
+ * cancel the scrub transaction.
*
* If _iget returns -EINVAL or -ENOENT then the parent inode number is
* garbage and the directory is corrupt. If the _iget returns
* -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
* cross referencing error. Any other error is an operational error.
*/
- error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp);
+ error = xchk_iget(sc, parent_ino, &dp);
if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
- goto out;
+ return error;
}
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out_rele;
}
- /*
- * We prefer to keep the inode locked while we lock and search
- * its alleged parent for a forward reference. If we can grab
- * the iolock, validate the pointers and we're done. We must
- * use nowait here to avoid an ABBA deadlock on the parent and
- * the child inodes.
- */
- if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) {
- error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
- if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
- &error))
- goto out_unlock;
- if (nlink != expected_nlink)
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out_unlock;
- }
-
- /*
- * The game changes if we get here. We failed to lock the parent,
- * so we're going to try to verify both pointers while only holding
- * one lock so as to avoid deadlocking with something that's actually
- * trying to traverse down the directory tree.
- */
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = 0;
- error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED);
- if (error)
+ lock_mode = xchk_parent_ilock_dir(dp);
+ if (!lock_mode) {
+ xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ error = -EAGAIN;
goto out_rele;
+ }
- /* Go looking for our dentry. */
- error = xchk_parent_count_parent_dentries(sc, dp, &nlink);
+ /* Look for a directory entry in the parent pointing to the child. */
+ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out_unlock;
- /* Drop the parent lock, relock this inode. */
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
- error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
- if (error)
- goto out_rele;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
-
- /*
- * If we're an unlinked directory, the parent /won't/ have a link
- * to us. Otherwise, it should have one link. We have to re-set
- * it here because we dropped the lock on sc->ip.
- */
- expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
-
- /* Look up '..' to see if the inode changed. */
- error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
- if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out_rele;
-
- /* Drat, parent changed. Try again! */
- if (dnum != dp->i_ino) {
- xfs_irele(dp);
- *try_again = true;
- return 0;
- }
- xfs_irele(dp);
-
/*
- * '..' didn't change, so check that there was only one entry
- * for us in the parent.
+ * Ensure that the parent has as many links to the child as the child
+ * thinks it has to the parent.
*/
- if (nlink != expected_nlink)
+ if (spc.nlink != expected_nlink)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- return error;
out_unlock:
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ xfs_iunlock(dp, lock_mode);
out_rele:
- xfs_irele(dp);
-out:
+ xchk_irele(sc, dp);
return error;
}
@@ -264,9 +181,7 @@ xchk_parent(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- xfs_ino_t dnum;
- bool try_again;
- int tries = 0;
+ xfs_ino_t parent_ino;
int error = 0;
/*
@@ -279,56 +194,29 @@ xchk_parent(
/* We're not a special inode, are we? */
if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
- /*
- * The VFS grabs a read or write lock via i_rwsem before it reads
- * or writes to a directory. If we've gotten this far we've
- * already obtained IOLOCK_EXCL, which (since 4.10) is the same as
- * getting a write lock on i_rwsem. Therefore, it is safe for us
- * to drop the ILOCK here in order to do directory lookups.
- */
- sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
- xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL);
-
- /* Look up '..' */
- error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL);
- if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
- if (!xfs_verify_dir_ino(mp, dnum)) {
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
- }
+ do {
+ if (xchk_should_terminate(sc, &error))
+ break;
- /* Is this the root dir? Then '..' must point to itself. */
- if (sc->ip == mp->m_rootip) {
- if (sc->ip->i_ino != mp->m_sb.sb_rootino ||
- sc->ip->i_ino != dnum)
+ /* Look up '..' */
+ error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot,
+ &parent_ino);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
+ return error;
+ if (!xfs_verify_dir_ino(mp, parent_ino)) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
- }
+ return 0;
+ }
- do {
- error = xchk_parent_validate(sc, dnum, &try_again);
- if (error)
- goto out;
- } while (try_again && ++tries < 20);
+ /*
+ * Check that the dotdot entry points to a parent directory
+ * containing a dirent pointing to this subdirectory.
+ */
+ error = xchk_parent_validate(sc, parent_ino);
+ } while (error == -EAGAIN);
- /*
- * We gave it our best shot but failed, so mark this scrub
- * incomplete. Userspace can decide if it wants to try again.
- */
- if (try_again && tries == 20)
- xchk_set_incomplete(sc);
-out:
- /*
- * If we failed to lock the parent inode even after a retry, just mark
- * this scrub incomplete and return.
- */
- if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
- error = 0;
- xchk_set_incomplete(sc);
- }
return error;
}
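
The rewritten xchk_parent_validate() above counts matching dirents by passing xchk_parent_actor() to xchk_dir_walk(); that callback is not part of this hunk. A minimal sketch of such a callback, matching the xchk_dirent_fn signature added in scrub/readdir.h below (the struct layout and names here are illustrative, not the actual implementation):

/* Illustrative context; the real structure used by xchk_parent may differ. */
struct xchk_parent_ctx {
	xfs_ino_t	ino;	/* inode number of the child directory */
	xfs_nlink_t	nlink;	/* dirents found pointing at the child */
};

STATIC int
xchk_parent_count_actor(
	struct xfs_scrub	*sc,
	struct xfs_inode	*dp,
	xfs_dir2_dataptr_t	dapos,
	const struct xfs_name	*name,
	xfs_ino_t		ino,
	void			*priv)
{
	struct xchk_parent_ctx	*spc = priv;
	int			error = 0;

	/* Count every entry in the parent that points back at the child. */
	if (ino == spc->ino)
		spc->nlink++;

	if (xchk_should_terminate(sc, &error))
		return error;
	return 0;
}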
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 9eeac8565394..e6caa358cbda 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -53,6 +53,9 @@ xchk_setup_quota(
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
error = xchk_setup_fs(sc);
if (error)
return error;
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
new file mode 100644
index 000000000000..e51c1544be63
--- /dev/null
+++ b/fs/xfs/scrub/readdir.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/readdir.h"
+
+/* Call a function for every entry in a shortform directory. */
+STATIC int
+xchk_dir_walk_sf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xchk_dirent_fn dirent_fn,
+ void *priv)
+{
+ struct xfs_name name = {
+ .name = ".",
+ .len = 1,
+ .type = XFS_DIR3_FT_DIR,
+ };
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_sf_entry *sfep;
+ struct xfs_dir2_sf_hdr *sfp;
+ xfs_ino_t ino;
+ xfs_dir2_dataptr_t dapos;
+ unsigned int i;
+ int error;
+
+ ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+
+ /* dot entry */
+ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ geo->data_entry_offset);
+
+ error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv);
+ if (error)
+ return error;
+
+ /* dotdot entry */
+ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ geo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, sizeof(".") - 1));
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
+ name.name = "..";
+ name.len = 2;
+
+ error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+ if (error)
+ return error;
+
+ /* iterate everything else */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->count; i++) {
+ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ xfs_dir2_sf_get_offset(sfep));
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ name.type = xfs_dir2_sf_get_ftype(mp, sfep);
+
+ error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+ if (error)
+ return error;
+
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
+ }
+
+ return 0;
+}
+
+/* Call a function for every entry in a block directory. */
+STATIC int
+xchk_dir_walk_block(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xchk_dirent_fn dirent_fn,
+ void *priv)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_buf *bp;
+ unsigned int off, next_off, end;
+ int error;
+
+ error = xfs_dir3_block_read(sc->tp, dp, &bp);
+ if (error)
+ return error;
+
+ /* Walk each directory entry. */
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ for (off = geo->data_entry_offset; off < end; off = next_off) {
+ struct xfs_name name = { };
+ struct xfs_dir2_data_unused *dup = bp->b_addr + off;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + off;
+ xfs_ino_t ino;
+ xfs_dir2_dataptr_t dapos;
+
+ /* Skip an empty entry. */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ next_off = off + be16_to_cpu(dup->length);
+ continue;
+ }
+
+ /* Otherwise, find the next entry and report it. */
+ next_off = off + xfs_dir2_data_entsize(mp, dep->namelen);
+ if (next_off > end)
+ break;
+
+ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off);
+ ino = be64_to_cpu(dep->inumber);
+ name.name = dep->name;
+ name.len = dep->namelen;
+ name.type = xfs_dir2_data_get_ftype(mp, dep);
+
+ error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+ if (error)
+ break;
+ }
+
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
+
+/* Read a leaf-format directory buffer. */
+STATIC int
+xchk_read_leaf_dir_buf(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_da_geometry *geo,
+ xfs_dir2_off_t *curoff,
+ struct xfs_buf **bpp)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec map;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
+ xfs_dablk_t last_da;
+ xfs_dablk_t map_off;
+ xfs_dir2_off_t new_off;
+
+ *bpp = NULL;
+
+ /*
+ * Look for mapped directory blocks at or above the current offset.
+ * Truncate down to the nearest directory block to start the scanning
+ * operation.
+ */
+ last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+ map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff));
+
+ if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
+ return 0;
+ if (map.br_startoff >= last_da)
+ return 0;
+ xfs_trim_extent(&map, map_off, last_da - map_off);
+
+ /* Read the directory block of that first mapping. */
+ new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+ if (new_off > *curoff)
+ *curoff = new_off;
+
+ return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp);
+}
+
+/* Call a function for every entry in a leaf directory. */
+STATIC int
+xchk_dir_walk_leaf(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xchk_dirent_fn dirent_fn,
+ void *priv)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_buf *bp = NULL;
+ xfs_dir2_off_t curoff = 0;
+ unsigned int offset = 0;
+ int error;
+
+ /* Iterate every directory offset in this directory. */
+ while (curoff < XFS_DIR2_LEAF_OFFSET) {
+ struct xfs_name name = { };
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_entry *dep;
+ xfs_ino_t ino;
+ unsigned int length;
+ xfs_dir2_dataptr_t dapos;
+
+ /*
+ * If we have no buffer, or we're off the end of the
+	 * current buffer, we need to get another one.
+ */
+ if (!bp || offset >= geo->blksize) {
+ if (bp) {
+ xfs_trans_brelse(sc->tp, bp);
+ bp = NULL;
+ }
+
+ error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff,
+ &bp);
+ if (error || !bp)
+ break;
+
+ /*
+ * Find our position in the block.
+ */
+ offset = geo->data_entry_offset;
+ curoff += geo->data_entry_offset;
+ }
+
+ /* Skip an empty entry. */
+ dup = bp->b_addr + offset;
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ length = be16_to_cpu(dup->length);
+ offset += length;
+ curoff += length;
+ continue;
+ }
+
+ /* Otherwise, find the next entry and report it. */
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp, dep->namelen);
+
+ dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ ino = be64_to_cpu(dep->inumber);
+ name.name = dep->name;
+ name.len = dep->namelen;
+ name.type = xfs_dir2_data_get_ftype(mp, dep);
+
+ error = dirent_fn(sc, dp, dapos, &name, ino, priv);
+ if (error)
+ break;
+
+ /* Advance to the next entry. */
+ offset += length;
+ curoff += length;
+ }
+
+ if (bp)
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+}
+
+/*
+ * Call a function for every entry in a directory.
+ *
+ * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*.
+ */
+int
+xchk_dir_walk(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ xchk_dirent_fn dirent_fn,
+ void *priv)
+{
+ struct xfs_da_args args = {
+ .dp = dp,
+ .geo = dp->i_mount->m_dir_geo,
+ .trans = sc->tp,
+ };
+ bool isblock;
+ int error;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
+ return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
+
+ /* dir2 functions require that the data fork is loaded */
+ error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ error = xfs_dir2_isblock(&args, &isblock);
+ if (error)
+ return error;
+
+ if (isblock)
+ return xchk_dir_walk_block(sc, dp, dirent_fn, priv);
+
+ return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv);
+}
+
+/*
+ * Look up the inode number for an exact name in a directory.
+ *
+ * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. Names are not
+ * checked for correctness.
+ */
+int
+xchk_dir_lookup(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *ino)
+{
+ struct xfs_da_args args = {
+ .dp = dp,
+ .geo = dp->i_mount->m_dir_geo,
+ .trans = sc->tp,
+ .name = name->name,
+ .namelen = name->len,
+ .filetype = name->type,
+ .hashval = xfs_dir2_hashname(dp->i_mount, name),
+ .whichfork = XFS_DATA_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ };
+ bool isblock, isleaf;
+ int error;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ error = xfs_dir2_sf_lookup(&args);
+ goto out_check_rval;
+ }
+
+ /* dir2 functions require that the data fork is loaded */
+ error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ error = xfs_dir2_isblock(&args, &isblock);
+ if (error)
+ return error;
+
+ if (isblock) {
+ error = xfs_dir2_block_lookup(&args);
+ goto out_check_rval;
+ }
+
+ error = xfs_dir2_isleaf(&args, &isleaf);
+ if (error)
+ return error;
+
+ if (isleaf) {
+ error = xfs_dir2_leaf_lookup(&args);
+ goto out_check_rval;
+ }
+
+ error = xfs_dir2_node_lookup(&args);
+
+out_check_rval:
+ if (error == -EEXIST)
+ error = 0;
+ if (!error)
+ *ino = args.inumber;
+ return error;
+}
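
A hedged usage sketch of the new lookup helper (not part of the patch; the function name and directory entry below are invented for illustration). Per the comment above, the caller must already hold the directory ILOCK:

STATIC int
xchk_example_lookup_child(
	struct xfs_scrub	*sc,
	struct xfs_inode	*dp)
{
	struct xfs_name		name = {
		.name	= "lost+found",		/* illustrative entry name */
		.len	= 10,
		.type	= XFS_DIR3_FT_DIR,
	};
	xfs_ino_t		ino;
	int			error;

	error = xchk_dir_lookup(sc, dp, &name, &ino);
	if (error)
		return error;

	/* On success, ino holds the inode number stored in the dirent. */
	return 0;
}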
diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h
new file mode 100644
index 000000000000..55787f4df123
--- /dev/null
+++ b/fs/xfs/scrub/readdir.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_READDIR_H__
+#define __XFS_SCRUB_READDIR_H__
+
+typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp,
+ xfs_dir2_dataptr_t dapos, const struct xfs_name *name,
+ xfs_ino_t ino, void *priv);
+
+int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp,
+ xchk_dirent_fn dirent_fn, void *priv);
+
+int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp,
+ const struct xfs_name *name, xfs_ino_t *ino);
+
+#endif /* __XFS_SCRUB_READDIR_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index d9c1b3cea4a5..304ea1e1bfb0 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -1,21 +1,22 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_ag.h"
+#include "scrub/trace.h"
/*
* Set us up to scrub reference count btrees.
@@ -24,6 +25,8 @@ int
xchk_setup_ag_refcountbt(
struct xfs_scrub *sc)
{
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
return xchk_setup_ag_btree(sc, false);
}
@@ -300,8 +303,10 @@ xchk_refcountbt_xref_rmap(
goto out_free;
xchk_refcountbt_process_rmap_fragments(&refchk);
- if (irec->rc_refcount != refchk.seen)
+ if (irec->rc_refcount != refchk.seen) {
+ trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen);
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+ }
out_free:
list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
@@ -325,6 +330,107 @@ xchk_refcountbt_xref(
xchk_refcountbt_xref_rmap(sc, irec);
}
+struct xchk_refcbt_records {
+ /* Previous refcount record. */
+ struct xfs_refcount_irec prev_rec;
+
+ /* The next AG block where we aren't expecting shared extents. */
+ xfs_agblock_t next_unshared_agbno;
+
+ /* Number of CoW blocks we expect. */
+ xfs_agblock_t cow_blocks;
+
+ /* Was the last record a shared or CoW staging extent? */
+ enum xfs_refc_domain prev_domain;
+};
+
+STATIC int
+xchk_refcountbt_rmap_check_gap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ xfs_agblock_t *next_bno = priv;
+
+ if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno)
+ return -ECANCELED;
+
+ *next_bno = rec->rm_startblock + rec->rm_blockcount;
+ return 0;
+}
+
+/*
+ * Make sure that a gap in the reference count records does not correspond to
+ * overlapping records (i.e. shared extents) in the reverse mappings.
+ */
+static inline void
+xchk_refcountbt_xref_gaps(
+ struct xfs_scrub *sc,
+ struct xchk_refcbt_records *rrc,
+ xfs_agblock_t bno)
+{
+ struct xfs_rmap_irec low;
+ struct xfs_rmap_irec high;
+ xfs_agblock_t next_bno = NULLAGBLOCK;
+ int error;
+
+ if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur ||
+ xchk_skip_xref(sc->sm))
+ return;
+
+ memset(&low, 0, sizeof(low));
+ low.rm_startblock = rrc->next_unshared_agbno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno - 1;
+
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
+ xchk_refcountbt_rmap_check_gap, &next_bno);
+ if (error == -ECANCELED)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+ else
+ xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur);
+}
+
+static inline bool
+xchk_refcount_mergeable(
+ struct xchk_refcbt_records *rrc,
+ const struct xfs_refcount_irec *r2)
+{
+ const struct xfs_refcount_irec *r1 = &rrc->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+	if (r1->rc_blockcount == 0)
+ return false;
+
+ if (r1->rc_domain != r2->rc_domain)
+ return false;
+ if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
+ return false;
+ if (r1->rc_refcount != r2->rc_refcount)
+ return false;
+ if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
+ MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_refcountbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_refcbt_records *rrc,
+ const struct xfs_refcount_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_refcount_mergeable(rrc, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
+}
+
/* Scrub a refcountbt record. */
STATIC int
xchk_refcountbt_rec(
@@ -332,27 +438,37 @@ xchk_refcountbt_rec(
const union xfs_btree_rec *rec)
{
struct xfs_refcount_irec irec;
- xfs_agblock_t *cow_blocks = bs->private;
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
-
- /* Check the domain and refcount are not incompatible. */
- if (!xfs_refcount_check_domain(&irec))
+ if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
- (*cow_blocks) += irec.rc_blockcount;
-
- /* Check the extent. */
- if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ rrc->cow_blocks += irec.rc_blockcount;
- if (irec.rc_refcount == 0)
+ /* Shared records always come before CoW records. */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
+ rrc->prev_domain == XFS_REFC_DOMAIN_COW)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ rrc->prev_domain = irec.rc_domain;
+ xchk_refcountbt_check_mergeable(bs, rrc, &irec);
xchk_refcountbt_xref(bs->sc, &irec);
+ /*
+ * If this is a record for a shared extent, check that all blocks
+ * between the previous record and this one have at most one reverse
+ * mapping.
+ */
+ if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
+ xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
+ rrc->next_unshared_agbno = irec.rc_startblock +
+ irec.rc_blockcount;
+ }
+
return 0;
}
@@ -394,15 +510,25 @@ int
xchk_refcountbt(
struct xfs_scrub *sc)
{
- xfs_agblock_t cow_blocks = 0;
+ struct xchk_refcbt_records rrc = {
+ .cow_blocks = 0,
+ .next_unshared_agbno = 0,
+ .prev_domain = XFS_REFC_DOMAIN_SHARED,
+ };
int error;
error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
- &XFS_RMAP_OINFO_REFC, &cow_blocks);
+ &XFS_RMAP_OINFO_REFC, &rrc);
if (error)
return error;
- xchk_refcount_xref_rmap(sc, cow_blocks);
+ /*
+ * Check that all blocks between the last refcount > 1 record and the
+ * end of the AG have at most one reverse mapping.
+ */
+ xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks);
+
+ xchk_refcount_xref_rmap(sc, rrc.cow_blocks);
return 0;
}
@@ -458,16 +584,37 @@ xchk_xref_is_not_shared(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- bool shared;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
+ return;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
+}
+
+/* xref check that the extent is not being used for CoW staging. */
+void
+xchk_xref_is_not_cow_staging(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ enum xbtree_recpacking outcome;
int error;
if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED,
- agbno, len, &shared);
+ error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
+ agbno, len, &outcome);
if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
return;
- if (shared)
+ if (outcome != XBTREE_RECPACKING_EMPTY)
xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
}
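
For illustration, here is a pair of records that the xchk_refcount_mergeable() check above would flag (all values invented): they share a domain and a refcount, abut exactly, and their combined length stays under MAXREFCEXTLEN, so they should have been written as a single record.

struct xfs_refcount_irec left = {
	.rc_domain	= XFS_REFC_DOMAIN_SHARED,
	.rc_startblock	= 100,
	.rc_blockcount	= 10,
	.rc_refcount	= 2,
};
struct xfs_refcount_irec right = {
	.rc_domain	= XFS_REFC_DOMAIN_SHARED,
	.rc_startblock	= 110,	/* begins exactly where 'left' ends */
	.rc_blockcount	= 5,
	.rc_refcount	= 2,	/* same refcount as 'left' */
};
/* With rrc.prev_rec == left, xchk_refcount_mergeable(&rrc, &right) is true. */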
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b71174ec0d6..ac6d8803e660 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2018 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -60,6 +60,9 @@ xrep_attempt(
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
+ case -ECHRNG:
+ sc->flags |= XCHK_NEED_DRAIN;
+ return -EAGAIN;
case -EDEADLOCK:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
@@ -442,6 +445,30 @@ xrep_init_btblock(
* buffers associated with @bitmap.
*/
+static int
+xrep_invalidate_block(
+ uint64_t fsbno,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ struct xfs_buf *bp;
+ int error;
+
+ /* Skip AG headers and post-EOFS blocks */
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ return 0;
+
+ error = xfs_buf_incore(sc->mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+ if (error)
+ return 0;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ return 0;
+}
+
/*
* Invalidate buffers for per-AG btree blocks we're dumping. This function
* is not intended for use with file data repairs; we have bunmapi for that.
@@ -451,11 +478,6 @@ xrep_invalidate_blocks(
struct xfs_scrub *sc,
struct xbitmap *bitmap)
{
- struct xbitmap_range *bmr;
- struct xbitmap_range *n;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
-
/*
* For each block in each extent, see if there's an incore buffer for
* exactly that block; if so, invalidate it. The buffer cache only
@@ -464,23 +486,7 @@ xrep_invalidate_blocks(
* because we never own those; and if we can't TRYLOCK the buffer we
* assume it's owned by someone else.
*/
- for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
- int error;
-
- /* Skip AG headers and post-EOFS blocks */
- if (!xfs_verify_fsbno(sc->mp, fsbno))
- continue;
- error = xfs_buf_incore(sc->mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
- if (error)
- continue;
-
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- }
-
- return 0;
+ return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
}
/* Ensure the freelist is the correct size. */
@@ -501,6 +507,15 @@ xrep_fix_freelist(
can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}
+/* Information about reaping extents after a repair. */
+struct xrep_reap_state {
+ struct xfs_scrub *sc;
+
+ /* Reverse mapping owner and metadata reservation type. */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+};
+
/*
* Put a block back on the AGFL.
*/
@@ -545,17 +560,23 @@ xrep_put_freelist(
/* Dispose of a single block. */
STATIC int
xrep_reap_block(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsbno,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type resv)
+ uint64_t fsbno,
+ void *priv)
{
+ struct xrep_reap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
struct xfs_btree_cur *cur;
struct xfs_buf *agf_bp = NULL;
xfs_agblock_t agbno;
bool has_other_rmap;
int error;
+ ASSERT(sc->ip != NULL ||
+ XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
+ trace_xrep_dispose_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
+
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
@@ -574,7 +595,8 @@ xrep_reap_block(
cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
/* Can we find any other rmappings? */
- error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
+ &has_other_rmap);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_free;
@@ -594,11 +616,12 @@ xrep_reap_block(
*/
if (has_other_rmap)
error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
- 1, oinfo);
- else if (resv == XFS_AG_RESV_AGFL)
+ 1, rs->oinfo);
+ else if (rs->resv == XFS_AG_RESV_AGFL)
error = xrep_put_freelist(sc, agbno);
else
- error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
+ rs->resv);
if (agf_bp != sc->sa.agf_bp)
xfs_trans_brelse(sc->tp, agf_bp);
if (error)
@@ -622,26 +645,15 @@ xrep_reap_extents(
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- struct xbitmap_range *bmr;
- struct xbitmap_range *n;
- xfs_fsblock_t fsbno;
- int error = 0;
+ struct xrep_reap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = type,
+ };
ASSERT(xfs_has_rmapbt(sc->mp));
- for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
- ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
- trace_xrep_dispose_btree_extent(sc->mp,
- XFS_FSB_TO_AGNO(sc->mp, fsbno),
- XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
-
- error = xrep_reap_block(sc, fsbno, oinfo, type);
- if (error)
- break;
- }
-
- return error;
+ return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
}
/*
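
The two conversions above imply the xbitmap_walk_bits() callback convention: every set bit (here an fsblock number) is passed to the callback along with an opaque pointer, and a nonzero return value stops the walk. A trivial illustrative callback, not part of the patch:

static int
xrep_example_count_block(
	uint64_t	fsbno,
	void		*priv)
{
	uint64_t	*nr_blocks = priv;

	(*nr_blocks)++;		/* count each block recorded in the bitmap */
	return 0;
}

/* Usage sketch: error = xbitmap_walk_bits(bitmap, xrep_example_count_block, &nr); */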
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 840f74ec431c..dce791c679ee 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2018 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_REPAIR_H__
#define __XFS_SCRUB_REPAIR_H__
@@ -31,6 +31,7 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
const struct xfs_buf_ops *ops);
struct xbitmap;
+struct xagb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 229826b2e1c0..d29a26ecddd6 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -1,21 +1,29 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
+#include "xfs_ag.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_refcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/bitmap.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -24,11 +32,39 @@ int
xchk_setup_ag_rmapbt(
struct xfs_scrub *sc)
{
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
return xchk_setup_ag_btree(sc, false);
}
/* Reverse-mapping scrubber. */
+struct xchk_rmap {
+ /*
+ * The furthest-reaching of the rmapbt records that we've already
+ * processed. This enables us to detect overlapping records for space
+ * allocations that cannot be shared.
+ */
+ struct xfs_rmap_irec overlap_rec;
+
+ /*
+ * The previous rmapbt record, so that we can check for two records
+ * that could be one.
+ */
+ struct xfs_rmap_irec prev_rec;
+
+ /* Bitmaps containing all blocks for each type of AG metadata. */
+ struct xagb_bitmap fs_owned;
+ struct xagb_bitmap log_owned;
+ struct xagb_bitmap ag_owned;
+ struct xagb_bitmap inobt_owned;
+ struct xagb_bitmap refcbt_owned;
+
+ /* Did we complete the AG space metadata bitmaps? */
+ bool bitmaps_complete;
+};
+
/* Cross-reference a rmap against the refcount btree. */
STATIC void
xchk_rmapbt_xref_refc(
@@ -84,80 +120,415 @@ xchk_rmapbt_xref(
xchk_rmapbt_xref_refc(sc, irec);
}
-/* Scrub an rmapbt record. */
-STATIC int
-xchk_rmapbt_rec(
- struct xchk_btree *bs,
- const union xfs_btree_rec *rec)
+/*
+ * Check for bogus UNWRITTEN flags in the rmapbt node block keys.
+ *
+ * In reverse mapping records, the file mapping extent state
+ * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not
+ * involved in lookups in any way. In older kernels, the functions that
+ * convert rmapbt records to keys forgot to filter out the extent state bit,
+ * even though the key comparison functions have filtered the flag correctly.
+ * If we spot an rmap key with the unwritten bit set in rm_offset, we should
+ * mark the btree as needing optimization to rebuild the btree without those
+ * flags.
+ */
+STATIC void
+xchk_rmapbt_check_unwritten_in_keyflags(
+ struct xchk_btree *bs)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
- struct xfs_rmap_irec irec;
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
- bool non_inode;
- bool is_unwritten;
- bool is_bmbt;
- bool is_attr;
- int error;
+ struct xfs_scrub *sc = bs->sc;
+ struct xfs_btree_cur *cur = bs->cur;
+ struct xfs_btree_block *keyblock;
+ union xfs_btree_key *lkey, *hkey;
+ __be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
+ unsigned int level;
- error = xfs_rmap_btrec_to_irec(rec, &irec);
- if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error))
- goto out;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+ return;
+
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ struct xfs_buf *bp;
+ unsigned int ptr;
+
+ /* Only check the first time we've seen this node block. */
+ if (cur->bc_levels[level].ptr > 1)
+ continue;
+
+ keyblock = xfs_btree_get_block(cur, level, &bp);
+ for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) {
+ lkey = xfs_btree_key_addr(cur, ptr, keyblock);
+
+ if (lkey->rmap.rm_offset & badflag) {
+ xchk_btree_set_preen(sc, cur, level);
+ break;
+ }
+
+ hkey = xfs_btree_high_key_addr(cur, ptr, keyblock);
+ if (hkey->rmap.rm_offset & badflag) {
+ xchk_btree_set_preen(sc, cur, level);
+ break;
+ }
+ }
+ }
+}
+
+static inline bool
+xchk_rmapbt_is_shareable(
+ struct xfs_scrub *sc,
+ const struct xfs_rmap_irec *irec)
+{
+ if (!xfs_has_reflink(sc->mp))
+ return false;
+ if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner))
+ return false;
+ if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK |
+ XFS_RMAP_UNWRITTEN))
+ return false;
+ return true;
+}
+
+/* Flag failures for records that overlap but cannot. */
+STATIC void
+xchk_rmapbt_check_overlapping(
+ struct xchk_btree *bs,
+ struct xchk_rmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ xfs_agblock_t pnext, inext;
+
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ /* No previous record? */
+ if (cr->overlap_rec.rm_blockcount == 0)
+ goto set_prev;
+
+ /* Do overlap_rec and irec overlap? */
+ pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount;
+ if (pnext <= irec->rm_startblock)
+ goto set_prev;
- /* Check extent. */
- if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock)
+ /* Overlap is only allowed if both records are data fork mappings. */
+ if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) ||
+ !xchk_rmapbt_is_shareable(bs->sc, irec))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- if (irec.rm_owner == XFS_RMAP_OWN_FS) {
+ /* Save whichever rmap record extends furthest. */
+ inext = irec->rm_startblock + irec->rm_blockcount;
+ if (pnext > inext)
+ return;
+
+set_prev:
+ memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Decide if two reverse-mapping records can be merged. */
+static inline bool
+xchk_rmap_mergeable(
+ struct xchk_rmap *cr,
+ const struct xfs_rmap_irec *r2)
+{
+ const struct xfs_rmap_irec *r1 = &cr->prev_rec;
+
+ /* Ignore if prev_rec is not yet initialized. */
+ if (cr->prev_rec.rm_blockcount == 0)
+ return false;
+
+ if (r1->rm_owner != r2->rm_owner)
+ return false;
+ if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
+ return false;
+ if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
+ XFS_RMAP_LEN_MAX)
+ return false;
+ if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner))
+ return true;
+ /* must be an inode owner below here */
+ if (r1->rm_flags != r2->rm_flags)
+ return false;
+ if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ return true;
+ return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
+}
+
+/* Flag failures for records that could be merged. */
+STATIC void
+xchk_rmapbt_check_mergeable(
+ struct xchk_btree *bs,
+ struct xchk_rmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ if (xchk_rmap_mergeable(cr, irec))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+
+ memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec));
+}
+
+/* Compare an rmap for AG metadata against the metadata walk. */
+STATIC int
+xchk_rmapbt_mark_bitmap(
+ struct xchk_btree *bs,
+ struct xchk_rmap *cr,
+ const struct xfs_rmap_irec *irec)
+{
+ struct xfs_scrub *sc = bs->sc;
+ struct xagb_bitmap *bmp = NULL;
+ xfs_extlen_t fsbcount = irec->rm_blockcount;
+
+ /*
+ * Skip corrupt records. It is essential that we detect records in the
+ * btree that cannot overlap but do, flag those as CORRUPT, and skip
+ * the bitmap comparison to avoid generating false XCORRUPT reports.
+ */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /*
+ * If the AG metadata walk didn't complete, there's no point in
+ * comparing against partial results.
+ */
+ if (!cr->bitmaps_complete)
+ return 0;
+
+ switch (irec->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ bmp = &cr->fs_owned;
+ break;
+ case XFS_RMAP_OWN_LOG:
+ bmp = &cr->log_owned;
+ break;
+ case XFS_RMAP_OWN_AG:
+ bmp = &cr->ag_owned;
+ break;
+ case XFS_RMAP_OWN_INOBT:
+ bmp = &cr->inobt_owned;
+ break;
+ case XFS_RMAP_OWN_REFC:
+ bmp = &cr->refcbt_owned;
+ break;
+ }
+
+ if (!bmp)
+ return 0;
+
+ if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) {
/*
- * xfs_verify_agbno returns false for static fs metadata.
- * Since that only exists at the start of the AG, validate
- * that by hand.
+ * The start of this reverse mapping corresponds to a set
+ * region in the bitmap. If the mapping covers more area than
+ * the set region, then it covers space that wasn't found by
+ * the AG metadata walk.
*/
- if (irec.rm_startblock != 0 ||
- irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1)
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ if (fsbcount < irec->rm_blockcount)
+ xchk_btree_xref_set_corrupt(bs->sc,
+ bs->sc->sa.rmap_cur, 0);
} else {
/*
- * Otherwise we must point somewhere past the static metadata
- * but before the end of the FS. Run the regular check.
+ * The start of this reverse mapping does not correspond to a
+ * completely set region in the bitmap. The region wasn't
+ * fully set by walking the AG metadata, so this is a
+ * cross-referencing corruption.
*/
- if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
- !xfs_verify_agbno(pag, irec.rm_startblock +
- irec.rm_blockcount - 1))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0);
}
- /* Check flags. */
- non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner);
- is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK;
- is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK;
- is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN;
+ /* Unset the region so that we can detect missing rmap records. */
+ return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount);
+}
- if (is_bmbt && irec.rm_offset != 0)
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+/* Scrub an rmapbt record. */
+STATIC int
+xchk_rmapbt_rec(
+ struct xchk_btree *bs,
+ const union xfs_btree_rec *rec)
+{
+ struct xchk_rmap *cr = bs->private;
+ struct xfs_rmap_irec irec;
- if (non_inode && irec.rm_offset != 0)
+ if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
+ xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
- if (is_unwritten && (is_bmbt || non_inode || is_attr))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_rmapbt_check_unwritten_in_keyflags(bs);
+ xchk_rmapbt_check_mergeable(bs, cr, &irec);
+ xchk_rmapbt_check_overlapping(bs, cr, &irec);
+ xchk_rmapbt_xref(bs->sc, &irec);
- if (non_inode && (is_bmbt || is_unwritten || is_attr))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return xchk_rmapbt_mark_bitmap(bs, cr, &irec);
+}
- if (!non_inode) {
- if (!xfs_verify_ino(mp, irec.rm_owner))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- } else {
- /* Non-inode owner within the magic values? */
- if (irec.rm_owner <= XFS_RMAP_OWN_MIN ||
- irec.rm_owner > XFS_RMAP_OWN_FS)
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xchk_rmapbt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+
+ return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/*
+ * Set up bitmaps mapping all the AG metadata to compare with the rmapbt
+ * records.
+ *
+ * Grab our own btree cursors here if the scrub setup function didn't give us a
+ * btree cursor due to reports of poor health. We need to find out if the
+ * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being
+ * XCORRUPT.
+ */
+STATIC int
+xchk_rmapbt_walk_ag_metadata(
+ struct xfs_scrub *sc,
+ struct xchk_rmap *cr)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agfl_bp;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_btree_cur *cur;
+ int error;
+
+ /* OWN_FS: AG headers */
+ error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp),
+ XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1);
+ if (error)
+ goto out;
+
+ /* OWN_LOG: Internal log */
+ if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) {
+ error = xagb_bitmap_set(&cr->log_owned,
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
+ mp->m_sb.sb_logblocks);
+ if (error)
+ goto out;
+ }
+
+ /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
+ cur = sc->sa.bno_cur;
+ if (!cur)
+ cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
+ if (cur != sc->sa.bno_cur)
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ cur = sc->sa.cnt_cur;
+ if (!cur)
+ cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
+ if (cur != sc->sa.cnt_cur)
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur);
+ if (error)
+ goto out;
+
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ goto out;
+
+ error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl,
+ &cr->ag_owned);
+ xfs_trans_brelse(sc->tp, agfl_bp);
+ if (error)
+ goto out;
+
+ /* OWN_INOBT: inobt, finobt */
+ cur = sc->sa.ino_cur;
+ if (!cur)
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
+ XFS_BTNUM_INO);
+ error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
+ if (cur != sc->sa.ino_cur)
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ if (xfs_has_finobt(sc->mp)) {
+ cur = sc->sa.fino_cur;
+ if (!cur)
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+ sc->sa.agi_bp, XFS_BTNUM_FINO);
+ error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
+ if (cur != sc->sa.fino_cur)
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+ }
+
+ /* OWN_REFC: refcountbt */
+ if (xfs_has_reflink(sc->mp)) {
+ cur = sc->sa.refc_cur;
+ if (!cur)
+ cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp,
+ sc->sa.agf_bp, sc->sa.pag);
+ error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur);
+ if (cur != sc->sa.refc_cur)
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
}
- xchk_rmapbt_xref(bs->sc, &irec);
out:
- return error;
+ /*
+ * If there's an error, set XFAIL and disable the bitmap
+ * cross-referencing checks, but proceed with the scrub anyway.
+ */
+ if (error)
+ xchk_btree_xref_process_error(sc, sc->sa.rmap_cur,
+ sc->sa.rmap_cur->bc_nlevels - 1, &error);
+ else
+ cr->bitmaps_complete = true;
+ return 0;
+}
+
+/*
+ * Check for set regions in the bitmaps; if there are any, the rmap records do
+ * not describe all the AG metadata.
+ */
+STATIC void
+xchk_rmapbt_check_bitmaps(
+ struct xfs_scrub *sc,
+ struct xchk_rmap *cr)
+{
+ struct xfs_btree_cur *cur = sc->sa.rmap_cur;
+ unsigned int level;
+
+ if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XFAIL))
+ return;
+ if (!cur)
+ return;
+ level = cur->bc_nlevels - 1;
+
+ /*
+ * Any bitmap with bits still set indicates that the reverse mapping
+ * doesn't cover the entire primary structure.
+ */
+ if (xagb_bitmap_hweight(&cr->fs_owned) != 0)
+ xchk_btree_xref_set_corrupt(sc, cur, level);
+
+ if (xagb_bitmap_hweight(&cr->log_owned) != 0)
+ xchk_btree_xref_set_corrupt(sc, cur, level);
+
+ if (xagb_bitmap_hweight(&cr->ag_owned) != 0)
+ xchk_btree_xref_set_corrupt(sc, cur, level);
+
+ if (xagb_bitmap_hweight(&cr->inobt_owned) != 0)
+ xchk_btree_xref_set_corrupt(sc, cur, level);
+
+ if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0)
+ xchk_btree_xref_set_corrupt(sc, cur, level);
}
/* Scrub the rmap btree for some AG. */
@@ -165,42 +536,63 @@ int
xchk_rmapbt(
struct xfs_scrub *sc)
{
- return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
- &XFS_RMAP_OINFO_AG, NULL);
+ struct xchk_rmap *cr;
+ int error;
+
+ cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS);
+ if (!cr)
+ return -ENOMEM;
+
+ xagb_bitmap_init(&cr->fs_owned);
+ xagb_bitmap_init(&cr->log_owned);
+ xagb_bitmap_init(&cr->ag_owned);
+ xagb_bitmap_init(&cr->inobt_owned);
+ xagb_bitmap_init(&cr->refcbt_owned);
+
+ error = xchk_rmapbt_walk_ag_metadata(sc, cr);
+ if (error)
+ goto out;
+
+ error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
+ &XFS_RMAP_OINFO_AG, cr);
+ if (error)
+ goto out;
+
+ xchk_rmapbt_check_bitmaps(sc, cr);
+
+out:
+ xagb_bitmap_destroy(&cr->refcbt_owned);
+ xagb_bitmap_destroy(&cr->inobt_owned);
+ xagb_bitmap_destroy(&cr->ag_owned);
+ xagb_bitmap_destroy(&cr->log_owned);
+ xagb_bitmap_destroy(&cr->fs_owned);
+ kfree(cr);
+ return error;
}
-/* xref check that the extent is owned by a given owner */
-static inline void
-xchk_xref_check_owner(
+/* xref check that the extent is owned only by a given owner */
+void
+xchk_xref_is_only_owned_by(
struct xfs_scrub *sc,
xfs_agblock_t bno,
xfs_extlen_t len,
- const struct xfs_owner_info *oinfo,
- bool should_have_rmap)
+ const struct xfs_owner_info *oinfo)
{
- bool has_rmap;
+ struct xfs_rmap_matches res;
int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
- &has_rmap);
+ error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
- if (has_rmap != should_have_rmap)
+ if (res.matches != 1)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+ if (res.bad_non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+ if (res.non_owner_matches)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
-}
-
-/* xref check that the extent is owned by a given owner */
-void
-xchk_xref_is_owned_by(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- const struct xfs_owner_info *oinfo)
-{
- xchk_xref_check_owner(sc, bno, len, oinfo, true);
}
/* xref check that the extent is not owned by a given owner */
@@ -211,7 +603,19 @@ xchk_xref_is_not_owned_by(
xfs_extlen_t len,
const struct xfs_owner_info *oinfo)
{
- xchk_xref_check_owner(sc, bno, len, oinfo, false);
+ struct xfs_rmap_matches res;
+ int error;
+
+ if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
+ return;
+
+ error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
+ return;
+ if (res.matches != 0)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
+ if (res.bad_non_owner_matches)
+ xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
/* xref check that the extent has no reverse mapping at all */
@@ -221,15 +625,15 @@ xchk_xref_has_no_owner(
xfs_agblock_t bno,
xfs_extlen_t len)
{
- bool has_rmap;
+ enum xbtree_recpacking outcome;
int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
- error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
+ error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
- if (has_rmap)
+ if (outcome != XBTREE_RECPACKING_EMPTY)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
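
As a worked illustration of the xchk_rmapbt_check_overlapping() logic above (values invented): overlapping records are tolerated only when both sides are shareable data-fork mappings, so a pair like the following would be flagged as corruption.

struct xfs_rmap_irec a = {
	.rm_startblock	= 200,
	.rm_blockcount	= 8,
	.rm_owner	= XFS_RMAP_OWN_AG,	/* non-inode owner, never shareable */
};
struct xfs_rmap_irec b = {
	.rm_startblock	= 204,	/* overlaps the last four blocks of 'a' */
	.rm_blockcount	= 8,
	.rm_owner	= XFS_RMAP_OWN_INOBT,
};
/* xchk_rmapbt_is_shareable() is false for both, so the overlap is corrupt. */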
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 0a3bde64c675..e7dace7b4be8 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 07a7a75f987f..02819bedc5b1 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
@@ -145,6 +145,21 @@ xchk_probe(
/* Scrub setup and teardown */
+static inline void
+xchk_fsgates_disable(
+ struct xfs_scrub *sc)
+{
+ if (!(sc->flags & XCHK_FSGATES_ALL))
+ return;
+
+ trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
+
+ if (sc->flags & XCHK_FSGATES_DRAIN)
+ xfs_drain_wait_disable();
+
+ sc->flags &= ~XCHK_FSGATES_ALL;
+}
+
/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
@@ -166,7 +181,7 @@ xchk_teardown(
xfs_iunlock(sc->ip, sc->ilock_flags);
if (sc->ip != ip_in &&
!xfs_internal_inum(sc->mp, sc->ip->i_ino))
- xfs_irele(sc->ip);
+ xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
@@ -174,9 +189,14 @@ xchk_teardown(
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
if (sc->buf) {
+ if (sc->buf_cleanup)
+ sc->buf_cleanup(sc->buf);
kvfree(sc->buf);
+ sc->buf_cleanup = NULL;
sc->buf = NULL;
}
+
+ xchk_fsgates_disable(sc);
return error;
}
@@ -191,25 +211,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_SB] = { /* superblock */
.type = ST_PERAG,
- .setup = xchk_setup_fs,
+ .setup = xchk_setup_agheader,
.scrub = xchk_superblock,
.repair = xrep_superblock,
},
[XFS_SCRUB_TYPE_AGF] = { /* agf */
.type = ST_PERAG,
- .setup = xchk_setup_fs,
+ .setup = xchk_setup_agheader,
.scrub = xchk_agf,
.repair = xrep_agf,
},
[XFS_SCRUB_TYPE_AGFL]= { /* agfl */
.type = ST_PERAG,
- .setup = xchk_setup_fs,
+ .setup = xchk_setup_agheader,
.scrub = xchk_agfl,
.repair = xrep_agfl,
},
[XFS_SCRUB_TYPE_AGI] = { /* agi */
.type = ST_PERAG,
- .setup = xchk_setup_fs,
+ .setup = xchk_setup_agheader,
.scrub = xchk_agi,
.repair = xrep_agi,
},
@@ -491,23 +511,20 @@ retry_op:
/* Set up for the operation. */
error = sc->ops->setup(sc);
+ if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
+ goto try_harder;
+ if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
+ goto need_drain;
if (error)
goto out_teardown;
/* Scrub for errors. */
error = sc->ops->scrub(sc);
- if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
- /*
- * Scrubbers return -EDEADLOCK to mean 'try harder'.
- * Tear down everything we hold, then set up again with
- * preparation for worst-case scenarios.
- */
- error = xchk_teardown(sc, 0);
- if (error)
- goto out_sc;
- sc->flags |= XCHK_TRY_HARDER;
- goto retry_op;
- } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
+ if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
+ goto try_harder;
+ if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
+ goto need_drain;
+ if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
goto out_teardown;
xchk_update_health(sc);
@@ -565,4 +582,21 @@ out:
error = 0;
}
return error;
+need_drain:
+ error = xchk_teardown(sc, 0);
+ if (error)
+ goto out_sc;
+ sc->flags |= XCHK_NEED_DRAIN;
+ goto retry_op;
+try_harder:
+ /*
+ * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down
+ * everything we hold, then set up again with preparation for
+ * worst-case scenarios.
+ */
+ error = xchk_teardown(sc, 0);
+ if (error)
+ goto out_sc;
+ sc->flags |= XCHK_TRY_HARDER;
+ goto retry_op;
}
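
The xchk_fsgates_disable() helper above pairs with an enable side that the setup hunks call (quota, refcountbt and rmapbt above). The real xchk_fsgates_enable() lives in scrub/common.c and is not shown here; a plausible minimal sketch, assuming an xfs_drain_wait_enable() counterpart to the xfs_drain_wait_disable() call above:

void
xchk_fsgates_enable(
	struct xfs_scrub	*sc,
	unsigned int		scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	/* Flip the gate only if this scrub has not already enabled it. */
	if ((scrub_fsgates & XCHK_FSGATES_DRAIN) &&
	    !(sc->flags & XCHK_FSGATES_DRAIN))
		xfs_drain_wait_enable();	/* assumed counterpart helper */

	sc->flags |= scrub_fsgates;
}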
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index b4d391b4c938..e71903474cd7 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_SCRUB_H__
#define __XFS_SCRUB_SCRUB_H__
@@ -77,7 +77,17 @@ struct xfs_scrub {
*/
struct xfs_inode *ip;
+ /* Kernel memory buffer used by scrubbers; freed at teardown. */
void *buf;
+
+ /*
+ * Clean up resources owned by whatever is in the buffer. Cleanup can
+ * be deferred with this hook as a means for scrub functions to pass
+ * data to repair functions. This function must not free the buffer
+ * itself.
+ */
+ void (*buf_cleanup)(void *buf);
+
uint ilock_flags;
/* See the XCHK/XREP state flags below. */
@@ -96,9 +106,19 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
-#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
+#define XCHK_REAPING_DISABLED (1 << 1) /* background block reaping paused */
+#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */
+#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
+/*
+ * The XCHK_FSGATES* flags reflect functionality in the main filesystem that
+ * is only enabled for this particular online fsck. When not in use, the
+ * features are gated off via dynamic code patching, which is why the state
+ * must be enabled during scrub setup and can only be torn down afterwards.
+ */
+#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN)
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -152,7 +172,7 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
-void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
+void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len, const struct xfs_owner_info *oinfo);
@@ -162,6 +182,8 @@ void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
xfs_extlen_t len);
void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno,
xfs_extlen_t len);
+void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
+ xfs_extlen_t len);
#ifdef CONFIG_XFS_RT
void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
xfs_extlen_t len);
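
A hedged sketch of how a scrubber might use the new buf_cleanup hook (every name below is invented for illustration): the hook releases whatever the buffer owns, while xchk_teardown() still kvfree()s the buffer itself.

struct xchk_example_info {
	char			*scratch;	/* separately allocated data */
};

static void
xchk_example_buf_cleanup(
	void			*buf)
{
	struct xchk_example_info *ei = buf;

	kfree(ei->scratch);	/* free what the buffer owns, never buf itself */
}

STATIC int
xchk_example_setup(
	struct xfs_scrub	*sc)
{
	struct xchk_example_info *ei;

	ei = kzalloc(sizeof(*ei), XCHK_GFP_FLAGS);
	if (!ei)
		return -ENOMEM;

	ei->scratch = kzalloc(PAGE_SIZE, XCHK_GFP_FLAGS);
	if (!ei->scratch) {
		kfree(ei);
		return -ENOMEM;
	}

	sc->buf = ei;
	sc->buf_cleanup = xchk_example_buf_cleanup;
	return 0;
}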
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index c1c99ffe7408..38708fb9a5d7 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index b5f94676c37c..0a975439d2b6 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 93ece6df02e3..68efd6fda61c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*
* NOTE: none of these tracepoints shall be considered a stable kernel ABI
* as they can change at any time. See xfs_trace.h for documentation of
@@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
+TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
+
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
@@ -93,6 +96,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_OFLAG_WARNING, "warning" }, \
{ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+#define XFS_SCRUB_STATE_STRINGS \
+ { XCHK_TRY_HARDER, "try_harder" }, \
+ { XCHK_REAPING_DISABLED, "reaping_disabled" }, \
+ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
+ { XCHK_NEED_DRAIN, "need_drain" }, \
+ { XREP_ALREADY_FIXED, "already_fixed" }
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -139,6 +149,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry);
DEFINE_SCRUB_EVENT(xrep_attempt);
DEFINE_SCRUB_EVENT(xrep_done);
+DECLARE_EVENT_CLASS(xchk_fsgate_class,
+ TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags),
+ TP_ARGS(sc, fsgate_flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(unsigned int, fsgate_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->fsgate_flags = fsgate_flags;
+ ),
+ TP_printk("dev %d:%d type %s fsgates '%s'",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS))
+)
+
+#define DEFINE_SCRUB_FSHOOK_EVENT(name) \
+DEFINE_EVENT(xchk_fsgate_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \
+ TP_ARGS(sc, fsgates_flags))
+
+DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable);
+DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable);
+
TRACE_EVENT(xchk_op_error,
TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int error, void *ret_ip),
@@ -657,6 +694,38 @@ TRACE_EVENT(xchk_fscounters_within_range,
__entry->old_value)
)
+TRACE_EVENT(xchk_refcount_incorrect,
+ TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
+ xfs_nlink_t seen),
+ TP_ARGS(pag, irec, seen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ __field(xfs_nlink_t, seen)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->domain = irec->rc_domain;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ __entry->seen = seen;
+ ),
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount,
+ __entry->seen)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h
index 2ceae614ade8..a39befa743ce 100644
--- a/fs/xfs/scrub/xfs_scrub.h
+++ b/fs/xfs/scrub/xfs_scrub.h
@@ -1,7 +1,7 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Copyright (C) 2017 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_SCRUB_H__
#define __XFS_SCRUB_H__
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 6e2f0013380a..7551c3ec4ea5 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -24,6 +24,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_ag.h"
struct kmem_cache *xfs_bui_cache;
struct kmem_cache *xfs_bud_cache;
@@ -363,6 +364,34 @@ xfs_bmap_update_create_done(
return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
}
+/* Take a passive ref to the AG containing the space we're mapping. */
+void
+xfs_bmap_update_get_group(
+ struct xfs_mount *mp,
+ struct xfs_bmap_intent *bi)
+{
+ xfs_agnumber_t agno;
+
+ agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
+
+ /*
+ * Bump the intent count on behalf of the deferred rmap and refcount
+ * intent items that that we can queue when we finish this bmap work.
+ * This new intent item will bump the intent count before the bmap
+ * intent drops the intent count, ensuring that the intent count
+ * remains nonzero across the transaction roll.
+ */
+ bi->bi_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing mapping work. */
+static inline void
+xfs_bmap_update_put_group(
+ struct xfs_bmap_intent *bi)
+{
+ xfs_perag_intent_put(bi->bi_pag);
+}
+
/* Process a deferred rmap update. */
STATIC int
xfs_bmap_update_finish_item(
@@ -381,6 +410,8 @@ xfs_bmap_update_finish_item(
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
+
+ xfs_bmap_update_put_group(bi);
kmem_cache_free(xfs_bmap_intent_cache, bi);
return error;
}
@@ -393,7 +424,7 @@ xfs_bmap_update_abort_intent(
xfs_bui_release(BUI_ITEM(intent));
}
-/* Cancel a deferred rmap update. */
+/* Cancel a deferred bmap update. */
STATIC void
xfs_bmap_update_cancel_item(
struct list_head *item)
@@ -401,6 +432,8 @@ xfs_bmap_update_cancel_item(
struct xfs_bmap_intent *bi;
bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ xfs_bmap_update_put_group(bi);
kmem_cache_free(xfs_bmap_intent_cache, bi);
}
@@ -509,10 +542,12 @@ xfs_bui_item_recover(
fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ xfs_bmap_update_get_group(mp, &fake);
error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
sizeof(*map));
+ xfs_bmap_update_put_group(&fake);
if (error)
goto err_cancel;
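
The xfs_bmap_item.c hunks above give every deferred bmap work item a passive reference to its AG for as long as the item is pending. A minimal sketch of that lifetime, using only the helpers introduced above (example_bmap_intent_lifetime is illustrative and not part of the patch):

/*
 * Illustration only: the lifetime of the AG pin added by the hunks above.
 * A deferred bmap work item takes a passive intent reference when it is
 * created and drops it when the item is finished or cancelled, so the AG's
 * intent count stays nonzero across the transaction rolls in between.
 */
static void example_bmap_intent_lifetime(
	struct xfs_mount	*mp,
	struct xfs_bmap_intent	*bi)
{
	/* creation: pin the AG so online fsck can see work in flight */
	xfs_bmap_update_get_group(mp, bi);

	/* ... transaction rolls; queued RUIs/CUIs take their own pins ... */

	/* completion or cancellation: unpin, then free the item */
	xfs_perag_intent_put(bi->bi_pag);
	kmem_cache_free(xfs_bmap_intent_cache, bi);
}
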
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a09dd2606479..f032d3a4b727 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -314,15 +314,13 @@ xfs_getbmap_report_one(
if (isnullstartblock(got->br_startblock) ||
got->br_startblock == DELAYSTARTBLOCK) {
/*
- * Delalloc extents that start beyond EOF can occur due to
- * speculative EOF allocation when the delalloc extent is larger
- * than the largest freespace extent at conversion time. These
- * extents cannot be converted by data writeback, so can exist
- * here even if we are not supposed to be finding delalloc
- * extents.
+ * Take the flush completion as being a point-in-time snapshot
+ * where there are no delalloc extents, and if any new ones
+ * have been created racily, just skip them as being 'after'
+ * the flush and so don't get reported.
*/
- if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
- ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+ if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
+ return 0;
p->bmv_oflags |= BMV_OF_DELALLOC;
p->bmv_block = -2;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c..15d1e5a7c2d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -286,8 +286,7 @@ xfs_buf_free_pages(
if (bp->b_pages[i])
__free_page(bp->b_pages[i]);
}
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += bp->b_page_count;
+ mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
kmem_free(bp->b_pages);
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index ffa94102094d..43167f543afc 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -943,6 +943,16 @@ xlog_recover_buf_commit_pass2(
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+ /*
+ * We're skipping replay of this buffer log item due to the log
+ * item LSN being behind the ondisk buffer. Verify the buffer
+ * contents since we aren't going to run the write verifier.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_read(bp);
+ error = bp->b_error;
+ }
goto out_release;
}
diff --git a/fs/xfs/xfs_dahash_test.c b/fs/xfs/xfs_dahash_test.c
index 230651ab5ce4..0dab5941e080 100644
--- a/fs/xfs/xfs_dahash_test.c
+++ b/fs/xfs/xfs_dahash_test.c
@@ -9,6 +9,9 @@
#include "xfs_format.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_dir2_priv.h"
#include "xfs_dahash_test.h"
/* 4096 random bytes */
@@ -533,108 +536,109 @@ static struct dahash_test {
uint16_t start; /* random 12 bit offset in buf */
uint16_t length; /* random 8 bit length of test */
xfs_dahash_t dahash; /* expected dahash result */
+ xfs_dahash_t ascii_ci_dahash; /* expected ascii-ci dahash result */
} test[] __initdata =
{
- {0x0567, 0x0097, 0x96951389},
- {0x0869, 0x0055, 0x6455ab4f},
- {0x0c51, 0x00be, 0x8663afde},
- {0x044a, 0x00fc, 0x98fbe432},
- {0x0f29, 0x0079, 0x42371997},
- {0x08ba, 0x0052, 0x942be4f7},
- {0x01f2, 0x0013, 0x5262687e},
- {0x09e3, 0x00e2, 0x8ffb0908},
- {0x007c, 0x0051, 0xb3158491},
- {0x0854, 0x001f, 0x83bb20d9},
- {0x031b, 0x0008, 0x98970bdf},
- {0x0de7, 0x0027, 0xbfbf6f6c},
- {0x0f76, 0x0005, 0x906a7105},
- {0x092e, 0x00d0, 0x86631850},
- {0x0233, 0x0082, 0xdbdd914e},
- {0x04c9, 0x0075, 0x5a400a9e},
- {0x0b66, 0x0099, 0xae128b45},
- {0x000d, 0x00ed, 0xe61c216a},
- {0x0a31, 0x003d, 0xf69663b9},
- {0x00a3, 0x0052, 0x643c39ae},
- {0x0125, 0x00d5, 0x7c310b0d},
- {0x0105, 0x004a, 0x06a77e74},
- {0x0858, 0x008e, 0x265bc739},
- {0x045e, 0x0095, 0x13d6b192},
- {0x0dab, 0x003c, 0xc4498704},
- {0x00cd, 0x00b5, 0x802a4e2d},
- {0x069b, 0x008c, 0x5df60f71},
- {0x0454, 0x006c, 0x5f03d8bb},
- {0x040e, 0x0032, 0x0ce513b5},
- {0x0874, 0x00e2, 0x6a811fb3},
- {0x0521, 0x00b4, 0x93296833},
- {0x0ddc, 0x00cf, 0xf9305338},
- {0x0a70, 0x0023, 0x239549ea},
- {0x083e, 0x0027, 0x2d88ba97},
- {0x0241, 0x00a7, 0xfe0b32e1},
- {0x0dfc, 0x0096, 0x1a11e815},
- {0x023e, 0x001e, 0xebc9a1f3},
- {0x067e, 0x0066, 0xb1067f81},
- {0x09ea, 0x000e, 0x46fd7247},
- {0x036b, 0x008c, 0x1a39acdf},
- {0x078f, 0x0030, 0x964042ab},
- {0x085c, 0x008f, 0x1829edab},
- {0x02ec, 0x009f, 0x6aefa72d},
- {0x043b, 0x00ce, 0x65642ff5},
- {0x0a32, 0x00b8, 0xbd82759e},
- {0x0d3c, 0x0087, 0xf4d66d54},
- {0x09ec, 0x008a, 0x06bfa1ff},
- {0x0902, 0x0015, 0x755025d2},
- {0x08fe, 0x000e, 0xf690ce2d},
- {0x00fb, 0x00dc, 0xe55f1528},
- {0x0eaa, 0x003a, 0x0fe0a8d7},
- {0x05fb, 0x0006, 0x86281cfb},
- {0x0dd1, 0x00a7, 0x60ab51b4},
- {0x0005, 0x001b, 0xf51d969b},
- {0x077c, 0x00dd, 0xc2fed268},
- {0x0575, 0x00f5, 0x432c0b1a},
- {0x05be, 0x0088, 0x78baa04b},
- {0x0c89, 0x0068, 0xeda9e428},
- {0x0f5c, 0x0068, 0xec143c76},
- {0x06a8, 0x0009, 0xd72651ce},
- {0x060f, 0x008e, 0x765426cd},
- {0x07b1, 0x0047, 0x2cfcfa0c},
- {0x04f1, 0x0041, 0x55b172f9},
- {0x0e05, 0x00ac, 0x61efde93},
- {0x0bf7, 0x0097, 0x05b83eee},
- {0x04e9, 0x00f3, 0x9928223a},
- {0x023a, 0x0005, 0xdfada9bc},
- {0x0acb, 0x000e, 0x2217cecd},
- {0x0148, 0x0060, 0xbc3f7405},
- {0x0764, 0x0059, 0xcbc201b1},
- {0x021f, 0x0059, 0x5d6b2256},
- {0x0f1e, 0x006c, 0xdefeeb45},
- {0x071c, 0x00b9, 0xb9b59309},
- {0x0564, 0x0063, 0xae064271},
- {0x0b14, 0x0044, 0xdb867d9b},
- {0x0e5a, 0x0055, 0xff06b685},
- {0x015e, 0x00ba, 0x1115ccbc},
- {0x0379, 0x00e6, 0x5f4e58dd},
- {0x013b, 0x0067, 0x4897427e},
- {0x0e64, 0x0071, 0x7af2b7a4},
- {0x0a11, 0x0050, 0x92105726},
- {0x0109, 0x0055, 0xd0d000f9},
- {0x00aa, 0x0022, 0x815d229d},
- {0x09ac, 0x004f, 0x02f9d985},
- {0x0e1b, 0x00ce, 0x5cf92ab4},
- {0x08af, 0x00d8, 0x17ca72d1},
- {0x0e33, 0x000a, 0xda2dba6b},
- {0x0ee3, 0x006a, 0xb00048e5},
- {0x0648, 0x001a, 0x2364b8cb},
- {0x0315, 0x0085, 0x0596fd0d},
- {0x0fbb, 0x003e, 0x298230ca},
- {0x0422, 0x006a, 0x78ada4ab},
- {0x04ba, 0x0073, 0xced1fbc2},
- {0x007d, 0x0061, 0x4b7ff236},
- {0x070b, 0x00d0, 0x261cf0ae},
- {0x0c1a, 0x0035, 0x8be92ee2},
- {0x0af8, 0x0063, 0x824dcf03},
- {0x08f8, 0x006d, 0xd289710c},
- {0x021b, 0x00ee, 0x6ac1c41d},
- {0x05b5, 0x00da, 0x8e52f0e2},
+ {0x0567, 0x0097, 0x96951389, 0xc153aa0d},
+ {0x0869, 0x0055, 0x6455ab4f, 0xd07f69bf},
+ {0x0c51, 0x00be, 0x8663afde, 0xf9add90c},
+ {0x044a, 0x00fc, 0x98fbe432, 0xbf2abb76},
+ {0x0f29, 0x0079, 0x42371997, 0x282588b3},
+ {0x08ba, 0x0052, 0x942be4f7, 0x2e023547},
+ {0x01f2, 0x0013, 0x5262687e, 0x5266287e},
+ {0x09e3, 0x00e2, 0x8ffb0908, 0x1da892f3},
+ {0x007c, 0x0051, 0xb3158491, 0xb67f9e63},
+ {0x0854, 0x001f, 0x83bb20d9, 0x22bb21db},
+ {0x031b, 0x0008, 0x98970bdf, 0x9cd70adf},
+ {0x0de7, 0x0027, 0xbfbf6f6c, 0xae3f296c},
+ {0x0f76, 0x0005, 0x906a7105, 0x906a7105},
+ {0x092e, 0x00d0, 0x86631850, 0xa3f6ac04},
+ {0x0233, 0x0082, 0xdbdd914e, 0x5d8c7aac},
+ {0x04c9, 0x0075, 0x5a400a9e, 0x12f60711},
+ {0x0b66, 0x0099, 0xae128b45, 0x7551310d},
+ {0x000d, 0x00ed, 0xe61c216a, 0xc22d3c4c},
+ {0x0a31, 0x003d, 0xf69663b9, 0x51960bf8},
+ {0x00a3, 0x0052, 0x643c39ae, 0xa93c73a8},
+ {0x0125, 0x00d5, 0x7c310b0d, 0xf221cbb3},
+ {0x0105, 0x004a, 0x06a77e74, 0xa4ef4561},
+ {0x0858, 0x008e, 0x265bc739, 0xd6c36d9b},
+ {0x045e, 0x0095, 0x13d6b192, 0x5f5c1d62},
+ {0x0dab, 0x003c, 0xc4498704, 0x10414654},
+ {0x00cd, 0x00b5, 0x802a4e2d, 0xfbd17c9d},
+ {0x069b, 0x008c, 0x5df60f71, 0x91ddca5f},
+ {0x0454, 0x006c, 0x5f03d8bb, 0x5c59fce0},
+ {0x040e, 0x0032, 0x0ce513b5, 0xa8cd99b1},
+ {0x0874, 0x00e2, 0x6a811fb3, 0xca028316},
+ {0x0521, 0x00b4, 0x93296833, 0x2c4d4880},
+ {0x0ddc, 0x00cf, 0xf9305338, 0x2c94210d},
+ {0x0a70, 0x0023, 0x239549ea, 0x22b561aa},
+ {0x083e, 0x0027, 0x2d88ba97, 0x5cd8bb9d},
+ {0x0241, 0x00a7, 0xfe0b32e1, 0x17b506b8},
+ {0x0dfc, 0x0096, 0x1a11e815, 0xee4141bd},
+ {0x023e, 0x001e, 0xebc9a1f3, 0x5689a1f3},
+ {0x067e, 0x0066, 0xb1067f81, 0xd9952571},
+ {0x09ea, 0x000e, 0x46fd7247, 0x42b57245},
+ {0x036b, 0x008c, 0x1a39acdf, 0x58bf1586},
+ {0x078f, 0x0030, 0x964042ab, 0xb04218b9},
+ {0x085c, 0x008f, 0x1829edab, 0x9ceca89c},
+ {0x02ec, 0x009f, 0x6aefa72d, 0x634cc2a7},
+ {0x043b, 0x00ce, 0x65642ff5, 0x6c8a584e},
+ {0x0a32, 0x00b8, 0xbd82759e, 0x0f96a34f},
+ {0x0d3c, 0x0087, 0xf4d66d54, 0xb71ba5f4},
+ {0x09ec, 0x008a, 0x06bfa1ff, 0x576ca80f},
+ {0x0902, 0x0015, 0x755025d2, 0x517225c2},
+ {0x08fe, 0x000e, 0xf690ce2d, 0xf690cf3d},
+ {0x00fb, 0x00dc, 0xe55f1528, 0x707d7d92},
+ {0x0eaa, 0x003a, 0x0fe0a8d7, 0x87638cc5},
+ {0x05fb, 0x0006, 0x86281cfb, 0x86281cf9},
+ {0x0dd1, 0x00a7, 0x60ab51b4, 0xe28ef00c},
+ {0x0005, 0x001b, 0xf51d969b, 0xe71dd6d3},
+ {0x077c, 0x00dd, 0xc2fed268, 0xdc30c555},
+ {0x0575, 0x00f5, 0x432c0b1a, 0x81dd7d16},
+ {0x05be, 0x0088, 0x78baa04b, 0xd69b433e},
+ {0x0c89, 0x0068, 0xeda9e428, 0xe9b4fa0a},
+ {0x0f5c, 0x0068, 0xec143c76, 0x9947067a},
+ {0x06a8, 0x0009, 0xd72651ce, 0xd72651ee},
+ {0x060f, 0x008e, 0x765426cd, 0x2099626f},
+ {0x07b1, 0x0047, 0x2cfcfa0c, 0x1a4baa07},
+ {0x04f1, 0x0041, 0x55b172f9, 0x15331a79},
+ {0x0e05, 0x00ac, 0x61efde93, 0x320568cc},
+ {0x0bf7, 0x0097, 0x05b83eee, 0xc72fb7a3},
+ {0x04e9, 0x00f3, 0x9928223a, 0xe8c77de2},
+ {0x023a, 0x0005, 0xdfada9bc, 0xdfadb9be},
+ {0x0acb, 0x000e, 0x2217cecd, 0x0017d6cd},
+ {0x0148, 0x0060, 0xbc3f7405, 0xf5fd6615},
+ {0x0764, 0x0059, 0xcbc201b1, 0xbb089bf4},
+ {0x021f, 0x0059, 0x5d6b2256, 0xa16a0a59},
+ {0x0f1e, 0x006c, 0xdefeeb45, 0xfc34f9d6},
+ {0x071c, 0x00b9, 0xb9b59309, 0xb645eae2},
+ {0x0564, 0x0063, 0xae064271, 0x954dc6d1},
+ {0x0b14, 0x0044, 0xdb867d9b, 0xdf432309},
+ {0x0e5a, 0x0055, 0xff06b685, 0xa65ff257},
+ {0x015e, 0x00ba, 0x1115ccbc, 0x11c365f4},
+ {0x0379, 0x00e6, 0x5f4e58dd, 0x2d176d31},
+ {0x013b, 0x0067, 0x4897427e, 0xc40532fe},
+ {0x0e64, 0x0071, 0x7af2b7a4, 0x1fb7bf43},
+ {0x0a11, 0x0050, 0x92105726, 0xb1185e51},
+ {0x0109, 0x0055, 0xd0d000f9, 0x60a60bfd},
+ {0x00aa, 0x0022, 0x815d229d, 0x215d379c},
+ {0x09ac, 0x004f, 0x02f9d985, 0x10b90b20},
+ {0x0e1b, 0x00ce, 0x5cf92ab4, 0x6a477573},
+ {0x08af, 0x00d8, 0x17ca72d1, 0x385af156},
+ {0x0e33, 0x000a, 0xda2dba6b, 0xda2dbb69},
+ {0x0ee3, 0x006a, 0xb00048e5, 0xa9a2decc},
+ {0x0648, 0x001a, 0x2364b8cb, 0x3364b1cb},
+ {0x0315, 0x0085, 0x0596fd0d, 0xa651740f},
+ {0x0fbb, 0x003e, 0x298230ca, 0x7fc617c7},
+ {0x0422, 0x006a, 0x78ada4ab, 0xc576ae2a},
+ {0x04ba, 0x0073, 0xced1fbc2, 0xaac8455b},
+ {0x007d, 0x0061, 0x4b7ff236, 0x347d5739},
+ {0x070b, 0x00d0, 0x261cf0ae, 0xc7fb1c10},
+ {0x0c1a, 0x0035, 0x8be92ee2, 0x8be9b4e1},
+ {0x0af8, 0x0063, 0x824dcf03, 0x53010388},
+ {0x08f8, 0x006d, 0xd289710c, 0x30418edd},
+ {0x021b, 0x00ee, 0x6ac1c41d, 0x2557e9a3},
+ {0x05b5, 0x00da, 0x8e52f0e2, 0x98531012},
};
int __init
@@ -644,12 +648,19 @@ xfs_dahash_test(void)
unsigned int errors = 0;
for (i = 0; i < ARRAY_SIZE(test); i++) {
+ struct xfs_name xname = { };
xfs_dahash_t hash;
hash = xfs_da_hashname(test_buf + test[i].start,
test[i].length);
if (hash != test[i].dahash)
errors++;
+
+ xname.name = test_buf + test[i].start;
+ xname.len = test[i].length;
+ hash = xfs_ascii_ci_hashname(&xname);
+ if (hash != test[i].ascii_ci_dahash)
+ errors++;
}
if (errors) {
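
The test-table hunk above adds an expected ascii_ci_dahash column and exercises it through xfs_ascii_ci_hashname(). That function is not part of this hunk and may differ in detail; the sketch below only shows the idea the extra column is testing: fold ASCII case before mixing each byte, using rotate-and-xor mixing in the style of the classic XFS name hash. The function name here is hypothetical.

/*
 * Hypothetical sketch, not the libxfs implementation: fold ASCII case
 * before mixing each byte into the hash.  rol32() comes from
 * <linux/bitops.h>; struct xfs_name and xfs_dahash_t are the types used
 * in the test above.
 */
static xfs_dahash_t example_ascii_ci_hashname(
	const struct xfs_name	*name)
{
	xfs_dahash_t		hash = 0;
	int			i;

	for (i = 0; i < name->len; i++) {
		unsigned char	c = name->name[i];

		if (c >= 'A' && c <= 'Z')
			c += 'a' - 'A';		/* case fold */
		hash = c ^ rol32(hash, 7);
	}

	return hash;
}
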
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8fb90da89787..7f071757f278 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert(
error = radix_tree_insert(tree, id, dqp);
if (unlikely(error)) {
/* Duplicate found! Caller must try again. */
- WARN_ON(error != -EEXIST);
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
return error;
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
new file mode 100644
index 000000000000..005a66be44a2
--- /dev/null
+++ b/fs/xfs/xfs_drain.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/*
+ * Use a static key here to reduce the overhead of xfs_defer_drain_rele().
+ * If the compiler supports jump labels, the static branch becomes a nop sled
+ * when there are no xfs_defer_drain_wait() callers. Online fsck is currently
+ * the only caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate);
+
+void
+xfs_drain_wait_disable(void)
+{
+ static_branch_dec(&xfs_drain_waiter_gate);
+}
+
+void
+xfs_drain_wait_enable(void)
+{
+ static_branch_inc(&xfs_drain_waiter_gate);
+}
+
+void
+xfs_defer_drain_init(
+ struct xfs_defer_drain *dr)
+{
+ atomic_set(&dr->dr_count, 0);
+ init_waitqueue_head(&dr->dr_waiters);
+}
+
+void
+xfs_defer_drain_free(struct xfs_defer_drain *dr)
+{
+ ASSERT(atomic_read(&dr->dr_count) == 0);
+}
+
+/* Increase the pending intent count. */
+static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr)
+{
+ atomic_inc(&dr->dr_count);
+}
+
+static inline bool has_waiters(struct wait_queue_head *wq_head)
+{
+ /*
+ * This memory barrier is paired with the one in set_current_state on
+ * the waiting side.
+ */
+ smp_mb__after_atomic();
+ return waitqueue_active(wq_head);
+}
+
+/* Decrease the pending intent count, and wake any waiters, if appropriate. */
+static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr)
+{
+ if (atomic_dec_and_test(&dr->dr_count) &&
+ static_branch_unlikely(&xfs_drain_waiter_gate) &&
+ has_waiters(&dr->dr_waiters))
+ wake_up(&dr->dr_waiters);
+}
+
+/* Are there intents pending? */
+static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr)
+{
+ return atomic_read(&dr->dr_count) > 0;
+}
+
+/*
+ * Wait for the pending intent count for a drain to hit zero.
+ *
+ * Callers must not hold any locks that would prevent intents from being
+ * finished.
+ */
+static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
+{
+ return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr));
+}
+
+/*
+ * Get a passive reference to an AG and declare an intent to update its
+ * metadata.
+ */
+struct xfs_perag *
+xfs_perag_intent_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ if (!pag)
+ return NULL;
+
+ xfs_perag_intent_hold(pag);
+ return pag;
+}
+
+/*
+ * Release our intent to update this AG's metadata, and then release our
+ * passive ref to the AG.
+ */
+void
+xfs_perag_intent_put(
+ struct xfs_perag *pag)
+{
+ xfs_perag_intent_rele(pag);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Declare an intent to update AG metadata. Other threads that need exclusive
+ * access can decide to back off if they see declared intentions.
+ */
+void
+xfs_perag_intent_hold(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_intent_hold(pag, __return_address);
+ xfs_defer_drain_grab(&pag->pag_intents_drain);
+}
+
+/* Release our intent to update this AG's metadata. */
+void
+xfs_perag_intent_rele(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_intent_rele(pag, __return_address);
+ xfs_defer_drain_rele(&pag->pag_intents_drain);
+}
+
+/*
+ * Wait for the intent update count for this AG to hit zero.
+ * Callers must not hold any AG header buffers.
+ */
+int
+xfs_perag_intent_drain(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_wait_intents(pag, __return_address);
+ return xfs_defer_drain_wait(&pag->pag_intents_drain);
+}
+
+/* Has anyone declared an intent to update this AG? */
+bool
+xfs_perag_intent_busy(
+ struct xfs_perag *pag)
+{
+ return xfs_defer_drain_busy(&pag->pag_intents_drain);
+}
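
The point of xfs_drain_waiter_gate above is that the release path runs on every intent completion, so the waitqueue check must cost nothing when nobody is scrubbing. A hedged sketch of how a waiter could bracket its wait with the enable/disable calls; the upstream caller may manage the gate differently (for example, once per scrub rather than per wait), and this wrapper only shows the pairing:

/*
 * Illustrative wrapper, not the upstream scrub caller: patch the static
 * branch in before sleeping on the drain, and back out afterwards so the
 * hot release path goes back to a nop sled.
 */
static int example_drain_ag_intents(
	struct xfs_perag	*pag)
{
	int			error;

	xfs_drain_wait_enable();		/* wakeups now reachable */
	error = xfs_perag_intent_drain(pag);	/* sleep until dr_count == 0 */
	xfs_drain_wait_disable();		/* release path back to nop */

	return error;
}
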
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
new file mode 100644
index 000000000000..50a5772a8296
--- /dev/null
+++ b/fs/xfs/xfs_drain.h
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_DRAIN_H_
+#define XFS_DRAIN_H_
+
+struct xfs_perag;
+
+#ifdef CONFIG_XFS_DRAIN_INTENTS
+/*
+ * Passive drain mechanism. This data structure tracks a count of some items
+ * and contains a waitqueue for callers who would like to wake up when the
+ * count hits zero.
+ */
+struct xfs_defer_drain {
+ /* Number of items pending in some part of the filesystem. */
+ atomic_t dr_count;
+
+	/* Queue to wait for dr_count to go to zero */
+ struct wait_queue_head dr_waiters;
+};
+
+void xfs_defer_drain_init(struct xfs_defer_drain *dr);
+void xfs_defer_drain_free(struct xfs_defer_drain *dr);
+
+void xfs_drain_wait_disable(void);
+void xfs_drain_wait_enable(void);
+
+/*
+ * Deferred Work Intent Drains
+ * ===========================
+ *
+ * When a writer thread executes a chain of log intent items, the AG header
+ * buffer locks will cycle during a transaction roll to get from one intent
+ * item to the next in a chain. Although scrub takes all AG header buffer
+ * locks, this isn't sufficient to guard against scrub checking an AG while
+ * that writer thread is in the middle of finishing a chain because there's no
+ * higher level locking primitive guarding allocation groups.
+ *
+ * When there's a collision, cross-referencing between data structures (e.g.
+ * rmapbt and refcountbt) yields false corruption events; if repair is running,
+ * this results in incorrect repairs, which is catastrophic.
+ *
+ * The solution is to add to the perag structure a count of active intents and
+ * to make scrub wait until it holds both AG header buffer locks and the intent
+ * counter reaches zero. It is therefore critical that deferred work threads
+ * hold the AGI or AGF buffers when decrementing the intent counter.
+ *
+ * Given a list of deferred work items, the deferred work manager will complete
+ * a work item and all the sub-items that the parent item creates before moving
+ * on to the next work item in the list. This is also true for all levels of
+ * sub-items. Writer threads are permitted to queue multiple work items
+ * targeting the same AG, so a deferred work item (such as a BUI) that creates
+ * sub-items (such as RUIs) must bump the intent counter and maintain it until
+ * the sub-items can themselves bump the intent counter.
+ *
+ * Therefore, the intent count tracks entire lifetimes of deferred work items.
+ * All functions that create work items must increment the intent counter as
+ * soon as the item is added to the transaction and cannot drop the counter
+ * until the item is finished or cancelled.
+ */
+struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
+ xfs_agnumber_t agno);
+void xfs_perag_intent_put(struct xfs_perag *pag);
+
+void xfs_perag_intent_hold(struct xfs_perag *pag);
+void xfs_perag_intent_rele(struct xfs_perag *pag);
+
+int xfs_perag_intent_drain(struct xfs_perag *pag);
+bool xfs_perag_intent_busy(struct xfs_perag *pag);
+#else
+struct xfs_defer_drain { /* empty */ };
+
+#define xfs_defer_drain_free(dr) ((void)0)
+#define xfs_defer_drain_init(dr) ((void)0)
+
+#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno))
+#define xfs_perag_intent_put(pag) xfs_perag_put(pag)
+
+static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { }
+static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { }
+
+#endif /* CONFIG_XFS_DRAIN_INTENTS */
+
+#endif /* XFS_DRAIN_H_ */
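
The header comment above spells out the contract: a checker must not hold resources that writer threads need to finish their intents while it waits for the count to drain. A minimal sketch of the polling pattern a caller could build from only the functions declared above; the loop and its name are illustrative, not the upstream scrub code:

/*
 * Illustration only: wait for an AG to go quiet before cross-referencing.
 * The caller must not hold the AGI/AGF buffers here, or the writer threads
 * that need them to finish their intent chains can never drain.
 */
static int example_wait_for_quiet_ag(
	struct xfs_perag	*pag)
{
	int			error = 0;

	while (xfs_perag_intent_busy(pag)) {
		error = xfs_perag_intent_drain(pag);
		if (error)
			break;			/* fatal signal received */
	}

	return error;
}
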
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 011b50469301..f9e36b810663 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -351,8 +351,6 @@ xfs_trans_free_extent(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent *extp;
uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp,
- xefi->xefi_startblock);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
xefi->xefi_startblock);
int error;
@@ -363,12 +361,13 @@ xfs_trans_free_extent(
if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno,
- xefi->xefi_blockcount);
+ trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+ agbno, xefi->xefi_blockcount);
- error = __xfs_free_extent(tp, xefi->xefi_startblock,
+ error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -396,14 +395,13 @@ xfs_extent_free_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_mount *mp = priv;
struct xfs_extent_free_item *ra;
struct xfs_extent_free_item *rb;
ra = container_of(a, struct xfs_extent_free_item, xefi_list);
rb = container_of(b, struct xfs_extent_free_item, xefi_list);
- return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+
+ return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
}
/* Log a free extent to the intent item. */
@@ -462,6 +460,26 @@ xfs_extent_free_create_done(
return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
}
+/* Take a passive ref to the AG containing the space we're freeing. */
+void
+xfs_extent_free_get_group(
+ struct xfs_mount *mp,
+ struct xfs_extent_free_item *xefi)
+{
+ xfs_agnumber_t agno;
+
+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+ xefi->xefi_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after some freeing work. */
+static inline void
+xfs_extent_free_put_group(
+ struct xfs_extent_free_item *xefi)
+{
+ xfs_perag_intent_put(xefi->xefi_pag);
+}
+
/* Process a free extent. */
STATIC int
xfs_extent_free_finish_item(
@@ -476,6 +494,8 @@ xfs_extent_free_finish_item(
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+
+ xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -496,6 +516,8 @@ xfs_extent_free_cancel_item(
struct xfs_extent_free_item *xefi;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
@@ -526,24 +548,21 @@ xfs_agfl_free_finish_item(
struct xfs_extent *extp;
struct xfs_buf *agbp;
int error;
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
uint next_extent;
- struct xfs_perag *pag;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
ASSERT(xefi->xefi_blockcount == 1);
- agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
oinfo.oi_owner = xefi->xefi_owner;
- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount);
+ trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno,
+ xefi->xefi_blockcount);
- pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+ error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
- xfs_perag_put(pag);
+ error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
+ agbno, agbp, &oinfo);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -562,6 +581,7 @@ xfs_agfl_free_finish_item(
extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
+ xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -632,7 +652,9 @@ xfs_efi_item_recover(
fake.xefi_startblock = extp->ext_start;
fake.xefi_blockcount = extp->ext_len;
+ xfs_extent_free_get_group(mp, &fake);
error = xfs_trans_free_extent(tp, efdp, &fake);
+ xfs_extent_free_put_group(&fake);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 863289aaa441..aede746541f8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1389,25 +1389,10 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static vm_fault_t
-xfs_filemap_map_pages(
- struct vm_fault *vmf,
- pgoff_t start_pgoff,
- pgoff_t end_pgoff)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- vm_fault_t ret;
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- return ret;
-}
-
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = xfs_filemap_map_pages,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c9a7e270a428..351849fc18ff 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -767,7 +767,8 @@ again:
return 0;
out_error_or_again:
- if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
+ if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
+ error == -EAGAIN) {
delay(1);
goto again;
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 6cd180721659..87910191a9dd 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,10 +34,13 @@ struct xfs_icwalk {
/*
* Flags for xfs_iget()
*/
-#define XFS_IGET_CREATE 0x1
-#define XFS_IGET_UNTRUSTED 0x2
-#define XFS_IGET_DONTCACHE 0x4
-#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
+#define XFS_IGET_CREATE (1U << 0)
+#define XFS_IGET_UNTRUSTED (1U << 1)
+#define XFS_IGET_DONTCACHE (1U << 2)
+/* don't read from disk or reinit */
+#define XFS_IGET_INCORE (1U << 3)
+/* Return -EAGAIN immediately if the inode is unavailable. */
+#define XFS_IGET_NORETRY (1U << 4)
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
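
The xfs_icache.c hunk above makes -EAGAIN terminal when XFS_IGET_NORETRY is set instead of sleeping for a tick and retrying. A hedged example of a caller that cannot afford to block; the wrapper name is made up, and only the xfs_iget() signature comes from the header above:

/*
 * Illustrative caller: look up an inode but fail fast with -EAGAIN if it
 * is currently unavailable (e.g. being reclaimed or recycled) instead of
 * delaying and retrying inside xfs_iget().
 */
static int example_iget_nowait(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp)
{
	return xfs_iget(mp, NULL, ino, XFS_IGET_NORETRY, 0, ipp);
}
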
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
index 43005ce8bd48..2ddccb172fa0 100644
--- a/fs/xfs/xfs_iunlink_item.c
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -168,9 +168,7 @@ xfs_iunlink_log_inode(
iup->ip = ip;
iup->next_agino = next_agino;
iup->old_agino = ip->i_next_unlinked;
-
- atomic_inc(&pag->pag_ref);
- iup->pag = pag;
+ iup->pag = xfs_perag_hold(pag);
xfs_trans_add_item(tp, &iup->item);
tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 21be93bf006d..b3275e8d47b6 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -667,11 +667,10 @@ xfs_iwalk_threaded(
iwag->mp = mp;
/*
- * perag is being handed off to async work, so take another
+ * perag is being handed off to async work, so take a passive
* reference for the async work to release.
*/
- atomic_inc(&pag->pag_ref);
- iwag->pag = pag;
+ iwag->pag = xfs_perag_hold(pag);
iwag->iwalk_fn = iwalk_fn;
iwag->data = data;
iwag->startino = startino;
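
Both hunks above replace an open-coded atomic_inc(&pag->pag_ref) with xfs_perag_hold(pag), which also returns the perag so the call site reads as a single assignment. The helper's body is not part of this diff; a plausible sketch, assuming it simply wraps the pattern it replaces (possibly plus the xfs_perag_hold tracepoint defined later in this patch):

/*
 * Sketch, not the upstream helper: bump the passive reference count on a
 * perag the caller already holds and hand the pointer back.
 */
static inline struct xfs_perag *
example_perag_hold(
	struct xfs_perag	*pag)
{
	ASSERT(atomic_read(&pag->pag_ref) > 0);	/* caller must already hold a ref */
	atomic_inc(&pag->pag_ref);
	return pag;
}
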
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e88f18f85e4b..74dcb05069e8 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -80,6 +80,7 @@ typedef __u32 xfs_nlink_t;
#include "xfs_cksum.h"
#include "xfs_buf.h"
#include "xfs_message.h"
+#include "xfs_drain.h"
#ifdef __BIG_ENDIAN
#define XFS_NATIVE_HOST 1
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 48d771a76add..edd8587658d5 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -20,6 +20,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_ag.h"
struct kmem_cache *xfs_cui_cache;
struct kmem_cache *xfs_cud_cache;
@@ -279,14 +280,13 @@ xfs_refcount_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_mount *mp = priv;
struct xfs_refcount_intent *ra;
struct xfs_refcount_intent *rb;
ra = container_of(a, struct xfs_refcount_intent, ri_list);
rb = container_of(b, struct xfs_refcount_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+
+ return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
}
/* Set the phys extent flags for this reverse mapping. */
@@ -365,6 +365,26 @@ xfs_refcount_update_create_done(
return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
}
+/* Take a passive ref to the AG containing the space we're refcounting. */
+void
+xfs_refcount_update_get_group(
+ struct xfs_mount *mp,
+ struct xfs_refcount_intent *ri)
+{
+ xfs_agnumber_t agno;
+
+ agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock);
+ ri->ri_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing refcounting work. */
+static inline void
+xfs_refcount_update_put_group(
+ struct xfs_refcount_intent *ri)
+{
+ xfs_perag_intent_put(ri->ri_pag);
+}
+
/* Process a deferred refcount update. */
STATIC int
xfs_refcount_update_finish_item(
@@ -386,6 +406,8 @@ xfs_refcount_update_finish_item(
ri->ri_type == XFS_REFCOUNT_DECREASE);
return -EAGAIN;
}
+
+ xfs_refcount_update_put_group(ri);
kmem_cache_free(xfs_refcount_intent_cache, ri);
return error;
}
@@ -406,6 +428,8 @@ xfs_refcount_update_cancel_item(
struct xfs_refcount_intent *ri;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
+
+ xfs_refcount_update_put_group(ri);
kmem_cache_free(xfs_refcount_intent_cache, ri);
}
@@ -520,9 +544,13 @@ xfs_cui_item_recover(
fake.ri_startblock = pmap->pe_startblock;
fake.ri_blockcount = pmap->pe_len;
- if (!requeue_only)
+
+ if (!requeue_only) {
+ xfs_refcount_update_get_group(mp, &fake);
error = xfs_trans_log_finish_refcount_update(tp, cudp,
&fake, &rcur);
+ xfs_refcount_update_put_group(&fake);
+ }
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&cuip->cui_format,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a1619d67015f..520c7ebdfed8 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -20,6 +20,7 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_ag.h"
struct kmem_cache *xfs_rui_cache;
struct kmem_cache *xfs_rud_cache;
@@ -320,14 +321,13 @@ xfs_rmap_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_mount *mp = priv;
struct xfs_rmap_intent *ra;
struct xfs_rmap_intent *rb;
ra = container_of(a, struct xfs_rmap_intent, ri_list);
rb = container_of(b, struct xfs_rmap_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+
+ return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
}
/* Log rmap updates in the intent item. */
@@ -390,6 +390,26 @@ xfs_rmap_update_create_done(
return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
}
+/* Take a passive ref to the AG containing the space we're rmapping. */
+void
+xfs_rmap_update_get_group(
+ struct xfs_mount *mp,
+ struct xfs_rmap_intent *ri)
+{
+ xfs_agnumber_t agno;
+
+ agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
+ ri->ri_pag = xfs_perag_intent_get(mp, agno);
+}
+
+/* Release a passive AG ref after finishing rmapping work. */
+static inline void
+xfs_rmap_update_put_group(
+ struct xfs_rmap_intent *ri)
+{
+ xfs_perag_intent_put(ri->ri_pag);
+}
+
/* Process a deferred rmap update. */
STATIC int
xfs_rmap_update_finish_item(
@@ -405,6 +425,8 @@ xfs_rmap_update_finish_item(
error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
state);
+
+ xfs_rmap_update_put_group(ri);
kmem_cache_free(xfs_rmap_intent_cache, ri);
return error;
}
@@ -425,6 +447,8 @@ xfs_rmap_update_cancel_item(
struct xfs_rmap_intent *ri;
ri = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ xfs_rmap_update_put_group(ri);
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
@@ -559,11 +583,13 @@ xfs_rui_item_recover(
fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ xfs_rmap_update_get_group(mp, &fake);
error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
&rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
map, sizeof(*map));
+ xfs_rmap_update_put_group(&fake);
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4f814f9e12ab..4d2e87462ac4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1548,6 +1548,19 @@ xfs_fs_fill_super(
#endif
}
+ /* ASCII case insensitivity is undergoing deprecation. */
+ if (xfs_has_asciici(mp)) {
+#ifdef CONFIG_XFS_SUPPORT_ASCII_CI
+ xfs_warn_once(mp,
+ "Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
/* Filesystem claims it needs repair, so refuse the mount. */
if (xfs_has_needsrepair(mp)) {
xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 546a6cd96729..fade33735393 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -210,28 +210,10 @@ static struct ctl_table xfs_table[] = {
{}
};
-static struct ctl_table xfs_dir_table[] = {
- {
- .procname = "xfs",
- .mode = 0555,
- .child = xfs_table
- },
- {}
-};
-
-static struct ctl_table xfs_root_table[] = {
- {
- .procname = "fs",
- .mode = 0555,
- .child = xfs_dir_table
- },
- {}
-};
-
int
xfs_sysctl_register(void)
{
- xfs_table_header = register_sysctl_table(xfs_root_table);
+ xfs_table_header = register_sysctl("fs/xfs", xfs_table);
if (!xfs_table_header)
return -ENOMEM;
return 0;
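
The sysctl hunk above drops the hand-built "fs" and "xfs" parent tables and registers the leaf table straight at its path string. For comparison, a self-contained sketch of the new-style registration with a placeholder table; the path and knob below are invented for illustration and are not real XFS sysctls:

#include <linux/sysctl.h>

static int example_value;

/* Placeholder table: one integer knob, not a real XFS tunable. */
static struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_value,
		.maxlen		= sizeof(example_value),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{}
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_register(void)
{
	/* The path string replaces the old nested "fs" -> "xfs" directory tables. */
	example_header = register_sysctl("fs/example_fs", example_table);
	if (!example_header)
		return -ENOMEM;
	return 0;
}
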
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9c0006c55fec..cd4ca5b1fcb0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -190,6 +190,7 @@ DEFINE_EVENT(xfs_perag_class, name, \
TP_ARGS(pag, caller_ip))
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_hold);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag);
@@ -2686,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
+ void *item),
+ TP_ARGS(mp, dfp, item),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, type)
+ __field(void *, intent)
+ __field(void *, item)
+ __field(char, committed)
+ __field(int, nr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp ? mp->m_super->s_dev : 0;
+ __entry->type = dfp->dfp_type;
+ __entry->intent = dfp->dfp_intent;
+ __entry->item = item;
+ __entry->committed = dfp->dfp_done != NULL;
+ __entry->nr = dfp->dfp_count;
+ ),
+ TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->type,
+ __entry->intent,
+ __entry->item,
+ __entry->committed,
+ __entry->nr)
+)
+#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_defer_pending_item_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \
+ void *item), \
+ TP_ARGS(mp, dfp, item))
+
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item);
+DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
+
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4325,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown,
__entry->line_num)
);
+#ifdef CONFIG_XFS_DRAIN_INTENTS
+DECLARE_EVENT_CLASS(xfs_perag_intents_class,
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip),
+ TP_ARGS(pag, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_PERAG_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_perag_intents_class, name, \
+ TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
+ TP_ARGS(pag, caller_ip))
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele);
+DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+
+#endif /* CONFIG_XFS_DRAIN_INTENTS */
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH