summaryrefslogtreecommitdiff
path: root/fs/ceph/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/inode.c')
-rw-r--r--fs/ceph/inode.c217
1 files changed, 136 insertions, 81 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 315ef02f9a3f..2966f88310e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -86,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
goto out_err;
}
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
inode->i_mode = *mode;
err = ceph_security_init_secctx(dentry, *mode, as_ctx);
@@ -155,12 +201,12 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
ceph_present_inode(inode), ceph_vinop(inode), inode,
- !!(inode->i_state & I_NEW));
+ !!(inode_state_read_once(inode) & I_NEW));
return inode;
}
/*
- * get/constuct snapdir inode for a given directory
+ * get/construct snapdir inode for a given directory
*/
struct inode *ceph_get_snapdir(struct inode *parent)
{
@@ -182,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
goto err;
}
- if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
goto err;
@@ -215,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
}
#endif
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
@@ -224,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
err:
- if ((inode->i_state & I_NEW))
+ if ((inode_state_read_once(inode) & I_NEW))
discard_new_inode(inode);
else
iput(inode);
@@ -665,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
ci->fscrypt_auth = NULL;
ci->fscrypt_auth_len = 0;
#endif
@@ -697,7 +744,7 @@ void ceph_evict_inode(struct inode *inode)
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
@@ -832,7 +879,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
{
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct timespec64 iatime = inode_get_atime(inode);
struct timespec64 ictime = inode_get_ctime(inode);
+ struct timespec64 imtime = inode_get_mtime(inode);
int warn = 0;
if (issued & (CEPH_CAP_FILE_EXCL|
@@ -842,39 +891,26 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
timespec64_compare(ctime, &ictime) > 0) {
- doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
- ictime.tv_sec, ictime.tv_nsec,
- ctime->tv_sec, ctime->tv_nsec);
+ doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
inode_set_ctime_to_ts(inode, *ctime);
}
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
- inode_get_mtime_sec(inode),
- inode_get_mtime_nsec(inode),
- mtime->tv_sec, mtime->tv_nsec,
- ci->i_time_warp_seq, (int)time_warp_seq);
+ doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
+ ci->i_time_warp_seq, (int)time_warp_seq);
inode_set_mtime_to_ts(inode, *mtime);
inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
- struct timespec64 ts;
-
/* nobody did utimes(); take the max */
- ts = inode_get_mtime(inode);
- if (timespec64_compare(mtime, &ts) > 0) {
- doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
- ts.tv_sec, ts.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec);
+ if (timespec64_compare(mtime, &imtime) > 0) {
+ doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
inode_set_mtime_to_ts(inode, *mtime);
}
- ts = inode_get_atime(inode);
- if (timespec64_compare(atime, &ts) > 0) {
- doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
- ts.tv_sec, ts.tv_nsec,
- atime->tv_sec, atime->tv_nsec);
+ if (timespec64_compare(atime, &iatime) > 0) {
+ doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
@@ -911,7 +947,7 @@ static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
if (!sym)
return -ENOMEM;
- declen = ceph_base64_decode(encsym, enclen, sym);
+ declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP);
if (declen < 0) {
pr_err_client(cl,
"can't decode symlink (%d). Content: %.*s\n",
@@ -966,7 +1002,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
@@ -1043,7 +1079,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
#ifdef CONFIG_FS_ENCRYPTION
if (iinfo->fscrypt_auth_len &&
- ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
kfree(ci->fscrypt_auth);
ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
ci->fscrypt_auth = iinfo->fscrypt_auth;
@@ -1523,6 +1559,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_vino tvino, dvino;
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
doutc(cl, "%p is_dentry %d is_target %d\n", req,
@@ -1536,10 +1573,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
- if (dir) {
- err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
@@ -1548,14 +1592,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
struct fscrypt_str oname = FSTR_INIT(NULL, 0);
- struct ceph_fname fname = { .dir = dir,
+ struct ceph_fname fname = { .dir = parent_dir,
.name = rinfo->dname,
.ctext = rinfo->altname,
.name_len = rinfo->dname_len,
@@ -1564,10 +1608,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
- parent = d_find_any_alias(dir);
+ parent = d_find_any_alias(parent_dir);
BUG_ON(!parent);
- err = ceph_fname_alloc_buffer(dir, &oname);
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
if (err < 0) {
dput(parent);
goto done;
@@ -1576,7 +1620,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
if (err < 0) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
goto done;
}
dname.name = oname.name;
@@ -1595,7 +1639,7 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
err = -ENOMEM;
goto done;
}
@@ -1610,12 +1654,12 @@ retry_lookup:
ceph_snap(d_inode(dn)) != tvino.snap)) {
doutc(cl, " dn %p points to wrong inode %p\n",
dn, d_inode(dn));
- ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(parent_dir);
d_delete(dn);
dput(dn);
goto retry_lookup;
}
- ceph_fname_free_buffer(dir, &oname);
+ ceph_fname_free_buffer(parent_dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1637,13 +1681,13 @@ retry_lookup:
pr_err_client(cl, "badness %p %llx.%llx\n", in,
ceph_vinop(in));
req->r_target_inode = NULL;
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
discard_new_inode(in);
else
iput(in);
goto done;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
}
@@ -1739,6 +1783,11 @@ retry_lookup:
goto done;
}
+ if (unlikely(!in)) {
+ err = -EINVAL;
+ goto done;
+ }
+
/* attach proper inode */
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
@@ -1774,6 +1823,12 @@ retry_lookup:
doutc(cl, " linking snapped dir %p to dn %p\n", in,
req->r_dentry);
ceph_dir_clear_ordered(dir);
+
+ if (unlikely(!in)) {
+ err = -EINVAL;
+ goto done;
+ }
+
ihold(in);
err = splice_dentry(&req->r_dentry, in);
if (err < 0)
@@ -1794,6 +1849,9 @@ retry_lookup:
&dvino, ptvino);
}
done:
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1829,11 +1887,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
pr_err_client(cl, "inode badness on %p got %d\n", in,
rc);
err = rc;
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- } else if (in->i_state & I_NEW) {
+ } else if (inode_state_read_once(in) & I_NEW) {
unlock_new_inode(in);
}
@@ -1845,10 +1903,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
- if (ctl->page) {
- kunmap(ctl->page);
- put_page(ctl->page);
- ctl->page = NULL;
+ if (ctl->folio) {
+ folio_release_kmap(ctl->folio, ctl->dentries);
+ ctl->folio = NULL;
}
}
@@ -1862,20 +1919,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
- if (!ctl->page || pgoff != ctl->page->index) {
+ if (!ctl->folio || pgoff != ctl->folio->index) {
ceph_readdir_cache_release(ctl);
+ fgf_t fgf = FGP_LOCK;
+
if (idx == 0)
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
- else
- ctl->page = find_lock_page(&dir->i_data, pgoff);
- if (!ctl->page) {
+ fgf |= FGP_ACCESSED | FGP_CREAT;
+
+ ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
+ fgf, mapping_gfp_mask(&dir->i_data));
+ if (IS_ERR(ctl->folio)) {
+ int err = PTR_ERR(ctl->folio);
+
+ ctl->folio = NULL;
ctl->index = -1;
- return idx == 0 ? -ENOMEM : 0;
+ return idx == 0 ? err : 0;
}
/* reading/filling the cache are serialized by
- * i_rwsem, no need to use page lock */
- unlock_page(ctl->page);
- ctl->dentries = kmap(ctl->page);
+ * i_rwsem, no need to use folio lock */
+ folio_unlock(ctl->folio);
+ ctl->dentries = kmap_local_folio(ctl->folio, 0);
if (idx == 0)
memset(ctl->dentries, 0, PAGE_SIZE);
}
@@ -2040,7 +2103,7 @@ retry_lookup:
pr_err_client(cl, "badness on %p %llx.%llx\n", in,
ceph_vinop(in));
if (d_really_is_negative(dn)) {
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
@@ -2050,7 +2113,7 @@ retry_lookup:
err = ret;
goto next_item;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
if (d_really_is_negative(dn)) {
@@ -2362,7 +2425,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
- loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
ret = filemap_write_and_wait_range(inode->i_mapping,
orig_pos, lend);
@@ -2431,8 +2494,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* encrypt the last block */
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
CEPH_FSCRYPT_BLOCK_SIZE,
- 0, block,
- GFP_KERNEL);
+ 0, block);
if (ret)
goto out;
}
@@ -2483,22 +2545,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -2631,10 +2692,8 @@ retry:
if (ia_valid & ATTR_ATIME) {
struct timespec64 atime = inode_get_atime(inode);
- doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
- inode, ceph_vinop(inode),
- atime.tv_sec, atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &atime, &attr->ia_atime);
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_atime_to_ts(inode, attr->ia_atime);
@@ -2708,10 +2767,8 @@ retry:
if (ia_valid & ATTR_MTIME) {
struct timespec64 mtime = inode_get_mtime(inode);
- doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
- inode, ceph_vinop(inode),
- mtime.tv_sec, mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
inode_set_mtime_to_ts(inode, attr->ia_mtime);
@@ -2732,13 +2789,11 @@ retry:
/* these do nothing */
if (ia_valid & ATTR_CTIME) {
+ struct timespec64 ictime = inode_get_ctime(inode);
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
- inode, ceph_vinop(inode),
- inode_get_ctime_sec(inode),
- inode_get_ctime_nsec(inode),
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
+ inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
only ? "ctime only" : "ignored");
if (only) {
/*