diff options
Diffstat (limited to 'fs/ceph/inode.c')
| -rw-r--r-- | fs/ceph/inode.c | 1478 |
1 files changed, 1128 insertions, 350 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4a980a7bf3..2966f88310e3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -14,10 +14,12 @@ #include <linux/random.h> #include <linux/sort.h> #include <linux/iversion.h> +#include <linux/fscrypt.h> #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> /* @@ -33,6 +35,7 @@ */ static const struct inode_operations ceph_symlink_iops; +static const struct inode_operations ceph_encrypted_symlink_iops; static void ceph_inode_work(struct work_struct *work); @@ -52,60 +55,213 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + +/** + * ceph_new_inode - allocate a new inode in advance of an expected create + * @dir: parent directory for new inode + * @dentry: dentry that may eventually point to new inode + * @mode: mode of new inode + * @as_ctx: pointer to inherited security context + * + * Allocate a new inode in advance of an operation to create a new inode. + * This allocates the inode and sets up the acl_sec_ctx with appropriate + * info for the new inode. + * + * Returns a pointer to the new inode or an ERR_PTR. + */ +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) { + int err; + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + + if (!S_ISLNK(*mode)) { + err = ceph_pre_init_acls(dir, mode, as_ctx); + if (err < 0) + goto out_err; + } + + inode_state_assign_raw(inode, 0); + inode->i_mode = *mode; + + err = ceph_security_init_secctx(dentry, *mode, as_ctx); + if (err < 0) + goto out_err; + + /* + * We'll skip setting fscrypt context for snapshots, leaving that for + * the handle_reply(). + */ + if (ceph_snap(dir) != CEPH_SNAPDIR) { + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; + } + + return inode; +out_err: + iput(inode); + return ERR_PTR(err); +} + +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ + if (as_ctx->pagelist) { + req->r_pagelist = as_ctx->pagelist; + as_ctx->pagelist = NULL; + } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); +} + +/** + * ceph_get_inode - find or create/hash a new inode + * @sb: superblock to search and allocate in + * @vino: vino to search for + * @newino: optional new inode to insert if one isn't found (may be NULL) + * + * Search for or insert a new inode into the hash for the given vino, and + * return a reference to it. If new is non-NULL, its reference is consumed. + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, + struct inode *newino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; if (ceph_vino_is_reserved(vino)) return ERR_PTR(-EREMOTEIO); - inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, - ceph_set_ino_cb, &vino); - if (!inode) + if (newino) { + inode = inode_insert5(newino, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode != newino) + iput(newino); + } else { + inode = iget5_locked(sb, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + } + + if (!inode) { + doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap); return ERR_PTR(-ENOMEM); + } - dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), - ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); + doutc(cl, "on %llx=%llx.%llx got %p new %d\n", + ceph_present_inode(inode), ceph_vinop(inode), inode, + !!(inode_state_read_once(inode) & I_NEW)); return inode; } /* - * get/constuct snapdir inode for a given directory + * get/construct snapdir inode for a given directory */ struct inode *ceph_get_snapdir(struct inode *parent) { + struct ceph_client *cl = ceph_inode_to_client(parent); struct ceph_vino vino = { .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; - struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); struct ceph_inode_info *ci = ceph_inode(inode); + int ret = -ENOTDIR; if (IS_ERR(inode)) return inode; if (!S_ISDIR(parent->i_mode)) { - pr_warn_once("bad snapdir parent type (mode=0%o)\n", - parent->i_mode); - return ERR_PTR(-ENOTDIR); + pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n", + parent->i_mode); + goto err; } - if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { - pr_warn_once("bad snapdir inode type (mode=0%o)\n", - inode->i_mode); - return ERR_PTR(-ENOTDIR); + if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { + pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", + inode->i_mode); + goto err; } inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_mtime = parent->i_mtime; - inode->i_ctime = parent->i_ctime; - inode->i_atime = parent->i_atime; + inode_set_mtime_to_ts(inode, inode_get_mtime(parent)); + inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); + inode_set_atime_to_ts(inode, inode_get_atime(parent)); ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; - if (inode->i_state & I_NEW) { +#ifdef CONFIG_FS_ENCRYPTION + /* if encrypted, just borrow fscrypt_auth from parent */ + if (IS_ENCRYPTED(parent)) { + struct ceph_inode_info *pci = ceph_inode(parent); + + ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, + pci->fscrypt_auth_len, + GFP_KERNEL); + if (ci->fscrypt_auth) { + inode->i_flags |= S_ENCRYPTED; + ci->fscrypt_auth_len = pci->fscrypt_auth_len; + } else { + doutc(cl, "Failed to alloc snapdir fscrypt_auth\n"); + ret = -ENOMEM; + goto err; + } + } +#endif + if (inode_state_read_once(inode) & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ @@ -113,6 +269,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode_state_read_once(inode) & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(ret); } const struct inode_operations ceph_file_iops = { @@ -120,7 +282,7 @@ const struct inode_operations ceph_file_iops = { .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; @@ -139,6 +301,8 @@ const struct inode_operations ceph_file_iops = { static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, u32 f) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_frag *frag; @@ -169,8 +333,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_link_node(&frag->node, parent, p); rb_insert_color(&frag->node, &ci->i_fragtree); - dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f); return frag; } @@ -203,6 +366,7 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; unsigned nway, i; @@ -226,8 +390,8 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, /* choose child */ nway = 1 << frag->split_by; - dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, - frag->split_by, nway); + doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); for (i = 0; i < nway; i++) { n = ceph_frag_make_child(t, frag->split_by, i); if (ceph_frag_contains_value(n, v)) { @@ -237,7 +401,7 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } BUG_ON(i == nway); } - dout("choose_frag(%x) = %x\n", v, t); + doutc(cl, "frag(%x) = %x\n", v, t); return t; } @@ -261,6 +425,7 @@ static int ceph_fill_dirfrag(struct inode *inode, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); @@ -285,14 +450,14 @@ static int ceph_fill_dirfrag(struct inode *inode, goto out; if (frag->split_by == 0) { /* tree leaf, remove */ - dout("fill_dirfrag removed %llx.%llx frag %x" - " (no ref)\n", ceph_vinop(inode), id); + doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n", + inode, ceph_vinop(inode), id); rb_erase(&frag->node, &ci->i_fragtree); kfree(frag); } else { /* tree branch, keep and clear */ - dout("fill_dirfrag cleared %llx.%llx frag %x" - " referral\n", ceph_vinop(inode), id); + doutc(cl, "cleared %p %llx.%llx frag %x referral\n", + inode, ceph_vinop(inode), id); frag->mds = -1; frag->ndist = 0; } @@ -305,8 +470,9 @@ static int ceph_fill_dirfrag(struct inode *inode, if (IS_ERR(frag)) { /* this is not the end of the world; we can continue with bad/inaccurate delegation info */ - pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", - ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n", + inode, ceph_vinop(inode), + le32_to_cpu(dirinfo->frag)); err = -ENOMEM; goto out; } @@ -315,8 +481,8 @@ static int ceph_fill_dirfrag(struct inode *inode, frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); for (i = 0; i < frag->ndist; i++) frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); - dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", - ceph_vinop(inode), frag->frag, frag->ndist); + doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode, + ceph_vinop(inode), frag->frag, frag->ndist); out: mutex_unlock(&ci->i_fragtree_mutex); @@ -344,6 +510,7 @@ static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; @@ -356,7 +523,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32() % nsplits; + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@ -379,15 +546,15 @@ static int ceph_fill_fragtree(struct inode *inode, frag_tree_split_cmp, NULL); } - dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); split_by = le32_to_cpu(fragtree->splits[i].by); if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { - pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " - "frag %x split by %d\n", ceph_vinop(inode), - i, nsplits, id, split_by); + pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", inode, + ceph_vinop(inode), i, nsplits, id, split_by); continue; } frag = NULL; @@ -419,7 +586,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by == 0) ci->i_fragtree_nsplits++; frag->split_by = split_by; - dout(" frag %x split by %d\n", frag->frag, frag->split_by); + doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by); prev_frag = frag; } while (rb_node) { @@ -444,14 +611,18 @@ out_unlock: */ struct inode *ceph_alloc_inode(struct super_block *sb) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_inode_info *ci; int i; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + doutc(fsc->client, "%p\n", &ci->netfs.inode); + + /* Set parameters for the netfs library */ + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); @@ -508,6 +679,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -538,10 +710,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - - ceph_fscache_inode_init(ci); - - return &ci->vfs_inode; +#ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif + return &ci->netfs.inode; } void ceph_free_inode(struct inode *inode) @@ -549,6 +723,10 @@ void ceph_free_inode(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + kfree(ci->fscrypt_auth); +#endif + fscrypt_free_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); } @@ -556,23 +734,26 @@ void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; struct rb_node *n; - dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode)); percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_FSCACHE_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* @@ -581,8 +762,8 @@ void ceph_evict_inode(struct inode *inode) */ if (ci->i_snap_realm) { if (ceph_snap(inode) == CEPH_NOSNAP) { - dout(" dropping residual ref to snap realm %p\n", - ci->i_snap_realm); + doutc(cl, " dropping residual ref to snap realm %p\n", + ci->i_snap_realm); ceph_change_snap_realm(inode, NULL); } else { ceph_put_snapid_map(mdsc, ci->i_snapid_map); @@ -623,15 +804,16 @@ static inline blkcnt_t calc_inode_blocks(u64 size) int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > isize)) { - dout("size %lld -> %llu\n", isize, size); + doutc(cl, "size %lld -> %llu\n", isize, size); if (size > 0 && S_ISDIR(inode->i_mode)) { - pr_err("fill_file_size non-zero size for directory\n"); + pr_err_client(cl, "non-zero size for directory\n"); size = 0; } i_size_write(inode, size); @@ -644,14 +826,12 @@ int ceph_fill_file_size(struct inode *inode, int issued, ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", - ci->i_truncate_seq, truncate_seq); + doutc(cl, "truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; /* the MDS should have revoked these caps */ - WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR | + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're @@ -668,11 +848,27 @@ int ceph_fill_file_size(struct inode *inode, int issued, } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n", + ci->i_truncate_size, truncate_size, + !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; + + if (IS_ENCRYPTED(inode)) { + doutc(cl, "truncate_pagecache_size %lld -> %llu\n", + ci->i_truncate_pagecache_size, size); + ci->i_truncate_pagecache_size = size; + } else { + ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } @@ -681,7 +877,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 iatime = inode_get_atime(inode); + struct timespec64 ictime = inode_get_ctime(inode); + struct timespec64 imtime = inode_get_mtime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -690,39 +890,28 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec64_compare(ctime, &inode->i_ctime) > 0) { - dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + timespec64_compare(ctime, &ictime) > 0) { + doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime); + inode_set_ctime_to_ts(inode, *ctime); } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ - dout("mtime %lld.%09ld -> %lld.%09ld " - "tw %d -> %d\n", - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec, - ci->i_time_warp_seq, (int)time_warp_seq); - - inode->i_mtime = *mtime; - inode->i_atime = *atime; + doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec64_compare(mtime, &inode->i_mtime) > 0) { - dout("mtime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_mtime.tv_sec, - inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + if (timespec64_compare(mtime, &imtime) > 0) { + doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime); + inode_set_mtime_to_ts(inode, *mtime); } - if (timespec64_compare(atime, &inode->i_atime) > 0) { - dout("atime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_atime.tv_sec, - inode->i_atime.tv_nsec, - atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + if (timespec64_compare(atime, &iatime) > 0) { + doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime); + inode_set_atime_to_ts(inode, *atime); } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -732,19 +921,53 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_ctime_to_ts(inode, *ctime); + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; } } if (warn) /* time_warp_seq shouldn't go backwards */ - dout("%p mds time_warp_seq %llu < %u\n", - inode, time_warp_seq, ci->i_time_warp_seq); + doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode, + time_warp_seq, ci->i_time_warp_seq); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int enclen, u8 **decsym) +{ + struct ceph_client *cl = mdsc->fsc->client; + int declen; + u8 *sym; + + sym = kmalloc(enclen + 1, GFP_NOFS); + if (!sym) + return -ENOMEM; + + declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP); + if (declen < 0) { + pr_err_client(cl, + "can't decode symlink (%d). Content: %.*s\n", + declen, enclen, encsym); + kfree(sym); + return -EIO; + } + sym[declen + 1] = '\0'; + *decsym = sym; + return declen; +} +#else +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int symlen, u8 **decsym) +{ + return -EOPNOTSUPP; +} +#endif + /* * Populate an inode based on info from mds. May be called on new or * existing inodes. @@ -756,6 +979,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_cap_reservation *caps_reservation) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued, new_issued, info_caps; @@ -774,25 +998,26 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, lockdep_assert_held(&mdsc->snap_rwsem); - dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, - inode, ceph_vinop(inode), le64_to_cpu(info->version), - ci->i_version); + doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), + le64_to_cpu(info->version), ci->i_version); /* Once I_NEW is cleared, we can't change type or dev numbers */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_mode = mode; } else { if (inode_wrong_type(inode, mode)) { - pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", - ceph_vinop(inode), inode->i_mode, mode); + pr_warn_once_client(cl, + "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); return -ESTALE; } if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { - pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", - ceph_vinop(inode), MAJOR(inode->i_rdev), - MINOR(inode->i_rdev), MAJOR(rdev), - MINOR(rdev)); + pr_warn_once_client(cl, + "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); return -ESTALE; } } @@ -814,8 +1039,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (iinfo->xattr_len > 4) { xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) - pr_err("%s ENOMEM xattr blob %d bytes\n", __func__, - iinfo->xattr_len); + pr_err_client(cl, "ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); } if (iinfo->pool_ns_len > 0) @@ -850,27 +1075,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* directories have fl_stripe_unit set to zero */ - if (le32_to_cpu(info->layout.fl_stripe_unit)) - inode->i_blkbits = - fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - else - inode->i_blkbits = CEPH_BLOCK_SHIFT; - __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && + ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); - dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - from_kuid(&init_user_ns, inode->i_uid), - from_kgid(&init_user_ns, inode->i_gid)); + doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, + ceph_vinop(inode), inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } + /* directories have fl_stripe_unit set to zero */ + if (IS_ENCRYPTED(inode)) + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + else if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); @@ -892,6 +1132,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; @@ -905,15 +1146,28 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, pool_ns = old_ns; + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn_client(cl, + "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); + size); /* only update max_size on auth cap */ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); + doutc(cl, "max_size %lld -> %llu\n", + ci->i_max_size, le64_to_cpu(info->max_size)); ci->i_max_size = le64_to_cpu(info->max_size); } } @@ -968,26 +1222,45 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_file_fops; break; case S_IFLNK: - inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { u32 symlen = iinfo->symlink_len; char *sym; spin_unlock(&ci->i_ceph_lock); - if (symlen != i_size_read(inode)) { - pr_err("%s %llx.%llx BAD symlink " - "size %lld\n", __func__, - ceph_vinop(inode), - i_size_read(inode)); + if (IS_ENCRYPTED(inode)) { + if (symlen != i_size_read(inode)) + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + + err = decode_encrypted_symlink(mdsc, iinfo->symlink, + symlen, (u8 **)&sym); + if (err < 0) { + pr_err_client(cl, + "decoding encrypted symlink failed: %d\n", + err); + goto out; + } + symlen = err; i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); - } + } else { + if (symlen != i_size_read(inode)) { + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } - err = -ENOMEM; - sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); - if (!sym) - goto out; + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + } spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -995,15 +1268,25 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } - inode->i_link = ci->i_symlink; + + if (IS_ENCRYPTED(inode)) { + /* + * Encrypted symlinks need to be decrypted before we can + * cache their targets in i_link. Don't touch it here. + */ + inode->i_op = &ceph_encrypted_symlink_iops; + } else { + inode->i_link = ci->i_symlink; + inode->i_op = &ceph_symlink_iops; + } break; case S_IFDIR: inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; break; default: - pr_err("%s %llx.%llx BAD mode 0%o\n", __func__, - ceph_vinop(inode), inode->i_mode); + pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode, + ceph_vinop(inode), inode->i_mode); } /* were we issued a capability? */ @@ -1024,7 +1307,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, (info_caps & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); + doutc(cl, " marking %p complete (empty)\n", + inode); i_size_write(inode, 0); __ceph_dir_set_complete(ci, atomic64_read(&ci->i_release_count), @@ -1033,8 +1317,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, wake = true; } else { - dout(" %p got snap_caps %s\n", inode, - ceph_cap_string(info_caps)); + doutc(cl, " %p got snap_caps %s\n", inode, + ceph_cap_string(info_caps)); ci->i_snap_caps |= info_caps; } } @@ -1043,15 +1327,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, iinfo->inline_version >= ci->i_inline_version) { int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; ci->i_inline_version = iinfo->inline_version; - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (ceph_has_inline_data(ci) && (locked_page || (info_caps & cache_caps))) fill_inline = true; } if (cap_fmode >= 0) { if (!info_caps) - pr_warn("mds issued no caps on %llx.%llx\n", - ceph_vinop(inode)); + pr_warn_client(cl, "mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); __ceph_touch_fmode(ci, mdsc, cap_fmode); } @@ -1097,14 +1381,14 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, unsigned long from_time, struct ceph_mds_session **old_lease_session) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); unsigned mask = le16_to_cpu(lease->mask); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; - dout("update_dentry_lease %p duration %lu ms ttl %lu\n", - dentry, duration, ttl); + doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl); /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) @@ -1201,10 +1485,11 @@ out_unlock: /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. + * caller must hold directory i_rwsem for this to be safe. */ static int splice_dentry(struct dentry **pdn, struct inode *in) { + struct ceph_client *cl = ceph_inode_to_client(in); struct dentry *dn = *pdn; struct dentry *realdn; @@ -1236,23 +1521,21 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) d_drop(dn); realdn = d_splice_alias(in, dn); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", - PTR_ERR(realdn), dn, in, ceph_vinop(in)); + pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); return PTR_ERR(realdn); } if (realdn) { - dout("dn %p (%d) spliced with %p (%d) " - "inode %p ino %llx.%llx\n", - dn, d_count(dn), - realdn, d_count(realdn), - d_inode(realdn), ceph_vinop(d_inode(realdn))); + doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n", + dn, d_count(dn), realdn, d_count(realdn), + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); *pdn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", - dn, d_inode(dn), ceph_vinop(d_inode(dn))); + doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn, + d_inode(dn), ceph_vinop(d_inode(dn))); } return 0; } @@ -1274,24 +1557,33 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; struct ceph_vino tvino, dvino; - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; - dout("fill_trace %p is_dentry %d is_target %d\n", req, - rinfo->head->is_dentry, rinfo->head->is_target); + doutc(cl, "%p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); if (!rinfo->head->is_target && !rinfo->head->is_dentry) { - dout("fill_trace reply is empty!\n"); + doutc(cl, "reply is empty!\n"); if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1300,48 +1592,74 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = parent_dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - dname.name = rinfo->dname; - dname.len = rinfo->dname_len; + err = ceph_fname_alloc_buffer(parent_dir, &oname); + if (err < 0) { + dput(parent); + goto done; + } + + err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); + if (err < 0) { + dput(parent); + ceph_fname_free_buffer(parent_dir, &oname); + goto done; + } + dname.name = oname.name; + dname.len = oname.len; dname.hash = full_name_hash(parent, dname.name, dname.len); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); - dout("d_lookup on parent=%p name=%.*s got %p\n", - parent, dname.len, dname.name, dn); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); if (!dn) { dn = d_alloc(parent, &dname); - dout("d_alloc %p '%.*s' = %p\n", parent, - dname.len, dname.name, dn); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); if (!dn) { dput(parent); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } + if (is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { - dout(" dn %p points to wrong inode %p\n", - dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1360,16 +1678,16 @@ retry_lookup: rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { - pr_err("ceph_fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); + pr_err_client(cl, "badness %p %llx.%llx\n", in, + ceph_vinop(in)); req->r_target_inode = NULL; - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) discard_new_inode(in); else iput(in); goto done; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); } @@ -1411,36 +1729,32 @@ retry_lookup: have_lease = have_dir_cap || le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) - dout("fill_trace no dentry lease or dir cap\n"); + doutc(cl, "no dentry lease or dir cap\n"); /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { struct inode *olddir = req->r_old_dentry_dir; BUG_ON(!olddir); - dout(" src %p '%pd' dst %p '%pd'\n", - req->r_old_dentry, - req->r_old_dentry, - dn, dn); - dout("fill_trace doing d_move %p -> %p\n", - req->r_old_dentry, dn); + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); + doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn); /* d_move screws up sibling dentries' offsets */ ceph_dir_clear_ordered(dir); ceph_dir_clear_ordered(olddir); d_move(req->r_old_dentry, dn); - dout(" src %p '%pd' dst %p '%pd'\n", - req->r_old_dentry, - req->r_old_dentry, - dn, dn); + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, - ceph_dentry(req->r_old_dentry)->offset); + doutc(cl, "dn %p gets new offset %lld\n", + req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); /* swap r_dentry and r_old_dentry in case that * splice_dentry() gets called later. This is safe @@ -1452,18 +1766,25 @@ retry_lookup: /* null dentry? */ if (!rinfo->head->is_target) { - dout("fill_trace null dentry\n"); + doutc(cl, "null dentry\n"); if (d_really_is_positive(dn)) { - dout("d_delete %p\n", dn); + doutc(cl, "d_delete %p\n", dn); ceph_dir_clear_ordered(dir); d_delete(dn); } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } + goto done; + } + + if (unlikely(!in)) { + err = -EINVAL; goto done; } @@ -1476,9 +1797,9 @@ retry_lookup: goto done; dn = req->r_dentry; /* may have spliced */ } else if (d_really_is_positive(dn) && d_inode(dn) != in) { - dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, d_inode(dn), ceph_vinop(d_inode(dn)), - ceph_vinop(in)); + doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn)), + ceph_vinop(in)); d_invalidate(dn); have_lease = false; } @@ -1488,7 +1809,7 @@ retry_lookup: rinfo->dlease, session, req->r_request_started); } - dout(" final dn %p\n", dn); + doutc(cl, " final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP) && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && @@ -1499,14 +1820,21 @@ retry_lookup: BUG_ON(!dir); BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); BUG_ON(!req->r_dentry); - dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry); + doutc(cl, " linking snapped dir %p to dn %p\n", in, + req->r_dentry); ceph_dir_clear_ordered(dir); + + if (unlikely(!in)) { + err = -EINVAL; + goto done; + } + ihold(in); err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; } else if (rinfo->head->is_dentry && req->r_dentry) { - /* parent inode is not locked, be carefull */ + /* parent inode is not locked, be careful */ struct ceph_vino *ptvino = NULL; dvino.ino = le64_to_cpu(rinfo->diri.in->ino); dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); @@ -1521,7 +1849,10 @@ retry_lookup: &dvino, ptvino); } done: - dout("fill_trace done err=%d\n", err); + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); + doutc(cl, "done err=%d\n", err); return err; } @@ -1532,6 +1863,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { @@ -1543,23 +1875,23 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); - in = ceph_get_inode(req->r_dentry->d_sb, vino); + in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); - dout("new_inode badness got %d\n", err); + doutc(cl, "badness got %d\n", err); continue; } rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, -1, &req->r_caps_reservation); if (rc < 0) { - pr_err("ceph_fill_inode badness on %p got %d\n", - in, rc); + pr_err_client(cl, "inode badness on %p got %d\n", in, + rc); err = rc; - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } - } else if (in->i_state & I_NEW) { + } else if (inode_state_read_once(in) & I_NEW) { unlock_new_inode(in); } @@ -1571,10 +1903,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { - if (ctl->page) { - kunmap(ctl->page); - put_page(ctl->page); - ctl->page = NULL; + if (ctl->folio) { + folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; } } @@ -1582,36 +1913,43 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, struct ceph_readdir_cache_control *ctl, struct ceph_mds_request *req) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_inode_info *ci = ceph_inode(dir); unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != page_index(ctl->page)) { + if (!ctl->folio || pgoff != ctl->folio->index) { ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + if (idx == 0) - ctl->page = grab_cache_page(&dir->i_data, pgoff); - else - ctl->page = find_lock_page(&dir->i_data, pgoff); - if (!ctl->page) { + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; ctl->index = -1; - return idx == 0 ? -ENOMEM : 0; + return idx == 0 ? err : 0; } /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(ctl->page); - ctl->dentries = kmap(ctl->page); + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); if (idx == 0) memset(ctl->dentries, 0, PAGE_SIZE); } if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { - dout("readdir cache dn %p idx %d\n", dn, ctl->index); + doutc(cl, "dn %p idx %d\n", dn, ctl->index); ctl->dentries[idx] = dn; ctl->index++; } else { - dout("disable readdir cache\n"); + doutc(cl, "disable readdir cache\n"); ctl->index = -1; } return 0; @@ -1621,8 +1959,10 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + struct inode *inode = d_inode(parent); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; struct qstr dname; struct dentry *dn; struct inode *in; @@ -1650,19 +1990,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { - dout("readdir_prepopulate got new frag %x -> %x\n", - frag, le32_to_cpu(rinfo->dir_dir->frag)); + doutc(cl, "got new frag %x -> %x\n", frag, + le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (!rinfo->hash_order) req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); } else { - dout("readdir_prepopulate %d items under dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); @@ -1695,9 +2034,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { - u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - rde->name, rde->name_len); - hash = ceph_frag_value(hash); + u32 hash = ceph_frag_value(rde->raw_hash); if (hash != last_hash) fpos_offset = 2; last_hash = hash; @@ -1708,24 +2045,29 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, retry_lookup: dn = d_lookup(parent, &dname); - dout("d_lookup on parent=%p name=%.*s got %p\n", - parent, dname.len, dname.name, dn); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); if (!dn) { dn = d_alloc(parent, &dname); - dout("d_alloc %p '%.*s' = %p\n", parent, - dname.len, dname.name, dn); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); if (!dn) { - dout("d_alloc badness\n"); + doutc(cl, "d_alloc badness\n"); err = -ENOMEM; goto out; } + if (rde->is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { struct ceph_dentry_info *di = ceph_dentry(dn); - dout(" dn %p points to wrong inode %p\n", - dn, d_inode(dn)); + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); spin_lock(&dn->d_lock); if (di->offset > 0 && @@ -1745,9 +2087,9 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, tvino); + in = ceph_get_inode(parent->d_sb, tvino, NULL); if (IS_ERR(in)) { - dout("new_inode badness\n"); + doutc(cl, "new_inode badness\n"); d_drop(dn); dput(dn); err = PTR_ERR(in); @@ -1758,9 +2100,10 @@ retry_lookup: ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, -1, &req->r_caps_reservation); if (ret < 0) { - pr_err("ceph_fill_inode badness on %p\n", in); + pr_err_client(cl, "badness on %p %llx.%llx\n", in, + ceph_vinop(in)); if (d_really_is_negative(dn)) { - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } @@ -1770,13 +2113,13 @@ retry_lookup: err = ret; goto next_item; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); if (d_really_is_negative(dn)) { if (ceph_security_xattr_deadlock(in)) { - dout(" skip splicing dn %p to inode %p" - " (security xattr deadlock)\n", dn, in); + doutc(cl, " skip splicing dn %p to inode %p" + " (security xattr deadlock)\n", dn, in); iput(in); skipped++; goto next_item; @@ -1808,17 +2151,18 @@ out: req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); - dout("readdir_prepopulate done\n"); + doutc(cl, "done\n"); return err; } bool ceph_inode_set_size(struct inode *inode, loff_t size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); + doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); ceph_fscache_update(inode); inode->i_blocks = calc_inode_blocks(size); @@ -1832,22 +2176,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) void ceph_queue_inode_work(struct inode *inode, int work_bit) { - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); set_bit(work_bit, &ci->i_work_mask); ihold(inode); if (queue_work(fsc->inode_wq, &ci->i_work)) { - dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask); + doutc(cl, "%p %llx.%llx mask=%lx\n", inode, + ceph_vinop(inode), ci->i_work_mask); } else { - dout("queue_inode_work %p already queued, mask=%lx\n", - inode, ci->i_work_mask); + doutc(cl, "%p %llx.%llx already queued, mask=%lx\n", + inode, ceph_vinop(inode), ci->i_work_mask); iput(inode); } } static void ceph_do_invalidate_pages(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 orig_gen; int check = 0; @@ -1857,8 +2204,9 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); if (ceph_inode_is_shutdown(inode)) { - pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n", - __func__, ceph_vinop(inode)); + pr_warn_ratelimited_client(cl, + "%p %llx.%llx is shut down\n", inode, + ceph_vinop(inode)); mapping_set_error(inode->i_mapping, -EIO); truncate_pagecache(inode, 0); mutex_unlock(&ci->i_truncate_mutex); @@ -1866,8 +2214,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) } spin_lock(&ci->i_ceph_lock); - dout("invalidate_pages %p gen %d revoking %d\n", inode, - ci->i_rdcache_gen, ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) check = 1; @@ -1878,23 +2226,22 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { - pr_err("invalidate_inode_pages2 %llx.%llx failed\n", - ceph_vinop(inode)); + pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n", + ceph_vinop(inode)); } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && orig_gen == ci->i_rdcache_revoking) { - dout("invalidate_pages %p gen %d successful\n", inode, - ci->i_rdcache_gen); + doutc(cl, "%p %llx.%llx gen %d successful\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen); ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", - inode, orig_gen, ci->i_rdcache_gen, - ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n", + inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) check = 1; } @@ -1902,7 +2249,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); } /* @@ -1911,6 +2258,7 @@ out: */ void __ceph_do_pending_vmtruncate(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 to; int wrbuffer_refs, finish = 0; @@ -1919,7 +2267,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + doutc(cl, "%p %llx.%llx none pending\n", inode, + ceph_vinop(inode)); spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); return; @@ -1931,8 +2280,8 @@ retry: */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { spin_unlock(&ci->i_ceph_lock); - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); + doutc(cl, "%p %llx.%llx flushing snaps first\n", inode, + ceph_vinop(inode)); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1941,17 +2290,17 @@ retry: /* there should be no reader or writer */ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); - to = ci->i_truncate_size; + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, - ci->i_truncate_pending, to); + doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode), + ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); ceph_fscache_resize(inode, to); truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -1962,7 +2311,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } @@ -1971,10 +2320,11 @@ static void ceph_inode_work(struct work_struct *work) { struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { - dout("writeback %p\n", inode); + doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode)); filemap_fdatawrite(&inode->i_data); } if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) @@ -1984,7 +2334,7 @@ static void ceph_inode_work(struct work_struct *work) __ceph_do_pending_vmtruncate(inode); if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); @@ -1992,6 +2342,32 @@ static void ceph_inode_work(struct work_struct *work) iput(inode); } +static const char *ceph_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!dentry) + return ERR_PTR(-ECHILD); + + return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), + done); +} + +static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int ret; + + ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); + if (ret) + return ret; + return fscrypt_symlink_getattr(path, stat); +} + /* * symlinks */ @@ -2002,20 +2378,200 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, }; -int __ceph_setattr(struct inode *inode, struct iattr *attr) +static const struct inode_operations ceph_encrypted_symlink_iops = { + .get_link = ceph_encrypted_get_link, + .setattr = ceph_setattr, + .getattr = ceph_encrypted_symlink_getattr, + .listxattr = ceph_listxattr, +}; + +/* + * Transfer the encrypted last block to the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. + */ +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n", + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. + */ + if (!objver) { + doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff, + CEPH_FSCRYPT_BLOCK_SIZE); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode, + ceph_vinop(inode), ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + +int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, + struct iattr *attr, struct ceph_iattr *cia) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ + struct dentry *dentry; + char *path; + bool do_sync = false; + + dentry = d_find_alias(inode); + if (!dentry) { + do_sync = true; + } else { + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); + if (IS_ERR(path)) { + do_sync = true; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, MAY_WRITE); + } + ceph_mdsc_free_path_info(&path_info); + dput(dentry); + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + do_sync = true; + err = 0; + } + } + +retry: prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2027,6 +2583,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) return PTR_ERR(req); } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); @@ -2041,42 +2598,86 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } - dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode), + ceph_cap_string(issued)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (cia && cia->fscrypt_auth) { + u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); + + if (len > sizeof(*cia->fscrypt_auth)) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode, + ceph_vinop(inode), ci->fscrypt_auth_len, len); + + /* It should never be re-set once set */ + WARN_ON_ONCE(ci->fscrypt_auth); + + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { + dirtied |= CEPH_CAP_AUTH_EXCL; + kfree(ci->fscrypt_auth); + ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; + ci->fscrypt_auth_len = len; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + ci->fscrypt_auth_len != len || + memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { + req->r_fscrypt_auth = cia->fscrypt_auth; + mask |= CEPH_SETATTR_FSCRYPT_AUTH; + release |= CEPH_CAP_AUTH_SHARED; + } + cia->fscrypt_auth = NULL; + } +#else + if (cia && cia->fscrypt_auth) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } +#endif /* CONFIG_FS_ENCRYPTION */ if (ia_valid & ATTR_UID) { - dout("setattr %p uid %d -> %d\n", inode, - from_kuid(&init_user_ns, inode->i_uid), - from_kuid(&init_user_ns, attr->ia_uid)); - if (issued & CEPH_CAP_AUTH_EXCL) { - inode->i_uid = attr->ia_uid; + kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); + + doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode, + ceph_vinop(inode), + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { + inode->i_uid = fsuid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - !uid_eq(attr->ia_uid, inode->i_uid)) { + !uid_eq(fsuid, inode->i_uid)) { req->r_args.setattr.uid = cpu_to_le32( - from_kuid(&init_user_ns, attr->ia_uid)); + from_kuid(&init_user_ns, fsuid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { - dout("setattr %p gid %d -> %d\n", inode, - from_kgid(&init_user_ns, inode->i_gid), - from_kgid(&init_user_ns, attr->ia_gid)); - if (issued & CEPH_CAP_AUTH_EXCL) { - inode->i_gid = attr->ia_gid; + kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); + + doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode, + ceph_vinop(inode), + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { + inode->i_gid = fsgid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - !gid_eq(attr->ia_gid, inode->i_gid)) { + !gid_eq(fsgid, inode->i_gid)) { req->r_args.setattr.gid = cpu_to_le32( - from_kgid(&init_user_ns, attr->ia_gid)); + from_kgid(&init_user_ns, fsgid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_MODE) { - dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, - attr->ia_mode); - if (issued & CEPH_CAP_AUTH_EXCL) { + doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode, + ceph_vinop(inode), inode->i_mode, attr->ia_mode); + if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) { inode->i_mode = attr->ia_mode; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || @@ -2089,20 +2690,21 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } if (ia_valid & ATTR_ATIME) { - dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, - inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); - if (issued & CEPH_CAP_FILE_EXCL) { + struct timespec64 atime = inode_get_atime(inode); + + doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n", + inode, ceph_vinop(inode), &atime, &attr->ia_atime); + if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_atime, - &attr->ia_atime) < 0) { - inode->i_atime = attr->ia_atime; + } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && + timespec64_compare(&atime, + &attr->ia_atime) < 0) { + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + !timespec64_equal(&atime, &attr->ia_atime)) { ceph_encode_timespec64(&req->r_args.setattr.atime, &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; @@ -2111,10 +2713,28 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - loff_t isize = i_size_read(inode); - - dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode, + ceph_vinop(inode), isize, attr->ia_size); + /* + * Only when the new size is smaller and not aligned to + * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. + */ + if (IS_ENCRYPTED(inode) && attr->ia_size < isize && + (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + fill_fscrypt = true; + } else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2124,28 +2744,41 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + if (IS_ENCRYPTED(inode) && attr->ia_size) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + } else { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); + req->r_fscrypt_file = 0; + } } } if (ia_valid & ATTR_MTIME) { - dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); - if (issued & CEPH_CAP_FILE_EXCL) { + struct timespec64 mtime = inode_get_mtime(inode); + + doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n", + inode, ceph_vinop(inode), &mtime, &attr->ia_mtime); + if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_mtime, - &attr->ia_mtime) < 0) { - inode->i_mtime = attr->ia_mtime; + } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && + timespec64_compare(&mtime, &attr->ia_mtime) < 0) { + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + !timespec64_equal(&mtime, &attr->ia_mtime)) { ceph_encode_timespec64(&req->r_args.setattr.mtime, &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; @@ -2156,12 +2789,12 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* these do nothing */ if (ia_valid & ATTR_CTIME) { + struct timespec64 ictime = inode_get_ctime(inode); bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; - dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, - only ? "ctime only" : "ignored"); + doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n", + inode, ceph_vinop(inode), &ictime, &attr->ia_ctime, + only ? "ctime only" : "ignored"); if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -2179,18 +2812,22 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_FILE) - dout("setattr %p ATTR_FILE ... hrm!\n", inode); + doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode, + ceph_vinop(inode)); if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); + inode_inc_iversion_raw(inode); } release &= issued; spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) + if (lock_snap_rwsem) { up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -2202,10 +2839,32 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. + */ err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n", + inode, ceph_vinop(inode), err, + ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } - dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, - ceph_cap_string(dirtied), mask); +out: + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode, + ceph_vinop(inode), err, ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); @@ -2219,11 +2878,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -2232,7 +2891,11 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = setattr_prepare(idmap, dentry, attr); if (err != 0) return err; @@ -2244,14 +2907,44 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; - err = __ceph_setattr(inode, attr); + err = __ceph_setattr(idmap, inode, attr, NULL); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(idmap, dentry, attr->ia_mode); return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. And after that the lock state + * will change back. + * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS side code is buggy due to setxattr + * won't notify the replica MDSes when the values changed and + * the replica MDS will return the old values. Though we will + * fix it in MDS code, but this still makes sense for old ceph. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. @@ -2259,23 +2952,25 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int mode; int err; if (ceph_snap(inode) == CEPH_SNAPDIR) { - dout("do_getattr inode %p SNAPDIR\n", inode); + doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode, + ceph_vinop(inode)); return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", - inode, ceph_cap_string(mask), inode->i_mode); + doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode, + ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode); if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + mode = ceph_try_to_choose_auth_mds(inode, mask); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); @@ -2290,14 +2985,68 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (inline_version == 0) { /* the reply is supposed to contain inline data */ err = -EINVAL; - } else if (inline_version == CEPH_INLINE_NONE) { + } else if (inline_version == CEPH_INLINE_NONE || + inline_version == 1) { err = -ENODATA; } else { err = req->r_reply_info.targeti.inline_len; } } ceph_mdsc_put_request(req); - dout("do_getattr result=%d\n", err); + doutc(cl, "result=%d\n", err); + return err; +} + +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if (size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + doutc(cl, "result=%d\n", err); return err; } @@ -2306,7 +3055,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, +int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int err; @@ -2317,7 +3066,7 @@ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(&init_user_ns, inode, mask); + err = generic_permission(idmap, inode, mask); return err; } @@ -2326,10 +3075,10 @@ static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; - if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@ -2340,11 +3089,10 @@ static int statx_to_caps(u32 want, umode_t mode) mask |= CEPH_CAP_LINK_SHARED; } - if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED; - if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED; return mask; @@ -2354,10 +3102,11 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. */ -int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, +int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2366,7 +3115,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return -ESTALE; /* Skip the getattr altogether if we're asked not to sync */ - if (!(flags & AT_STATX_DONT_SYNC)) { + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { err = ceph_do_getattr(inode, statx_to_caps(request_mask, inode->i_mode), flags & AT_STATX_FORCE_SYNC); @@ -2374,7 +3123,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return err; } - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2386,17 +3135,40 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, valid_mask |= STATX_BTIME; } + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (IS_ERR(parent)) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* @@ -2409,6 +3181,12 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | + STATX_ATTR_ENCRYPTED); + stat->result_mask = request_mask & valid_mask; return err; } |
