summaryrefslogtreecommitdiff
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/Kconfig38
-rw-r--r--fs/ceph/Makefile8
-rw-r--r--fs/ceph/acl.c265
-rw-r--r--fs/ceph/addr.c3125
-rw-r--r--fs/ceph/cache.c112
-rw-r--r--fs/ceph/cache.h117
-rw-r--r--fs/ceph/caps.c4760
-rw-r--r--fs/ceph/ceph_frag.c1
-rw-r--r--fs/ceph/crypto.c604
-rw-r--r--fs/ceph/crypto.h272
-rw-r--r--fs/ceph/debugfs.c391
-rw-r--r--fs/ceph/dir.c2254
-rw-r--r--fs/ceph/export.c667
-rw-r--r--fs/ceph/file.c3217
-rw-r--r--fs/ceph/inode.c3071
-rw-r--r--fs/ceph/io.c213
-rw-r--r--fs/ceph/io.h14
-rw-r--r--fs/ceph/ioctl.c234
-rw-r--r--fs/ceph/ioctl.h1
-rw-r--r--fs/ceph/locks.c507
-rw-r--r--fs/ceph/mds_client.c5592
-rw-r--r--fs/ceph/mds_client.h424
-rw-r--r--fs/ceph/mdsmap.c395
-rw-r--r--fs/ceph/mdsmap.h79
-rw-r--r--fs/ceph/metric.c362
-rw-r--r--fs/ceph/metric.h244
-rw-r--r--fs/ceph/quota.c547
-rw-r--r--fs/ceph/snap.c853
-rw-r--r--fs/ceph/strings.c9
-rw-r--r--fs/ceph/super.c1605
-rw-r--r--fs/ceph/super.h1074
-rw-r--r--fs/ceph/util.c100
-rw-r--r--fs/ceph/xattr.c1209
33 files changed, 24909 insertions, 7455 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 49bc78243db9..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
config CEPH_FS
tristate "Ceph distributed file system"
depends on INET
select CEPH_LIB
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO
+ select NETFS_SUPPORT
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
default n
help
Choose Y or M here to include support for mounting the
@@ -12,7 +15,38 @@ config CEPH_FS
scalable file system designed to provide high performance,
reliable access to petabytes of storage.
- More information at http://ceph.newdream.net/.
+ More information at https://ceph.io/.
If unsure, say N.
+if CEPH_FS
+config CEPH_FSCACHE
+ bool "Enable Ceph client caching support"
+ depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+ help
+ Choose Y here to enable persistent, read-only local
+ caching support for Ceph clients using FS-Cache
+
+endif
+
+config CEPH_FS_POSIX_ACL
+ bool "Ceph POSIX Access Control Lists"
+ depends on CEPH_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ If you don't know what Access Control Lists are, say N
+
+config CEPH_FS_SECURITY_LABEL
+ bool "CephFS Security Labels"
+ depends on CEPH_FS && SECURITY
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the Ceph filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd352125e829..1f77ca04c426 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for CEPH filesystem.
#
@@ -5,7 +6,10 @@
obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
- export.o caps.o snap.o xattr.o \
+ export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
- debugfs.o
+ debugfs.o util.o metric.o
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 000000000000..1564eacc253d
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+ int type, struct posix_acl *acl)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0))
+ set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int size;
+ unsigned int retry_cnt = 0;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+retry:
+ size = __ceph_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __ceph_getxattr(inode, name, value, size);
+ }
+
+ if (size == -ERANGE && retry_cnt < 10) {
+ retry_cnt++;
+ kfree(value);
+ value = NULL;
+ goto retry;
+ }
+
+ if (size > 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ } else if (size == -ENODATA || size == 0) {
+ acl = NULL;
+ } else {
+ pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
+ ceph_vinop(inode), size);
+ acl = ERR_PTR(-EIO);
+ }
+
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ ceph_set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int ret = 0, size = 0;
+ const char *name = NULL;
+ char *value = NULL;
+ struct iattr newattrs;
+ struct inode *inode = d_inode(dentry);
+ struct timespec64 old_ctime = inode_get_ctime(inode);
+ umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ if (acl) {
+ ret = posix_acl_update_mode(idmap, inode,
+ &new_mode, &acl);
+ if (ret)
+ goto out;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = acl ? -EINVAL : 0;
+ goto out;
+ }
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ if (new_mode != old_mode) {
+ newattrs.ia_ctime = current_time(inode);
+ newattrs.ia_mode = new_mode;
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ ret = __ceph_setattr(idmap, inode, &newattrs, NULL);
+ if (ret)
+ goto out_free;
+ }
+
+ ret = __ceph_setxattr(inode, name, value, size, 0);
+ if (ret) {
+ if (new_mode != old_mode) {
+ newattrs.ia_ctime = old_ctime;
+ newattrs.ia_mode = old_mode;
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ __ceph_setattr(idmap, inode, &newattrs, NULL);
+ }
+ goto out_free;
+ }
+
+ ceph_set_cached_acl(inode, type, acl);
+
+out_free:
+ kfree(value);
+out:
+ return ret;
+}
+
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ struct posix_acl *acl, *default_acl;
+ size_t val_size1 = 0, val_size2 = 0;
+ struct ceph_pagelist *pagelist = NULL;
+ void *tmp_buf = NULL;
+ int err;
+
+ err = posix_acl_create(dir, mode, &default_acl, &acl);
+ if (err)
+ return err;
+
+ if (acl) {
+ err = posix_acl_equiv_mode(acl, mode);
+ if (err < 0)
+ goto out_err;
+ if (err == 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+ }
+ }
+
+ if (!default_acl && !acl)
+ return 0;
+
+ if (acl)
+ val_size1 = posix_acl_xattr_size(acl->a_count);
+ if (default_acl)
+ val_size2 = posix_acl_xattr_size(default_acl->a_count);
+
+ err = -ENOMEM;
+ tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
+ if (!tmp_buf)
+ goto out_err;
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist)
+ goto out_err;
+
+ err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+ if (err)
+ goto out_err;
+
+ ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
+
+ if (acl) {
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
+ if (err)
+ goto out_err;
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
+ len);
+ err = posix_acl_to_xattr(&init_user_ns, acl,
+ tmp_buf, val_size1);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size1);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size1);
+ }
+ if (default_acl) {
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
+ if (err)
+ goto out_err;
+ ceph_pagelist_encode_string(pagelist,
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
+ err = posix_acl_to_xattr(&init_user_ns, default_acl,
+ tmp_buf, val_size2);
+ if (err < 0)
+ goto out_err;
+ ceph_pagelist_encode_32(pagelist, val_size2);
+ ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+ }
+
+ kfree(tmp_buf);
+
+ as_ctx->acl = acl;
+ as_ctx->default_acl = default_acl;
+ as_ctx->pagelist = pagelist;
+ return 0;
+
+out_err:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ kfree(tmp_buf);
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
+ return err;
+}
+
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx)
+{
+ if (!inode)
+ return;
+ ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl);
+ ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl);
+}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5318a3b704f6..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,17 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <linux/writeback.h> /* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/signal.h>
+#include <linux/iversion.h>
+#include <linux/ktime.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include "metric.h"
+#include "crypto.h"
#include <linux/ceph/osd_client.h>
+#include <linux/ceph/striper.h>
/*
* Ceph address space ops.
@@ -54,6 +64,9 @@
(CONGESTION_ON_THRESH(congestion_kb) - \
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct folio **foliop, void **_fsdata);
+
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
if (PagePrivate(page))
@@ -65,1198 +78,2524 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode;
+ struct inode *inode = mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
- int undo = 0;
struct ceph_snap_context *snapc;
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
-
- if (TestSetPageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- return 0;
+ if (folio_test_dirty(folio)) {
+ doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
+ ceph_vinop(inode), folio, folio->index);
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
+ return false;
}
- inode = mapping->host;
- ci = ceph_inode(inode);
+ atomic64_inc(&mdsc->dirty_folios);
- /*
- * Note that we're grabbing a snapc ref here without holding
- * any locks!
- */
- snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+ ci = ceph_inode(inode);
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
- if (ci->i_head_snapc == NULL)
- ci->i_head_snapc = ceph_get_snap_context(snapc);
- ++ci->i_wrbuffer_ref_head;
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ capsnap->dirty_pages++;
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ ++ci->i_wrbuffer_ref_head;
+ }
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ ceph_vinop(inode), folio, folio->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
- /* now adjust page */
- spin_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, page->mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ /*
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
+ */
+ VM_WARN_ON_FOLIO(folio->private, folio);
+ folio_attach_private(folio, snapc);
+
+ return ceph_fscache_dirty_folio(mapping, folio);
+}
- /*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
- */
- page->private = (unsigned long)snapc;
- SetPagePrivate(page);
- } else {
- dout("ANON set_page_dirty %p (raced truncate?)\n", page);
- undo = 1;
+/*
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
+ */
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
+{
+ struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+
+
+ if (offset != 0 || length != folio_size(folio)) {
+ doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
+ ceph_vinop(inode), folio->index, offset, length);
+ return;
}
- spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_test_private(folio)) {
+ doutc(cl, "%llx.%llx idx %lu full dirty page\n",
+ ceph_vinop(inode), folio->index);
- if (undo)
- /* whoops, we failed to dirty the page */
+ snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ }
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
- BUG_ON(!PageDirty(page));
- return 1;
+ netfs_invalidate_folio(folio, offset, length);
}
-/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
- */
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_snap_context *snapc = page_snap_context(page);
+ struct inode *inode = rreq->inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_layout *lo = &ci->i_layout;
+ unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
+ loff_t end = rreq->start + rreq->len, new_end;
+ struct ceph_netfs_request_data *priv = rreq->netfs_priv;
+ unsigned long max_len;
+ u32 blockoff;
+
+ if (priv) {
+ /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
+ if (priv->file_ra_disabled)
+ max_pages = 0;
+ else
+ max_pages = priv->file_ra_pages;
- BUG_ON(!PageLocked(page));
- BUG_ON(!PagePrivate(page));
- BUG_ON(!page->mapping);
+ }
+
+ /* Readahead is disabled */
+ if (!max_pages)
+ return;
- inode = page->mapping->host;
+ max_len = max_pages << PAGE_SHIFT;
/*
- * We can get non-dirty pages here due to races between
- * set_page_dirty and truncate_complete_page; just spit out a
- * warning, in case we end up with accounting problems later.
+ * Try to expand the length forward by rounding up it to the next
+ * block, but do not exceed the file size, unless the original
+ * request already exceeds it.
*/
- if (!PageDirty(page))
- pr_err("%p invalidatepage %p page not dirty\n", inode, page);
-
- if (offset == 0 && length == PAGE_CACHE_SIZE)
- ClearPageChecked(page);
-
- ci = ceph_inode(inode);
- if (offset == 0 && length == PAGE_CACHE_SIZE) {
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
- ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
- ceph_put_snap_context(snapc);
- page->private = 0;
- ClearPagePrivate(page);
- } else {
- dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
- inode, page, page->index, offset, length);
+ new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
+ if (new_end > end && new_end <= rreq->start + max_len)
+ rreq->len = new_end - rreq->start;
+
+ /* Try to expand the start downward */
+ div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+ if (rreq->len + blockoff <= max_len) {
+ rreq->start -= blockoff;
+ rreq->len += blockoff;
}
}
-/* just a sanity check */
-static int ceph_releasepage(struct page *page, gfp_t g)
+static void finish_netfs_read(struct ceph_osd_request *req)
{
- struct inode *inode = page->mapping ? page->mapping->host : NULL;
- dout("%p releasepage %p idx %lu\n", inode, page, page->index);
- WARN_ON(PageDirty(page));
- WARN_ON(PagePrivate(page));
- return 0;
+ struct inode *inode = req->r_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct netfs_io_subrequest *subreq = req->r_priv;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
+ int err = req->r_result;
+ bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+
+ ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, osd_data->length, err);
+
+ doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
+ subreq->len, i_size_read(req->r_inode));
+
+ /* no object means success but no data */
+ if (err == -ENOENT) {
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+ err = 0;
+ } else if (err == -EBLOCKLISTED) {
+ fsc->blocklisted = true;
+ }
+
+ if (err >= 0) {
+ if (sparse && err > 0)
+ err = ceph_sparse_ext_map_end(op);
+ if (err < subreq->len &&
+ subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (IS_ENCRYPTED(inode) && err > 0) {
+ err = ceph_fscrypt_decrypt_extents(inode,
+ osd_data->pages, subreq->start,
+ op->extent.sparse_ext,
+ op->extent.sparse_ext_cnt);
+ if (err > subreq->len)
+ err = subreq->len;
+ }
+ if (err > 0)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ }
+
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ ceph_put_page_vector(osd_data->pages,
+ calc_pages_for(osd_data->alignment,
+ osd_data->length), false);
+ }
+ if (err > 0) {
+ subreq->transferred = err;
+ err = 0;
+ }
+ subreq->error = err;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
+ netfs_read_subreq_terminated(subreq);
+ iput(req->r_inode);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
-/*
- * read a single page, without unlocking it.
- */
-static int readpage_nounlock(struct file *filp, struct page *page)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = file_inode(filp);
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
- int err = 0;
- u64 len = PAGE_CACHE_SIZE;
-
- dout("readpage inode %p file %p page %p index %lu\n",
- inode, filp, page, page->index);
- err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- (u64) page_offset(page), &len,
- ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1, 0);
- if (err == -ENOENT)
- err = 0;
- if (err < 0) {
- SetPageError(page);
+ ssize_t err = 0;
+ size_t len;
+ int mode;
+
+ if (rreq->origin != NETFS_UNBUFFERED_READ &&
+ rreq->origin != NETFS_DIO_READ)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
goto out;
- } else if (err < PAGE_CACHE_SIZE) {
- /* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
+ if (err == 0) {
+ err = -EFAULT;
+ } else {
+ subreq->transferred += err;
+ err = 0;
}
- SetPageUptodate(page);
+ ceph_mdsc_put_request(req);
out:
- return err < 0 ? err : 0;
+ subreq->error = err;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
+ netfs_read_subreq_terminated(subreq);
+ return true;
}
-static int ceph_readpage(struct file *filp, struct page *page)
+static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
{
- int r = readpage_nounlock(filp, page);
- unlock_page(page);
- return r;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Truncate the extent at the end of the current block */
+ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+ &objno, &objoff, &xlen);
+ rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
+ return 0;
}
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
- struct inode *inode = req->r_inode;
- struct ceph_osd_data *osd_data;
- int rc = req->r_result;
- int bytes = le32_to_cpu(msg->hdr.data_len);
- int num_pages;
- int i;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_request *req = NULL;
+ struct ceph_vino vino = ceph_vino(inode);
+ int err;
+ u64 len;
+ bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+ u64 off = subreq->start;
+ int extent_cnt;
- dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+ if (ceph_inode_is_shutdown(inode)) {
+ err = -EIO;
+ goto out;
+ }
- /* unlock all pages, zeroing any data we didn't read */
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- for (i = 0; i < num_pages; i++) {
- struct page *page = osd_data->pages[i];
-
- if (bytes < (int)PAGE_CACHE_SIZE) {
- /* zero (remainder of) page */
- int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
+ return;
+
+ // TODO: This rounding here is slightly dodgy. It *should* work, for
+ // now, as the cache only deals in blocks that are a multiple of
+ // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
+ // happen is for the fscrypt driving to be moved into netfslib and the
+ // data in the cache also to be stored encrypted.
+ len = subreq->len;
+ ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
+ off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ req = NULL;
+ goto out;
+ }
+
+ if (sparse) {
+ extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
+ if (err)
+ goto out;
+ }
+
+ doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+ ceph_vinop(inode), subreq->start, subreq->len, len);
+
+ /*
+ * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
+ * encrypted inodes. We'd need infrastructure that handles an iov_iter
+ * instead of page arrays, and we don't have that as of yet. Once the
+ * dust settles on the write helpers and encrypt/decrypt routines for
+ * netfs, we should be able to rework this.
+ */
+ if (IS_ENCRYPTED(inode)) {
+ struct page **pages;
+ size_t page_off;
+
+ /*
+ * FIXME: io_iter.count needs to be corrected to aligned
+ * length. Otherwise, iov_iter_get_pages_alloc2() operates
+ * with the initial unaligned length value. As a result,
+ * ceph_msg_data_cursor_init() triggers BUG_ON() in the case
+ * if msg->sparse_read_total > msg->data_length.
+ */
+ subreq->io_iter.count = len;
+
+ err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
+ if (err < 0) {
+ doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
+ ceph_vinop(inode), err);
+ goto out;
}
- dout("finish_read %p uptodate %p idx %lu\n", inode, page,
- page->index);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- page_cache_release(page);
- bytes -= PAGE_CACHE_SIZE;
+
+ /* should always give us a page-aligned read */
+ WARN_ON_ONCE(page_off);
+ len = err;
+ err = 0;
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
+ false);
+ } else {
+ osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
+ }
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ err = -EIO;
+ goto out;
+ }
+ req->r_callback = finish_netfs_read;
+ req->r_priv = subreq;
+ req->r_inode = inode;
+ ihold(inode);
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ ceph_osdc_start_request(req->r_osdc, req);
+out:
+ ceph_osdc_put_request(req);
+ if (err) {
+ subreq->error = err;
+ netfs_read_subreq_terminated(subreq);
}
- kfree(osd_data->pages);
+ doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
-static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
- int i;
+ struct inode *inode = rreq->inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int got = 0, want = CEPH_CAP_FILE_CACHE;
+ struct ceph_netfs_request_data *priv;
+ int ret = 0;
- for (i = 0; i < num_pages; i++)
- unlock_page(pages[i]);
-}
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
-/*
- * start an async read(ahead) operation. return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct list_head *page_list, int max)
-{
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct page *page = list_entry(page_list->prev, struct page, lru);
- struct ceph_vino vino;
- struct ceph_osd_request *req;
- u64 off;
- u64 len;
- int i;
- struct page **pages;
- pgoff_t next_index;
- int nr_pages = 0;
- int ret;
+ if (rreq->origin != NETFS_READAHEAD)
+ return 0;
- off = (u64) page_offset(page);
+ priv = kzalloc(sizeof(*priv), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
- /* count pages */
- next_index = page->index;
- list_for_each_entry_reverse(page, page_list, lru) {
- if (page->index != next_index)
- break;
- nr_pages++;
- next_index++;
- if (max && nr_pages == max)
- break;
+ if (file) {
+ struct ceph_rw_context *rw_ctx;
+ struct ceph_file_info *fi = file->private_data;
+
+ priv->file_ra_pages = file->f_ra.ra_pages;
+ priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;
+
+ rw_ctx = ceph_find_rw_context(fi);
+ if (rw_ctx) {
+ rreq->netfs_priv = priv;
+ return 0;
+ }
}
- len = nr_pages << PAGE_CACHE_SHIFT;
- dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
- off, len);
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
- 1, CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ, NULL,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
- /* build page vector */
- nr_pages = calc_pages_for(0, len);
- pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
- ret = -ENOMEM;
- if (!pages)
+ /*
+ * readahead callers do not necessarily hold Fcb caps
+ * (e.g. fadvise, madvise).
+ */
+ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+ if (ret < 0) {
+ doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
goto out;
- for (i = 0; i < nr_pages; ++i) {
- page = list_entry(page_list->prev, struct page, lru);
- BUG_ON(PageLocked(page));
- list_del(&page->lru);
-
- dout("start_read %p adding %p idx %lu\n", inode, page,
- page->index);
- if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_NOFS)) {
- page_cache_release(page);
- dout("start_read %p add_to_page_cache failed %p\n",
- inode, page);
- nr_pages = i;
- goto out_pages;
- }
- pages[i] = page;
}
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
- req->r_callback = finish_read;
- req->r_inode = inode;
- ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+ if (!(got & want)) {
+ doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
+ ret = -EACCES;
+ goto out;
+ }
+ if (ret == 0) {
+ ret = -EACCES;
+ goto out;
+ }
- dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto out_pages;
- ceph_osdc_put_request(req);
- return nr_pages;
+ priv->caps = got;
+ rreq->netfs_priv = priv;
+ rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;
-out_pages:
- ceph_unlock_page_vector(pages, nr_pages);
- ceph_release_page_vector(pages, nr_pages);
out:
- ceph_osdc_put_request(req);
+ if (ret < 0) {
+ if (got)
+ ceph_put_cap_refs(ceph_inode(inode), got);
+ kfree(priv);
+ }
+
return ret;
}
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
+{
+ struct ceph_netfs_request_data *priv = rreq->netfs_priv;
-/*
- * Read multiple pages. Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned nr_pages)
+ if (!priv)
+ return;
+
+ if (priv->caps)
+ ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps);
+ kfree(priv);
+ rreq->netfs_priv = NULL;
+}
+
+const struct netfs_request_ops ceph_netfs_ops = {
+ .init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
+ .prepare_read = ceph_netfs_prepare_read,
+ .issue_read = ceph_netfs_issue_read,
+ .expand_readahead = ceph_netfs_expand_readahead,
+ .check_write_begin = ceph_netfs_check_write_begin,
+};
+
+#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
{
- struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- int rc = 0;
- int max = 0;
-
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
- max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
- >> PAGE_SHIFT;
-
- dout("readpages %p file %p nr_pages %d max %d\n", inode,
- file, nr_pages,
- max);
- while (!list_empty(page_list)) {
- rc = start_read(inode, page_list, max);
- if (rc < 0)
- goto out;
- BUG_ON(rc == 0);
- }
-out:
- dout("readpages %p file %p ret %d\n", inode, file, rc);
- return rc;
+ folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
}
+static void ceph_fscache_write_terminated(void *priv, ssize_t error)
+{
+ struct inode *inode = priv;
+
+ if (IS_ERR_VALUE(error) && error != -ENOBUFS)
+ ceph_fscache_invalidate(inode, false);
+}
+
+static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
+
+ fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
+ ceph_fscache_write_terminated, inode, true, caching);
+}
+#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
+
+static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+}
+#endif /* CONFIG_CEPH_FSCACHE */
+
+struct ceph_writeback_ctl
+{
+ loff_t i_size;
+ u64 truncate_size;
+ u32 truncate_seq;
+ bool size_stable;
+
+ bool head_snapc;
+ struct ceph_snap_context *snapc;
+ struct ceph_snap_context *last_snapc;
+
+ bool done;
+ bool should_loop;
+ bool range_whole;
+ pgoff_t start_index;
+ pgoff_t index;
+ pgoff_t end;
+ xa_mark_t tag;
+
+ pgoff_t strip_unit_end;
+ unsigned int wsize;
+ unsigned int nr_folios;
+ unsigned int max_pages;
+ unsigned int locked_pages;
+
+ int op_idx;
+ int num_ops;
+ u64 offset;
+ u64 len;
+
+ struct folio_batch fbatch;
+ unsigned int processed_in_fbatch;
+
+ bool from_pool;
+ struct page **pages;
+ struct page **data_pages;
+};
+
/*
* Get ref for the oldest snapc for an inode with dirty data... that is, the
* only snap context we are allowed to write back.
*/
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- u64 *snap_size)
+static struct ceph_snap_context *
+get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
+ struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
- capsnap->context, capsnap->dirty_pages);
- if (capsnap->dirty_pages) {
- snapc = ceph_get_snap_context(capsnap->context);
- if (snap_size)
- *snap_size = capsnap->size;
- break;
+ doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
+ capsnap, capsnap->context, capsnap->dirty_pages);
+ if (!capsnap->dirty_pages)
+ continue;
+
+ /* get i_size, truncate_{seq,size} for page_snapc? */
+ if (snapc && capsnap->context != page_snapc)
+ continue;
+
+ if (ctl) {
+ if (capsnap->writing) {
+ ctl->i_size = i_size_read(inode);
+ ctl->size_stable = false;
+ } else {
+ ctl->i_size = capsnap->size;
+ ctl->size_stable = true;
+ }
+ ctl->truncate_size = capsnap->truncate_size;
+ ctl->truncate_seq = capsnap->truncate_seq;
+ ctl->head_snapc = false;
}
+
+ if (snapc)
+ break;
+
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (!page_snapc ||
+ page_snapc == snapc ||
+ page_snapc->seq > snapc->seq)
+ break;
}
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
- dout(" head snapc %p has %d dirty pages\n",
- snapc, ci->i_wrbuffer_ref_head);
+ doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
+ ci->i_wrbuffer_ref_head);
+ if (ctl) {
+ ctl->i_size = i_size_read(inode);
+ ctl->truncate_size = ci->i_truncate_size;
+ ctl->truncate_seq = ci->i_truncate_seq;
+ ctl->size_stable = false;
+ ctl->head_snapc = true;
+ }
}
spin_unlock(&ci->i_ceph_lock);
return snapc;
}
+static u64 get_writepages_data_length(struct inode *inode,
+ struct page *page, u64 start)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_cap_snap *capsnap = NULL;
+ u64 end = i_size_read(inode);
+ u64 ret;
+
+ snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
+ if (snapc != ci->i_head_snapc) {
+ bool found = false;
+ spin_lock(&ci->i_ceph_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ if (!capsnap->writing)
+ end = capsnap->size;
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ WARN_ON(!found);
+ }
+ if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
+ end = ceph_fscrypt_page_offset(page) + thp_size(page);
+ ret = end > start ? end - start : 0;
+ if (ret && fscrypt_is_bounce_page(page))
+ ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
+ return ret;
+}
+
/*
- * Write a single page, but leave the page locked.
+ * Write a folio, but leave it locked.
*
- * If we get a write error, set the page error bit, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
+ * If we get a write error, mark the mapping for error, but still adjust the
+ * dirty page accounting (i.e., folio is no longer dirty).
*/
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+static int write_folio_nounlock(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
- struct ceph_fs_client *fsc;
- struct ceph_osd_client *osdc;
+ struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
- loff_t page_off = page_offset(page);
- long writeback_stat;
- u64 truncate_size, snap_size = 0;
- u32 truncate_seq;
- int err = 0, len = PAGE_CACHE_SIZE;
+ loff_t page_off = folio_pos(folio);
+ int err;
+ loff_t len = folio_size(folio);
+ loff_t wlen;
+ struct ceph_writeback_ctl ceph_wbc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_osd_request *req;
+ bool caching = ceph_is_cache_enabled(inode);
+ struct page *bounce_page = NULL;
- dout("writepage %p idx %lu\n", page, page->index);
+ doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
+ folio->index);
- if (!page->mapping || !page->mapping->host) {
- dout("writepage %p - no mapping\n", page);
- return -EFAULT;
- }
- inode = page->mapping->host;
- ci = ceph_inode(inode);
- fsc = ceph_inode_to_client(inode);
- osdc = &fsc->client->osdc;
+ if (ceph_inode_is_shutdown(inode))
+ return -EIO;
/* verify this is a writeable snap context */
- snapc = page_snap_context(page);
- if (snapc == NULL) {
- dout("writepage %p page %p not dirty?\n", inode, page);
- goto out;
+ snapc = page_snap_context(&folio->page);
+ if (!snapc) {
+ doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
+ folio);
+ return 0;
}
- oldest = get_oldest_context(inode, &snap_size);
+ oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, snapc);
+ doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), folio, snapc);
/* we should only noop if called by kswapd */
- WARN_ON((current->flags & PF_MEMALLOC) == 0);
+ WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
- goto out;
+ folio_redirty_for_writepage(wbc, folio);
+ return 0;
}
ceph_put_snap_context(oldest);
- spin_lock(&ci->i_ceph_lock);
- truncate_seq = ci->i_truncate_seq;
- truncate_size = ci->i_truncate_size;
- if (!snap_size)
- snap_size = i_size_read(inode);
- spin_unlock(&ci->i_ceph_lock);
-
/* is this a partial page at end of file? */
- if (page_off >= snap_size) {
- dout("%p page eof %llu\n", page, snap_size);
- goto out;
+ if (page_off >= ceph_wbc.i_size) {
+ doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
+ ceph_vinop(inode), folio->index, ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
+ return 0;
}
- if (snap_size < page_off + len)
- len = snap_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
- inode, page, page->index, page_off, len, snapc);
+ if (ceph_wbc.i_size < page_off + len)
+ len = ceph_wbc.i_size - page_off;
+
+ wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
+ doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
+ snapc->seq);
- writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat >
+ if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
-
- set_page_writeback(page);
- err = ceph_osdc_writepages(osdc, ceph_vino(inode),
- &ci->i_layout, snapc,
- page_off, len,
- truncate_seq, truncate_size,
- &inode->i_mtime, &page, 1);
+ fsc->write_congested = true;
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+ page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, true);
+ if (IS_ERR(req)) {
+ folio_redirty_for_writepage(wbc, folio);
+ return PTR_ERR(req);
+ }
+
+ if (wlen < len)
+ len = wlen;
+
+ folio_start_writeback(folio);
+ if (caching)
+ ceph_set_page_fscache(&folio->page);
+ ceph_fscache_write_to_cache(inode, page_off, len, caching);
+
+ if (IS_ENCRYPTED(inode)) {
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
+ CEPH_FSCRYPT_BLOCK_SIZE, 0,
+ GFP_NOFS);
+ if (IS_ERR(bounce_page)) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
+ ceph_osdc_put_request(req);
+ return PTR_ERR(bounce_page);
+ }
+ }
+
+ /* it may be a short write due to an object boundary */
+ WARN_ON_ONCE(len > folio_size(folio));
+ osd_req_op_extent_osd_data_pages(req, 0,
+ bounce_page ? &bounce_page : &page, wlen, 0,
+ false, false);
+ doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
+ ceph_vinop(inode), page_off, len, wlen,
+ IS_ENCRYPTED(inode) ? "" : "not ");
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(osdc, req);
+ err = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, err);
+ fscrypt_free_bounce_page(bounce_page);
+ ceph_osdc_put_request(req);
+ if (err == 0)
+ err = len;
+
if (err < 0) {
- dout("writepage setting page/mapping error %d %p\n", err, page);
- SetPageError(page);
+ struct writeback_control tmp_wbc;
+ if (!wbc)
+ wbc = &tmp_wbc;
+ if (err == -ERESTARTSYS) {
+ /* killed by SIGKILL */
+ doutc(cl, "%llx.%llx interrupted page %p\n",
+ ceph_vinop(inode), folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
+ return err;
+ }
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+ doutc(cl, "%llx.%llx setting mapping error %d %p\n",
+ ceph_vinop(inode), err, folio);
mapping_set_error(&inode->i_data, err);
- if (wbc)
- wbc->pages_skipped++;
+ wbc->pages_skipped++;
} else {
- dout("writepage cleaned page %p\n", page);
+ doutc(cl, "%llx.%llx cleaned page %p\n",
+ ceph_vinop(inode), folio);
err = 0; /* vfs expects us to return 0 */
}
- page->private = 0;
- ClearPagePrivate(page);
- end_page_writeback(page);
+ oldest = folio_detach_private(folio);
+ WARN_ON_ONCE(oldest != snapc);
+ folio_end_writeback(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
-out:
- return err;
-}
-
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err;
- struct inode *inode = page->mapping->host;
- BUG_ON(!inode);
- ihold(inode);
- err = writepage_nounlock(page, wbc);
- unlock_page(page);
- iput(inode);
- return err;
-}
-
-/*
- * lame release_pages helper. release_pages() isn't exported to
- * modules.
- */
-static void ceph_release_pages(struct page **pages, int num)
-{
- struct pagevec pvec;
- int i;
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+ fsc->write_congested = false;
- pagevec_init(&pvec, 0);
- for (i = 0; i < num; i++) {
- if (pagevec_add(&pvec, pages[i]) == 0)
- pagevec_release(&pvec);
- }
- pagevec_release(&pvec);
+ return err;
}
-
/*
* async writeback completion handler.
*
* If we get an error, set the mapping error bit, but not the individual
* page error bits.
*/
-static void writepages_finish(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data;
- unsigned wrote;
struct page *page;
- int num_pages;
- int i;
+ int num_pages, total_pages = 0;
+ int i, j;
+ int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- int rc = req->r_result;
- u64 bytes = req->r_ops[0].extent.length;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- long writeback_stat;
- unsigned issued = ceph_caps_issued(ci);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ unsigned int len = 0;
+ bool remove_page;
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- if (rc >= 0) {
- /*
- * Assume we wrote the pages we originally sent. The
- * osd might reply with fewer pages if our writeback
- * raced with a truncation and was adjusted at the osd,
- * so don't believe the reply.
- */
- wrote = num_pages;
- } else {
- wrote = 0;
+ doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
+ if (rc < 0) {
mapping_set_error(mapping, rc);
+ ceph_set_error_write(ci);
+ if (rc == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+ } else {
+ ceph_clear_error_write(ci);
}
- dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
- inode, rc, bytes, wrote);
+
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ remove_page = !(ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
/* clean all pages */
- for (i = 0; i < num_pages; i++) {
- page = osd_data->pages[i];
- BUG_ON(!page);
- WARN_ON(!PageUptodate(page));
-
- writeback_stat =
- atomic_long_dec_return(&fsc->writeback_count);
- if (writeback_stat <
- CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
-
- ceph_put_snap_context(page_snap_context(page));
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %d %p\n", i, page);
- end_page_writeback(page);
+ for (i = 0; i < req->r_num_ops; i++) {
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn_client(cl,
+ "%llx.%llx incorrect op %d req %p index %d tid %llu\n",
+ ceph_vinop(inode), req->r_ops[i].op, req, i,
+ req->r_tid);
+ break;
+ }
- /*
- * We lost the cache cap, need to truncate the page before
- * it is unlocked, otherwise we'd truncate it later in the
- * page truncation thread, possibly losing some data that
- * raced its way in
- */
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
- generic_error_remove_page(inode->i_mapping, page);
+ osd_data = osd_req_op_extent_osd_data(req, i);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ len += osd_data->length;
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ total_pages += num_pages;
+ for (j = 0; j < num_pages; j++) {
+ page = osd_data->pages[j];
+ if (fscrypt_is_bounce_page(page)) {
+ page = fscrypt_pagecache_page(page);
+ fscrypt_free_bounce_page(osd_data->pages[j]);
+ osd_data->pages[j] = page;
+ }
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
- unlock_page(page);
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = false;
+
+ ceph_put_snap_context(detach_page_private(page));
+ end_page_writeback(page);
+
+ if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
+ wake_up_all(&mdsc->flush_end_wq);
+ WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
+ }
+
+ doutc(cl, "unlocking %p\n", page);
+
+ if (remove_page)
+ generic_error_remove_folio(inode->i_mapping,
+ page_folio(page));
+
+ unlock_page(page);
+ }
+ doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
+ ceph_vinop(inode), osd_data->length,
+ rc >= 0 ? num_pages : 0);
+
+ release_pages(osd_data->pages, num_pages);
}
- dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(osd_data->pages, num_pages);
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+
+ ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
+
+ osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
- mempool_free(osd_data->pages,
- ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+ mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
+static inline
+bool is_forced_umount(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_vino vino = ceph_vino(inode);
- pgoff_t index, start, end;
- int range_whole = 0;
- int should_loop = 1;
- pgoff_t max_pages = 0, max_pages_ever = 0;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
- struct pagevec pvec;
- int done = 0;
- int rc = 0;
- unsigned wsize = 1 << inode->i_blkbits;
- struct ceph_osd_request *req = NULL;
- int do_sync;
- u64 truncate_size, snap_size;
- u32 truncate_seq;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+
+ if (ceph_inode_is_shutdown(inode)) {
+ if (ci->i_wrbuffer_ref > 0) {
+ pr_warn_ratelimited_client(cl,
+ "%llx.%llx %lld forced umount\n",
+ ceph_vinop(inode), ceph_ino(inode));
+ }
+ mapping_set_error(mapping, -EIO);
+ return true;
+ }
- /*
- * Include a 'sync' in the OSD request if this is a data
- * integrity write (e.g., O_SYNC write or fsync()), or if our
- * cap is being revoked.
- */
- if ((wbc->sync_mode == WB_SYNC_ALL) ||
- ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
- do_sync = 1;
- dout("writepages_start %p dosync=%d (mode=%s)\n",
- inode, do_sync,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
-
- if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
- return -EIO; /* we're in a forced umount, don't write! */
- }
- if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+ return false;
+}
+
+static inline
+unsigned int ceph_define_write_size(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ unsigned int wsize = i_blocksize(inode);
+
+ if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- if (wsize < PAGE_CACHE_SIZE)
- wsize = PAGE_CACHE_SIZE;
- max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
- pagevec_init(&pvec, 0);
+ return wsize;
+}
- /* where to start/end? */
- if (wbc->range_cyclic) {
- start = mapping->writeback_index; /* Start from prev offset */
- end = -1;
- dout(" cyclic, start at %lu\n", start);
- } else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- should_loop = 0;
- dout(" not cyclic, %lu to %lu\n", start, end);
- }
- index = start;
+static inline
+void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_init(&ceph_wbc->fbatch);
+ ceph_wbc->processed_in_fbatch = 0;
+}
+
+static inline
+void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_release(&ceph_wbc->fbatch);
+ ceph_folio_batch_init(ceph_wbc);
+}
+
+static inline
+void ceph_init_writeback_ctl(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ ceph_wbc->snapc = NULL;
+ ceph_wbc->last_snapc = NULL;
+
+ ceph_wbc->strip_unit_end = 0;
+ ceph_wbc->wsize = ceph_define_write_size(mapping);
+
+ ceph_wbc->nr_folios = 0;
+ ceph_wbc->max_pages = 0;
+ ceph_wbc->locked_pages = 0;
+
+ ceph_wbc->done = false;
+ ceph_wbc->should_loop = false;
+ ceph_wbc->range_whole = false;
+
+ ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
+
+ ceph_wbc->tag = wbc_to_tag(wbc);
+
+ ceph_wbc->op_idx = -1;
+ ceph_wbc->num_ops = 0;
+ ceph_wbc->offset = 0;
+ ceph_wbc->len = 0;
+ ceph_wbc->from_pool = false;
+
+ ceph_folio_batch_init(ceph_wbc);
+
+ ceph_wbc->pages = NULL;
+ ceph_wbc->data_pages = NULL;
+}
+
+static inline
+int ceph_define_writeback_range(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
-retry:
/* find oldest snap context with dirty data */
- ceph_put_snap_context(snapc);
- snap_size = 0;
- snapc = get_oldest_context(inode, &snap_size);
- if (!snapc) {
+ ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
+ if (!ceph_wbc->snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
- dout(" no snap context with dirty data?\n");
- goto out;
+ doutc(cl, " no snap context with dirty data?\n");
+ return -ENODATA;
}
- if (snap_size == 0)
- snap_size = i_size_read(inode);
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
- spin_lock(&ci->i_ceph_lock);
- truncate_seq = ci->i_truncate_seq;
- truncate_size = ci->i_truncate_size;
- if (!snap_size)
- snap_size = i_size_read(inode);
- spin_unlock(&ci->i_ceph_lock);
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
+ ceph_wbc->snapc, ceph_wbc->snapc->seq,
+ ceph_wbc->snapc->num_snaps);
+
+ ceph_wbc->should_loop = false;
+
+ if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
+ doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
+ } else {
+ ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
+ ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ ceph_wbc->range_whole = true;
+ doutc(cl, " not cyclic, %lu to %lu\n",
+ ceph_wbc->index, ceph_wbc->end);
+ }
+ } else if (!ceph_wbc->head_snapc) {
+ /* Do not respect wbc->range_{start,end}. Dirty pages
+ * in that range can be associated with newer snapc.
+ * They are not writeable until we write all dirty pages
+ * associated with 'snapc' get written */
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
+ doutc(cl, " non-head snapc, range whole\n");
+ }
- if (last_snapc && snapc != last_snapc) {
- /* if we switched to a newer snapc, restart our scan at the
- * start of the original file range. */
- dout(" snapc differs from last pass, restarting at %lu\n",
- index);
- index = start;
- }
- last_snapc = snapc;
-
- while (!done && index <= end) {
- int num_ops = do_sync ? 2 : 1;
- unsigned i;
- int first;
- pgoff_t next;
- int pvec_pages, locked_pages;
- struct page **pages = NULL;
- mempool_t *pool = NULL; /* Becomes non-null if mempool used */
- struct page *page;
- int want;
- u64 offset, len;
- long writeback_stat;
+ ceph_put_snap_context(ceph_wbc->last_snapc);
+ ceph_wbc->last_snapc = ceph_wbc->snapc;
- next = 0;
- locked_pages = 0;
- max_pages = max_pages_ever;
+ return 0;
+}
-get_more_pages:
- first = -1;
- want = min(end - index,
- min((pgoff_t)PAGEVEC_SIZE,
- max_pages - (pgoff_t)locked_pages) - 1)
- + 1;
- pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- want);
- dout("pagevec_lookup_tag got %d\n", pvec_pages);
- if (!pvec_pages && !locked_pages)
- break;
- for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
- page = pvec.pages[i];
- dout("? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
- else if (!trylock_page(page))
- break;
+static inline
+bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
+}
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
- unlock_page(page);
- break;
- }
- if (!wbc->range_cyclic && page->index > end) {
- dout("end of range %p\n", page);
- done = 1;
- unlock_page(page);
- break;
- }
- if (next && (page->index != next)) {
- dout("not consecutive %p\n", page);
- unlock_page(page);
- break;
- }
- if (wbc->sync_mode != WB_SYNC_NONE) {
- dout("waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- }
- if (page_offset(page) >= snap_size) {
- dout("%p page eof %llu\n", page, snap_size);
- done = 1;
- unlock_page(page);
- break;
- }
- if (PageWriteback(page)) {
- dout("%p under writeback\n", page);
- unlock_page(page);
- break;
+static inline
+bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned index)
+{
+ return index < ceph_wbc->nr_folios &&
+ ceph_wbc->locked_pages < ceph_wbc->max_pages;
+}
+
+static
+int ceph_check_page_before_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_snap_context *pgsnapc;
+
+ /* only dirty folios, or our accounting breaks */
+ if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) {
+ doutc(cl, "!dirty or !mapping %p\n", folio);
+ return -ENODATA;
+ }
+
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(&folio->page);
+ if (pgsnapc != ceph_wbc->snapc) {
+ doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq,
+ ceph_wbc->snapc, ceph_wbc->snapc->seq);
+
+ if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ ceph_wbc->should_loop = true;
+
+ return -ENODATA;
+ }
+
+ if (folio_pos(folio) >= ceph_wbc->i_size) {
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc->i_size);
+
+ if ((ceph_wbc->size_stable ||
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0, folio_size(folio));
+
+ return -ENODATA;
+ }
+
+ if (ceph_wbc->strip_unit_end &&
+ (folio->index > ceph_wbc->strip_unit_end)) {
+ doutc(cl, "end of strip unit %p\n", folio);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static inline
+void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned int max_pages)
+{
+ ceph_wbc->pages = kmalloc_array(max_pages,
+ sizeof(*ceph_wbc->pages),
+ GFP_NOFS);
+ if (!ceph_wbc->pages) {
+ ceph_wbc->from_pool = true;
+ ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
+ BUG_ON(!ceph_wbc->pages);
+ }
+}
+
+static inline
+void ceph_allocate_page_array(struct address_space *mapping,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objnum;
+ u64 objoff;
+ u32 xlen;
+
+ /* prepare async write request */
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ ceph_wbc->offset, ceph_wbc->wsize,
+ &objnum, &objoff, &xlen);
+
+ ceph_wbc->num_ops = 1;
+ ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT);
+
+ BUG_ON(ceph_wbc->pages);
+ ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen);
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages);
+
+ ceph_wbc->len = 0;
+}
+
+static inline
+bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
+ const struct folio *folio)
+{
+ return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
+}
+
+static inline
+bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->num_ops >=
+ (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
+}
+
+static inline
+bool is_write_congestion_happened(struct ceph_fs_client *fsc)
+{
+ return atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
+}
+
+static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct page **pages = ceph_wbc->pages;
+ unsigned int index = ceph_wbc->locked_pages;
+ gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
+
+ if (IS_ENCRYPTED(inode)) {
+ pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
+ PAGE_SIZE,
+ 0,
+ gfp_flags);
+ if (IS_ERR(pages[index])) {
+ int err = PTR_ERR(pages[index]);
+
+ if (err == -EINVAL) {
+ pr_err_client(cl, "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
}
- /* only if matching snap context */
- pgsnapc = page_snap_context(page);
- if (pgsnapc->seq > snapc->seq) {
- dout("page snapc %p %lld > oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- unlock_page(page);
- if (!locked_pages)
- continue; /* keep looking for snap */
+ /* better not fail on first page! */
+ BUG_ON(ceph_wbc->locked_pages == 0);
+
+ pages[index] = NULL;
+ return err;
+ }
+ } else {
+ pages[index] = &folio->page;
+ }
+
+ ceph_wbc->locked_pages++;
+
+ return 0;
+}
+
+static
+int ceph_process_folio_batch(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct folio *folio = NULL;
+ unsigned i;
+ int rc = 0;
+
+ for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) {
+ folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ doutc(cl, "? %p idx %lu, folio_test_writeback %#x, "
+ "folio_test_dirty %#x, folio_test_locked %#x\n",
+ folio, folio->index, folio_test_writeback(folio),
+ folio_test_dirty(folio),
+ folio_test_locked(folio));
+
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
+ continue;
+ }
+
+ if (ceph_wbc->locked_pages == 0)
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
+ break;
+
+ rc = ceph_check_page_before_write(mapping, wbc,
+ ceph_wbc, folio);
+ if (rc == -ENODATA) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ } else if (rc == -E2BIG) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ break;
+ }
+
+ if (!folio_clear_dirty_for_io(folio)) {
+ doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ }
+
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * calculate max possible write size and
+ * allocate a page array
+ */
+ if (ceph_wbc->locked_pages == 0) {
+ ceph_allocate_page_array(mapping, ceph_wbc, folio);
+ } else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
+ if (is_num_ops_too_big(ceph_wbc)) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
break;
}
- if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
+ ceph_wbc->num_ops++;
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_wbc->len = 0;
+ }
+
+ /* note position of first page in fbatch */
+ doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
+ ceph_vinop(inode), folio, folio->index);
+
+ fsc->write_congested = is_write_congestion_happened(fsc);
+
+ rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
+ folio);
+ if (rc) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ break;
+ }
+
+ ceph_wbc->fbatch.folios[i] = NULL;
+ ceph_wbc->len += folio_size(folio);
+ }
+
+ ceph_wbc->processed_in_fbatch = i;
+
+ return rc;
+}
+
+static inline
+void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
+{
+ unsigned j, n = 0;
+
+ /* shift unused page to beginning of fbatch */
+ for (j = 0; j < folio_batch_count(fbatch); j++) {
+ if (!fbatch->folios[j])
+ continue;
+
+ if (n < j) {
+ fbatch->folios[n] = fbatch->folios[j];
+ }
+
+ n++;
+ }
+
+ fbatch->nr = n;
+}
+
+static
+int ceph_submit_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct ceph_osd_request *req = NULL;
+ struct page *page = NULL;
+ bool caching = ceph_is_cache_enabled(inode);
+ u64 offset;
+ u64 len;
+ unsigned i;
+
+new_request:
+ offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]);
+ len = ceph_wbc->wsize;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, ceph_wbc->num_ops,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc, ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size, false);
+ if (IS_ERR(req)) {
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(ceph_wbc->num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc,
+ ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size,
+ true);
+ BUG_ON(IS_ERR(req));
+ }
+
+ page = ceph_wbc->pages[ceph_wbc->locked_pages - 1];
+ BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) {
+ struct folio *folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ page = &folio->page;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ }
+
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+
+ if (!page)
+ continue;
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ }
+
+ ceph_osdc_put_request(req);
+ return -EIO;
+ }
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ ceph_wbc->op_idx = 0;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ u64 cur_offset;
+
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+ cur_offset = page_offset(page);
+
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
+ if (offset + len != cur_offset) {
+ /* If it's full, stop here */
+ if (ceph_wbc->op_idx + 1 == req->r_num_ops)
break;
- }
- /*
- * We have something to write. If this is
- * the first locked page this time through,
- * allocate an osd request and a page array
- * that it will use.
- */
- if (locked_pages == 0) {
- BUG_ON(pages);
- /* prepare async write request */
- offset = (u64)page_offset(page);
- len = wsize;
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, num_ops,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, truncate_seq,
- truncate_size, true);
- if (IS_ERR(req)) {
- rc = PTR_ERR(req);
- unlock_page(page);
- break;
- }
-
- req->r_callback = writepages_finish;
- req->r_inode = inode;
-
- max_pages = calc_pages_for(0, (u64)len);
- pages = kmalloc(max_pages * sizeof (*pages),
- GFP_NOFS);
- if (!pages) {
- pool = fsc->wb_pagevec_pool;
- pages = mempool_alloc(pool, GFP_NOFS);
- BUG_ON(!pages);
- }
- }
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
- /* note position of first page in pvec */
- if (first < 0)
- first = i;
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
-
- writeback_stat =
- atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat > CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
- }
+ /* Start a new extent */
+ osd_req_op_extent_dup_last(req, ceph_wbc->op_idx,
+ cur_offset - offset);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages,
+ len, 0,
+ ceph_wbc->from_pool,
+ false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
- set_page_writeback(page);
- pages[locked_pages] = page;
- locked_pages++;
- next = page->index + 1;
+ len = 0;
+ offset = cur_offset;
+ ceph_wbc->data_pages = ceph_wbc->pages + i;
+ ceph_wbc->op_idx++;
}
- /* did we get anything? */
- if (!locked_pages)
- goto release_pvec_pages;
- if (i) {
- int j;
- BUG_ON(!locked_pages || first < 0);
-
- if (pvec_pages && i == pvec_pages &&
- locked_pages < max_pages) {
- dout("reached end pvec, trying for more\n");
- pagevec_reinit(&pvec);
- goto get_more_pages;
- }
+ set_page_writeback(page);
+
+ if (caching)
+ ceph_set_page_fscache(page);
+
+ len += thp_size(page);
+ }
+
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ if (ceph_wbc->size_stable) {
+ len = min(len, ceph_wbc->i_size - offset);
+ } else if (i == ceph_wbc->locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - thp_size(page);
+ len = get_writepages_data_length(inode,
+ ceph_wbc->pages[i - 1],
+ offset);
+ len = max(len, min_len);
+ }
+
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
+ }
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages, len,
+ 0, ceph_wbc->from_pool, false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops);
+
+ ceph_wbc->from_pool = false;
+ if (i < ceph_wbc->locked_pages) {
+ BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
+ ceph_wbc->num_ops -= req->r_num_ops;
+ ceph_wbc->locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages);
+ memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ memset(ceph_wbc->data_pages + i, 0,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ } else {
+ BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
+ /* request message now owns the pages array */
+ ceph_wbc->pages = NULL;
+ }
- /* shift unused pages over in the pvec... we
- * will need to release them below. */
- for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n",
- pvec.pages[j]);
- pvec.pages[j-i+first] = pvec.pages[j];
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ req = NULL;
+
+ wbc->nr_to_write -= i;
+ if (ceph_wbc->pages)
+ goto new_request;
+
+ return 0;
+}
+
+static
+void ceph_wait_until_current_writes_complete(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct page *page;
+ unsigned i, nr;
+
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ ceph_wbc->start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc->head_snapc) {
+ ceph_wbc->index = 0;
+
+ while ((ceph_wbc->index <= ceph_wbc->end) &&
+ (nr = filemap_get_folios_tag(mapping,
+ &ceph_wbc->index,
+ (pgoff_t)-1,
+ PAGECACHE_TAG_WRITEBACK,
+ &ceph_wbc->fbatch))) {
+ for (i = 0; i < nr; i++) {
+ page = &ceph_wbc->fbatch.folios[i]->page;
+ if (page_snap_context(page) != ceph_wbc->snapc)
+ continue;
+ wait_on_page_writeback(page);
}
- pvec.nr -= i-first;
+
+ folio_batch_release(&ceph_wbc->fbatch);
+ cond_resched();
}
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_writeback_ctl ceph_wbc;
+ int rc = 0;
+
+ if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
+ return 0;
+
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ if (is_forced_umount(mapping)) {
+ /* we're in a forced umount, don't write! */
+ return -EIO;
+ }
+
+ ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto out;
+ }
- /* Format the osd request message and submit the write */
+retry:
+ rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
+ if (rc == -ENODATA) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ rc = 0;
+ goto dec_osd_stopping_blocker;
+ }
- offset = page_offset(pages[0]);
- len = min(snap_size - offset,
- (u64)locked_pages << PAGE_CACHE_SHIFT);
- dout("writepages got %d pages at %llu~%llu\n",
- locked_pages, offset, len);
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
- !!pool, false);
+ while (!has_writeback_done(&ceph_wbc)) {
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
- pages = NULL; /* request message now owns the pages array */
- pool = NULL;
+get_more_pages:
+ ceph_folio_batch_reinit(&ceph_wbc);
- /* Update the write op length in case we changed it */
+ ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
+ &ceph_wbc.index,
+ ceph_wbc.end,
+ ceph_wbc.tag,
+ &ceph_wbc.fbatch);
+ doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
+ ceph_wbc.tag, ceph_wbc.nr_folios);
- osd_req_op_extent_update(req, 0, len);
+ if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
+ break;
- vino = ceph_vino(inode);
- ceph_osdc_build_request(req, offset, snapc, vino.snap,
- &inode->i_mtime);
+process_folio_batch:
+ rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
+ if (rc)
+ goto release_folios;
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
- req = NULL;
+ /* did we get anything? */
+ if (!ceph_wbc.locked_pages)
+ goto release_folios;
+
+ if (ceph_wbc.processed_in_fbatch) {
+ if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
+ ceph_wbc.locked_pages < ceph_wbc.max_pages) {
+ doutc(cl, "reached end fbatch, trying for more\n");
+ goto get_more_pages;
+ }
+ }
+
+ rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
- /* continue? */
- index = next;
- wbc->nr_to_write -= locked_pages;
- if (wbc->nr_to_write <= 0)
- done = 1;
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.strip_unit_end = 0;
-release_pvec_pages:
- dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
- pvec.nr ? pvec.pages[0] : NULL);
- pagevec_release(&pvec);
+ if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
+ ceph_wbc.nr_folios =
+ folio_batch_count(&ceph_wbc.fbatch);
+ goto process_folio_batch;
+ }
- if (locked_pages && !done)
- goto retry;
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
+ ceph_wbc.done = true;
+
+release_folios:
+ doutc(cl, "folio_batch release on %d folios (%p)\n",
+ (int)ceph_wbc.fbatch.nr,
+ ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
+ folio_batch_release(&ceph_wbc.fbatch);
}
- if (should_loop && !done) {
+ if (ceph_wbc.should_loop && !ceph_wbc.done) {
/* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
- should_loop = 0;
- index = 0;
+ doutc(cl, "looping back to beginning of file\n");
+ /* OK even when start_index == 0 */
+ ceph_wbc.end = ceph_wbc.start_index - 1;
+
+ /* to write dirty pages associated with next snapc,
+ * we need to wait until current writes complete */
+ ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
+
+ ceph_wbc.start_index = 0;
+ ceph_wbc.index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = ceph_wbc.index;
+
+dec_osd_stopping_blocker:
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
out:
- if (req)
- ceph_osdc_put_request(req);
- ceph_put_snap_context(snapc);
- dout("writepages done, rc = %d\n", rc);
+ ceph_put_snap_context(ceph_wbc.last_snapc);
+ doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
+ rc);
+
return rc;
}
-
-
/*
* See if a given @snapc is either writeable, or already written.
*/
static int context_is_writeable_or_written(struct inode *inode,
struct ceph_snap_context *snapc)
{
- struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
int ret = !oldest || snapc->seq <= oldest->seq;
ceph_put_snap_context(oldest);
return ret;
}
-/*
- * We are only allowed to write into/dirty the page if the page is
- * clean, or already dirty within the same snap context.
+/**
+ * ceph_find_incompatible - find an incompatible context and return it
+ * @folio: folio being dirtied
+ *
+ * We are only allowed to write into/dirty a folio if the folio is
+ * clean, or already dirty within the same snap context. Returns a
+ * conflicting context if there is one, NULL if there isn't, or a
+ * negative error code on other errors.
*
- * called with page locked.
- * return success with page locked,
- * or any failure (incl -EAGAIN) with page unlocked.
+ * Must be called with folio lock held.
*/
-static int ceph_update_writeable_page(struct file *file,
- loff_t pos, unsigned len,
- struct page *page)
+static struct ceph_snap_context *
+ceph_find_incompatible(struct folio *folio)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t page_off = pos & PAGE_CACHE_MASK;
- int pos_in_page = pos & ~PAGE_CACHE_MASK;
- int end_in_page = pos_in_page + len;
- loff_t i_size;
- int r;
- struct ceph_snap_context *snapc, *oldest;
-retry_locked:
- /* writepages currently holds page lock, but if we change that later, */
- wait_on_page_writeback(page);
+ if (ceph_inode_is_shutdown(inode)) {
+ doutc(cl, " %llx.%llx folio %p is shutdown\n",
+ ceph_vinop(inode), folio);
+ return ERR_PTR(-ESTALE);
+ }
+
+ for (;;) {
+ struct ceph_snap_context *snapc, *oldest;
+
+ folio_wait_writeback(folio);
+
+ snapc = page_snap_context(&folio->page);
+ if (!snapc || snapc == ci->i_head_snapc)
+ break;
- /* check snap context */
- BUG_ON(!ci->i_snap_realm);
- down_read(&mdsc->snap_rwsem);
- BUG_ON(!ci->i_snap_realm->cached_context);
- snapc = page_snap_context(page);
- if (snapc && snapc != ci->i_head_snapc) {
/*
- * this page is already dirty in another (older) snap
+ * this folio is already dirty in another (older) snap
* context! is it writeable now?
*/
- oldest = get_oldest_context(inode, NULL);
- up_read(&mdsc->snap_rwsem);
-
+ oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
+ /* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n",
- page, snapc);
- /*
- * queue for writeback, and wait for snapc to
- * be writeable or written
- */
- snapc = ceph_get_snap_context(snapc);
- unlock_page(page);
- ceph_queue_writeback(inode);
- r = wait_event_interruptible(ci->i_cap_wq,
- context_is_writeable_or_written(inode, snapc));
- ceph_put_snap_context(snapc);
- if (r == -ERESTARTSYS)
- return r;
- return -EAGAIN;
+ doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), folio, snapc);
+ return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
- /* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n",
- page, snapc);
- if (!clear_page_dirty_for_io(page))
- goto retry_locked;
- r = writepage_nounlock(page, NULL);
- if (r < 0)
- goto fail_nosnap;
- goto retry_locked;
- }
-
- if (PageUptodate(page)) {
- dout(" page %p already uptodate\n", page);
- return 0;
+ /* yay, writeable, do it now (without dropping folio lock) */
+ doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), folio, snapc);
+ if (folio_clear_dirty_for_io(folio)) {
+ int r = write_folio_nounlock(folio, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+ }
}
+ return NULL;
+}
- /* full page? */
- if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
- return 0;
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+ struct folio **foliop, void **_fsdata)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
- /* past end of file? */
- i_size = inode->i_size; /* caller holds i_mutex */
+ snapc = ceph_find_incompatible(*foliop);
+ if (snapc) {
+ int r;
- if (i_size + len > inode->i_sb->s_maxbytes) {
- /* file is too big */
- r = -EINVAL;
- goto fail;
- }
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
+ if (IS_ERR(snapc))
+ return PTR_ERR(snapc);
- if (page_off >= i_size ||
- (pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
- dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
- zero_user_segments(page,
- 0, pos_in_page,
- end_in_page, PAGE_CACHE_SIZE);
- return 0;
+ ceph_queue_writeback(inode);
+ r = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ return r == 0 ? -EAGAIN : r;
}
-
- /* we need to read it. */
- up_read(&mdsc->snap_rwsem);
- r = readpage_nounlock(file, page);
- if (r < 0)
- goto fail_nosnap;
- goto retry_locked;
-
-fail:
- up_read(&mdsc->snap_rwsem);
-fail_nosnap:
- unlock_page(page);
- return r;
+ return 0;
}
/*
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
*/
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+static int ceph_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct ceph_inode_info *ci = ceph_inode(inode);
int r;
- do {
- /* get a page */
- page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
- dout("write_begin file %p inode %p page %p %d~%d\n", file,
- inode, page, (int)pos, (int)len);
-
- r = ceph_update_writeable_page(file, pos, len, page);
- } while (r == -EAGAIN);
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL);
+ if (r < 0)
+ return r;
- return r;
+ folio_wait_private_2(*foliop); /* [DEPRECATED] */
+ WARN_ON_ONCE(!folio_test_locked(*foliop));
+ return 0;
}
/*
* we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
+ * except adjust dirty page accounting
*/
-static int ceph_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int ceph_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
- int check_cap = 0;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ bool check_cap = false;
- dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
- inode, page, (int)pos, (int)copied, (int)len);
+ doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
+ file, folio, (int)pos, (int)copied, (int)len);
- /* zero the stale part of the page if we did a short copy */
- if (copied < len)
- zero_user_segment(page, from+copied, len);
+ if (!folio_test_uptodate(folio)) {
+ /* just return that nothing was copied on a short copy */
+ if (copied < len) {
+ copied = 0;
+ goto out;
+ }
+ folio_mark_uptodate(folio);
+ }
/* did file size increase? */
- /* (no need for i_size_read(); we caller holds i_mutex */
- if (pos+copied > inode->i_size)
+ if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
- if (!PageUptodate(page))
- SetPageUptodate(page);
-
- set_page_dirty(page);
+ folio_mark_dirty(folio);
- unlock_page(page);
- up_read(&mdsc->snap_rwsem);
- page_cache_release(page);
+out:
+ folio_unlock(folio);
+ folio_put(folio);
if (check_cap)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);
return copied;
}
-/*
- * we set .direct_IO to indicate direct io is supported, but since we
- * intercept O_DIRECT reads and writes early, this function should
- * never get called.
- */
-static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
-{
- WARN_ON(1);
- return -EINVAL;
-}
-
const struct address_space_operations ceph_aops = {
- .readpage = ceph_readpage,
- .readpages = ceph_readpages,
- .writepage = ceph_writepage,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
- .releasepage = ceph_releasepage,
- .direct_IO = ceph_direct_io,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
+ .release_folio = netfs_release_folio,
+ .direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
};
+static void ceph_block_sigs(sigset_t *oldset)
+{
+ sigset_t mask;
+ siginitsetinv(&mask, sigmask(SIGKILL));
+ sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
/*
* vm ops
*/
+static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
+ int want, got, err;
+ sigset_t oldset;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
+ ceph_block_sigs(&oldset);
+
+ doutc(cl, "%llx.%llx %llu trying to get caps\n",
+ ceph_vinop(inode), off);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+
+ got = 0;
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
+ if (err < 0)
+ goto out_restore;
+
+ doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
+ off, ceph_cap_string(got));
+
+ if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
+ !ceph_has_inline_data(ci)) {
+ CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+ ceph_add_rw_context(fi, &rw_ctx);
+ ret = filemap_fault(vmf);
+ ceph_del_rw_context(fi, &rw_ctx);
+ doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
+ ceph_vinop(inode), off, ceph_cap_string(got), ret);
+ } else
+ err = -EAGAIN;
+
+ ceph_put_cap_refs(ci, got);
+
+ if (err != -EAGAIN)
+ goto out_restore;
+
+ /* read inline data */
+ if (off >= PAGE_SIZE) {
+ /* does not support inline data > PAGE_SIZE */
+ ret = VM_FAULT_SIGBUS;
+ } else {
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
-/*
- * Reuse write_begin here for simplicity.
- */
-static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+ filemap_invalidate_lock_shared(mapping);
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out_inline;
+ }
+ err = __ceph_do_getattr(inode, page,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (err < 0 || off >= i_size_read(inode)) {
+ unlock_page(page);
+ put_page(page);
+ ret = vmf_error(err);
+ goto out_inline;
+ }
+ if (err < PAGE_SIZE)
+ zero_user_segment(page, err, PAGE_SIZE);
+ else
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ vmf->page = page;
+ ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+ filemap_invalidate_unlock_shared(mapping);
+ doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
+ ceph_vinop(inode), off, ret);
+ }
+out_restore:
+ ceph_restore_sigs(&oldset);
+ if (err < 0)
+ ret = vmf_error(err);
+
+ return ret;
+}
+
+static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
- struct page *page = vmf->page;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- loff_t off = page_offset(page);
- loff_t size, len;
- int ret;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = page_folio(vmf->page);
+ loff_t off = folio_pos(folio);
+ loff_t size = i_size_read(inode);
+ size_t len;
+ int want, got, err;
+ sigset_t oldset;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
+
+ if (ceph_inode_is_shutdown(inode))
+ return ret;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return VM_FAULT_OOM;
+
+ sb_start_pagefault(inode->i_sb);
+ ceph_block_sigs(&oldset);
+
+ if (off + folio_size(folio) <= size)
+ len = folio_size(folio);
+ else
+ len = offset_in_folio(folio, size);
+
+ doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
+ ceph_vinop(inode), off, len, size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
- /* Update time before taking page lock */
+ got = 0;
+ err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
+ if (err < 0)
+ goto out_free;
+
+ doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
+ off, len, ceph_cap_string(got));
+
+ /* Update time before taking folio lock */
file_update_time(vma->vm_file);
+ inode_inc_iversion_raw(inode);
- size = i_size_read(inode);
- if (off + PAGE_CACHE_SIZE <= size)
- len = PAGE_CACHE_SIZE;
- else
- len = size & ~PAGE_CACHE_MASK;
+ do {
+ struct ceph_snap_context *snapc;
- dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
- off, len, page, page->index);
+ folio_lock(folio);
- lock_page(page);
+ if (folio_mkwrite_check_truncate(folio, inode) < 0) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ break;
+ }
- ret = VM_FAULT_NOPAGE;
- if ((off > size) ||
- (page->mapping != inode->i_mapping))
- goto out;
+ snapc = ceph_find_incompatible(folio);
+ if (!snapc) {
+ /* success. we'll keep the folio locked. */
+ folio_mark_dirty(folio);
+ ret = VM_FAULT_LOCKED;
+ break;
+ }
- ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (ret == 0) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
- up_read(&mdsc->snap_rwsem);
- ret = VM_FAULT_LOCKED;
+ folio_unlock(folio);
+
+ if (IS_ERR(snapc)) {
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ ceph_queue_writeback(inode);
+ err = wait_event_killable(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ } while (err == 0);
+
+ if (ret == VM_FAULT_LOCKED) {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
+ ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
+ ceph_put_cap_refs_async(ci, got);
+out_free:
+ ceph_restore_sigs(&oldset);
+ sb_end_pagefault(inode->i_sb);
+ ceph_free_cap_flush(prealloc_cf);
+ if (err < 0)
+ ret = vmf_error(err);
+ return ret;
+}
+
+void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ if (locked_page) {
+ page = locked_page;
} else {
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
+ if (i_size_read(inode) == 0)
+ return;
+ page = find_or_create_page(mapping, 0,
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
+ if (!page)
+ return;
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return;
+ }
+ }
+
+ doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
+ ceph_vinop(inode), len, locked_page);
+
+ if (len > 0) {
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr, data, len);
+ kunmap_atomic(kaddr);
+ }
+
+ if (page != locked_page) {
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
else
- ret = VM_FAULT_SIGBUS;
+ flush_dcache_page(page);
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+}
+
+int ceph_uninline_data(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_request *req = NULL;
+ struct ceph_cap_flush *prealloc_cf = NULL;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
+ int err = 0;
+ u64 len;
+
+ spin_lock(&ci->i_ceph_lock);
+ inline_version = ci->i_inline_version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
+ inline_version);
+
+ if (ceph_inode_is_shutdown(inode)) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (inline_version == CEPH_INLINE_NONE)
+ return 0;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ if (inline_version == 1) /* initial version, no data */
+ goto out_uninline;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
+ }
+
+ folio_lock(folio);
+
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 0, 1,
+ CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out_unlock;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_put_request(req);
+ if (err < 0)
+ goto out_unlock;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode), 0, &len, 1, 3,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out_unlock;
+ }
+
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
+
+ {
+ __le64 xattr_buf = cpu_to_le64(inline_version);
+ err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
+ "inline_version", &xattr_buf,
+ sizeof(xattr_buf),
+ CEPH_OSD_CMPXATTR_OP_GT,
+ CEPH_OSD_CMPXATTR_MODE_U64);
+ if (err)
+ goto out_put_req;
+ }
+
+ {
+ char xattr_buf[32];
+ int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
+ "%llu", inline_version);
+ err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
+ "inline_version",
+ xattr_buf, xattr_len, 0, 0);
+ if (err)
+ goto out_put_req;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, err);
+
+out_uninline:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
+ ceph_osdc_put_request(req);
+ if (err == -ECANCELED)
+ err = 0;
+out_unlock:
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
out:
- dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
- if (ret != VM_FAULT_LOCKED)
- unlock_page(page);
- return ret;
+ ceph_free_cap_flush(prealloc_cf);
+ doutc(cl, "%llx.%llx inline_version %llu = %d\n",
+ ceph_vinop(inode), inline_version, err);
+ return err;
}
-static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
+static const struct vm_operations_struct ceph_vmops = {
+ .fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+int ceph_mmap_prepare(struct vm_area_desc *desc)
{
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = desc->file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- file_accessed(file);
- vma->vm_ops = &ceph_vmops;
+ desc->vm_ops = &ceph_vmops;
return 0;
}
+
+enum {
+ POOL_READ = 1,
+ POOL_WRITE = 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+ struct rb_node **p, *parent;
+ struct ceph_pool_perm *perm;
+ struct page **pages;
+ size_t pool_ns_len;
+ int err = 0, err2 = 0, have = 0;
+
+ down_read(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ while (*p) {
+ perm = rb_entry(*p, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ }
+ up_read(&mdsc->pool_perm_rwsem);
+ if (*p)
+ goto out;
+
+ if (pool_ns)
+ doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
+ (int)pool_ns->len, pool_ns->str);
+ else
+ doutc(cl, "pool %lld no perm cached\n", pool);
+
+ down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ perm = rb_entry(parent, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ }
+ if (*p) {
+ up_write(&mdsc->pool_perm_rwsem);
+ goto out;
+ }
+
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
+ 1, false, GFP_NOFS);
+ if (!rd_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ rd_req->r_flags = CEPH_OSD_FLAG_READ;
+ osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+ rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
+ ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+ err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
+
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
+ 1, false, GFP_NOFS);
+ if (!wr_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
+ osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+ ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+ ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+ err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+ if (err)
+ goto out_unlock;
+
+ /* one page should be large enough for STAT data */
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ err = PTR_ERR(pages);
+ goto out_unlock;
+ }
+
+ osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+ 0, false, true);
+ ceph_osdc_start_request(&fsc->client->osdc, rd_req);
+
+ wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
+ ceph_osdc_start_request(&fsc->client->osdc, wr_req);
+
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+ if (err >= 0 || err == -ENOENT)
+ have |= POOL_READ;
+ else if (err != -EPERM) {
+ if (err == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+ goto out_unlock;
+ }
+
+ if (err2 == 0 || err2 == -EEXIST)
+ have |= POOL_WRITE;
+ else if (err2 != -EPERM) {
+ if (err2 == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+ err = err2;
+ goto out_unlock;
+ }
+
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(struct_size(perm, pool_ns, pool_ns_len + 1), GFP_NOFS);
+ if (!perm) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ perm->pool = pool;
+ perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
+ rb_link_node(&perm->node, parent, p);
+ rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+ err = 0;
+out_unlock:
+ up_write(&mdsc->pool_perm_rwsem);
+
+ ceph_osdc_put_request(rd_req);
+ ceph_osdc_put_request(wr_req);
+out:
+ if (!err)
+ err = have;
+ if (pool_ns)
+ doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
+ (int)pool_ns->len, pool_ns->str, err);
+ else
+ doutc(cl, "pool %lld result = %d\n", pool, err);
+ return err;
+}
+
+int ceph_pool_perm_check(struct inode *inode, int need)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_string *pool_ns;
+ s64 pool;
+ int ret, flags;
+
+ /* Only need to do this for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ if (ci->i_vino.snap != CEPH_NOSNAP) {
+ /*
+ * Pool permission check needs to write to the first object.
+ * But for snapshot, head of the first object may have already
+ * been deleted. Skip check to avoid creating orphan object.
+ */
+ return 0;
+ }
+
+ if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
+ NOPOOLPERM))
+ return 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ flags = ci->i_ceph_flags;
+ pool = ci->i_layout.pool_id;
+ spin_unlock(&ci->i_ceph_lock);
+check:
+ if (flags & CEPH_I_POOL_PERM) {
+ if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+ doutc(cl, "pool %lld no read perm\n", pool);
+ return -EPERM;
+ }
+ if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+ doutc(cl, "pool %lld no write perm\n", pool);
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
+ if (ret < 0)
+ return ret;
+
+ flags = CEPH_I_POOL_PERM;
+ if (ret & POOL_READ)
+ flags |= CEPH_I_POOL_RD;
+ if (ret & POOL_WRITE)
+ flags |= CEPH_I_POOL_WR;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
+ } else {
+ pool = ci->i_layout.pool_id;
+ flags = ci->i_ceph_flags;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+ struct ceph_pool_perm *perm;
+ struct rb_node *n;
+
+ while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+ n = rb_first(&mdsc->pool_perm_tree);
+ perm = rb_entry(n, struct ceph_pool_perm, node);
+ rb_erase(n, &mdsc->pool_perm_tree);
+ kfree(perm);
+ }
+}
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
new file mode 100644
index 000000000000..f678bab189d8
--- /dev/null
+++ b/fs/ceph/cache.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/fs_context.h>
+#include "super.h"
+#include "cache.h"
+
+void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+
+ /* No caching for filesystem? */
+ if (!fsc->fscache)
+ return;
+
+ /* Regular files only */
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ /* Only new inodes! */
+ if (!(inode_state_read_once(inode) & I_NEW))
+ return;
+
+ WARN_ON_ONCE(ci->netfs.cache);
+
+ ci->netfs.cache =
+ fscache_acquire_cookie(fsc->fscache, 0,
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
+ if (ci->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
+}
+
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
+{
+ fscache_relinquish_cookie(ceph_fscache_cookie(ci), false);
+}
+
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ fscache_use_cookie(ceph_fscache_cookie(ci), will_modify);
+}
+
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (update) {
+ loff_t i_size = i_size_read(inode);
+
+ fscache_unuse_cookie(ceph_fscache_cookie(ci),
+ &ci->i_version, &i_size);
+ } else {
+ fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL);
+ }
+}
+
+void ceph_fscache_update(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ loff_t i_size = i_size_read(inode);
+
+ fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size);
+}
+
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ fscache_invalidate(ceph_fscache_cookie(ci),
+ &ci->i_version, i_size_read(inode),
+ dio_write ? FSCACHE_INVAL_DIO_WRITE : 0);
+}
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc)
+{
+ const struct ceph_fsid *fsid = &fsc->client->fsid;
+ const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+ size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+ char *name;
+ int err = 0;
+
+ name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "",
+ uniq_len ? fscache_uniq : "");
+ if (!name)
+ return -ENOMEM;
+
+ fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(fsc->fscache)) {
+ errorfc(fc, "Unable to register fscache cookie for %s", name);
+ err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP;
+ fsc->fscache = NULL;
+ }
+ kfree(name);
+ return err;
+}
+
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+ fscache_relinquish_volume(fsc->fscache, NULL, false);
+}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
new file mode 100644
index 000000000000..20efac020394
--- /dev/null
+++ b/fs/ceph/cache.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Ceph cache definitions.
+ *
+ * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
+ * Written by Milosz Tanski (milosz@adfin.com)
+ */
+
+#ifndef _CEPH_CACHE_H
+#define _CEPH_CACHE_H
+
+#include <linux/netfs.h>
+
+#ifdef CONFIG_CEPH_FSCACHE
+#include <linux/fscache.h>
+
+int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc);
+void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
+
+void ceph_fscache_register_inode_cookie(struct inode *inode);
+void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
+
+void ceph_fscache_use_cookie(struct inode *inode, bool will_modify);
+void ceph_fscache_unuse_cookie(struct inode *inode, bool update);
+
+void ceph_fscache_update(struct inode *inode);
+void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
+
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+ return netfs_i_cookie(&ci->netfs);
+}
+
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
+
+ if (cookie) {
+ ceph_fscache_use_cookie(inode, true);
+ fscache_resize_cookie(cookie, to);
+ ceph_fscache_unuse_cookie(inode, true);
+ }
+}
+
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ return netfs_unpin_writeback(inode, wbc);
+}
+
+#define ceph_fscache_dirty_folio netfs_dirty_folio
+
+static inline bool ceph_is_cache_enabled(struct inode *inode)
+{
+ return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
+}
+
+#else /* CONFIG_CEPH_FSCACHE */
+static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
+ struct fs_context *fc)
+{
+ return 0;
+}
+
+static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
+{
+}
+
+static inline void ceph_fscache_register_inode_cookie(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
+{
+}
+
+static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify)
+{
+}
+
+static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update)
+{
+}
+
+static inline void ceph_fscache_update(struct inode *inode)
+{
+}
+
+static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write)
+{
+}
+
+static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
+{
+ return NULL;
+}
+
+static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
+{
+}
+
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ return 0;
+}
+
+#define ceph_fscache_dirty_folio filemap_dirty_folio
+
+static inline bool ceph_is_cache_enabled(struct inode *inode)
+{
+ return false;
+}
+#endif /* CONFIG_CEPH_FSCACHE */
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 25442b40c25a..b1a8ff612c41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,15 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>
+#include <linux/iversion.h>
+#include <linux/filelock.h>
+#include <linux/jiffies.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
@@ -39,6 +45,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -62,6 +73,8 @@ static char *gcap_string(char *s, int c)
*s++ = 'w';
if (c & CEPH_CAP_GBUFFER)
*s++ = 'b';
+ if (c & CEPH_CAP_GWREXTEND)
+ *s++ = 'a';
if (c & CEPH_CAP_GLAZYIO)
*s++ = 'l';
return s;
@@ -139,24 +152,69 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
+void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt)
{
spin_lock(&mdsc->caps_list_lock);
- mdsc->caps_min_count += delta;
- BUG_ON(mdsc->caps_min_count < 0);
+ mdsc->caps_min_count = fsopt->max_readdir;
+ if (mdsc->caps_min_count < 1024)
+ mdsc->caps_min_count = 1024;
+ mdsc->caps_use_max = fsopt->caps_max;
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_max < mdsc->caps_min_count)
+ mdsc->caps_use_max = mdsc->caps_min_count;
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_reserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx, int need)
+static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
{
+ struct ceph_cap *cap;
int i;
+
+ if (nr_caps) {
+ BUG_ON(mdsc->caps_reserve_count < nr_caps);
+ mdsc->caps_reserve_count -= nr_caps;
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ mdsc->caps_total_count -= nr_caps;
+ for (i = 0; i < nr_caps; i++) {
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ } else {
+ mdsc->caps_avail_count += nr_caps;
+ }
+
+ doutc(mdsc->fsc->client,
+ "caps %d = %d used + %d resv + %d avail\n",
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count +
+ mdsc->caps_avail_count);
+ }
+}
+
+/*
+ * Called under mdsc->mutex.
+ */
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx, int need)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int i, j;
struct ceph_cap *cap;
int have;
int alloc = 0;
+ int max_caps;
+ int err = 0;
+ bool trimmed = false;
+ struct ceph_mds_session *s;
LIST_HEAD(newcaps);
- dout("reserve caps ctx=%p need=%d\n", ctx, need);
+ doutc(cl, "ctx=%p need=%d\n", ctx, need);
/* first reserve any caps that are already allocated */
spin_lock(&mdsc->caps_list_lock);
@@ -171,17 +229,62 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc,
mdsc->caps_avail_count);
spin_unlock(&mdsc->caps_list_lock);
- for (i = have; i < need; i++) {
+ for (i = have; i < need; ) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap)
- break;
- list_add(&cap->caps_item, &newcaps);
- alloc++;
+ if (cap) {
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ i++;
+ continue;
+ }
+
+ if (!trimmed) {
+ for (j = 0; j < mdsc->max_sessions; j++) {
+ s = __ceph_lookup_mds_session(mdsc, j);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ max_caps = s->s_nr_caps - (need - i);
+ ceph_trim_caps(mdsc, s, max_caps);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ trimmed = true;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ int more_have;
+ if (mdsc->caps_avail_count >= need - i)
+ more_have = need - i;
+ else
+ more_have = mdsc->caps_avail_count;
+
+ i += more_have;
+ have += more_have;
+ mdsc->caps_avail_count -= more_have;
+ mdsc->caps_reserve_count += more_have;
+
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
+ continue;
+ }
+
+ pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
+ have + alloc);
+ err = -ENOMEM;
+ break;
+ }
+
+ if (!err) {
+ BUG_ON(have + alloc != need);
+ ctx->count = need;
+ ctx->used = 0;
}
- /* we didn't manage to reserve as much as we needed */
- if (have + alloc != need)
- pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -191,38 +294,44 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc,
BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
mdsc->caps_reserve_count +
mdsc->caps_avail_count);
+
+ if (err)
+ __ceph_unreserve_caps(mdsc, have + alloc);
+
spin_unlock(&mdsc->caps_list_lock);
- ctx->count = need;
- dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
- ctx, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
-}
-
-int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
-{
- dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
- if (ctx->count) {
- spin_lock(&mdsc->caps_list_lock);
- BUG_ON(mdsc->caps_reserve_count < ctx->count);
- mdsc->caps_reserve_count -= ctx->count;
- mdsc->caps_avail_count += ctx->count;
- ctx->count = 0;
- dout("unreserve caps %d = %d used + %d resv + %d avail\n",
- mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
- BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
- mdsc->caps_reserve_count +
- mdsc->caps_avail_count);
- spin_unlock(&mdsc->caps_list_lock);
- }
- return 0;
+ doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ return err;
+}
+
+void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ bool reclaim = false;
+ if (!ctx->count)
+ return;
+
+ doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
+ spin_lock(&mdsc->caps_list_lock);
+ __ceph_unreserve_caps(mdsc, ctx->count);
+ ctx->count = 0;
+
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ reclaim = true;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ if (reclaim)
+ ceph_reclaim_caps_nr(mdsc, ctx->used);
}
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap = NULL;
/* temporary, until we do something about cap import/export */
@@ -233,19 +342,36 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
mdsc->caps_use_count++;
mdsc->caps_total_count++;
spin_unlock(&mdsc->caps_list_lock);
+ } else {
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ BUG_ON(list_empty(&mdsc->caps_list));
+
+ mdsc->caps_avail_count--;
+ mdsc->caps_use_count++;
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ }
+ spin_unlock(&mdsc->caps_list_lock);
}
+
return cap;
}
spin_lock(&mdsc->caps_list_lock);
- dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
- ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
+ ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(!ctx->count);
BUG_ON(ctx->count > mdsc->caps_reserve_count);
BUG_ON(list_empty(&mdsc->caps_list));
ctx->count--;
+ ctx->used++;
mdsc->caps_reserve_count--;
mdsc->caps_use_count++;
@@ -260,10 +386,12 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+
spin_lock(&mdsc->caps_list_lock);
- dout("put_cap %p %d = %d used + %d resv + %d avail\n",
- cap, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
mdsc->caps_use_count--;
/*
* Keep some preallocated caps around (ceph_min_count), to
@@ -289,6 +417,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
{
struct ceph_mds_client *mdsc = fsc->mdsc;
+ spin_lock(&mdsc->caps_list_lock);
+
if (total)
*total = mdsc->caps_total_count;
if (avail)
@@ -299,6 +429,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
*reserved = mdsc->caps_reserve_count;
if (min)
*min = mdsc->caps_min_count;
+
+ spin_unlock(&mdsc->caps_list_lock);
}
/*
@@ -306,7 +438,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
*
* Called with i_ceph_lock held.
*/
-static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
struct ceph_cap *cap;
struct rb_node *n = ci->i_caps.rb_node;
@@ -334,37 +466,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
}
/*
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
- */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
-{
- struct ceph_cap *cap;
- int mds = -1;
- struct rb_node *p;
-
- /* prefer mds with WR|BUFFER|EXCL caps */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- cap = rb_entry(p, struct ceph_cap, ci_node);
- mds = cap->mds;
- if (cap->issued & (CEPH_CAP_FILE_WR |
- CEPH_CAP_FILE_BUFFER |
- CEPH_CAP_FILE_EXCL))
- break;
- }
- return mds;
-}
-
-int ceph_get_cap_mds(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int mds;
- spin_lock(&ci->i_ceph_lock);
- mds = __ceph_get_cap_mds(ceph_inode(inode));
- spin_unlock(&ci->i_ceph_lock);
- return mds;
-}
-
-/*
* Called under i_ceph_lock.
*/
static void __insert_cap_node(struct ceph_inode_info *ci,
@@ -396,14 +497,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_options *ma = mdsc->fsc->mount_options;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
- ci->i_hold_caps_min = round_jiffies(jiffies +
- ma->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
- ma->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
- ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+ opt->caps_wanted_delay_max * HZ);
+ doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
+ ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
}
/*
@@ -417,9 +517,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- __cap_set_timeouts(mdsc, ci);
- dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
- ci->i_ceph_flags, ci->i_hold_caps_max);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
+ inode, ceph_vinop(inode), ci->i_ceph_flags,
+ ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
if (!list_empty(&ci->i_cap_delay_list)) {
@@ -427,6 +529,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -441,7 +544,9 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -458,7 +563,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -466,83 +573,106 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
unsigned had = __ceph_caps_issued(ci, NULL);
+ lockdep_assert_held(&ci->i_ceph_lock);
+
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
+ }
/*
- * if we are newly issued FILE_SHARED, mark dir not complete; we
- * don't know what happened to this directory while we didn't
- * have the cap.
+ * If FILE_SHARED is newly issued, mark dir not complete. We don't
+ * know what happened to this directory while we didn't have the cap.
+ * If FILE_SHARED is being revoked, also mark dir not complete. It
+ * stops on-going cached readdir.
*/
- if ((issued & CEPH_CAP_FILE_SHARED) &&
- (had & CEPH_CAP_FILE_SHARED) == 0) {
- ci->i_shared_gen++;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
+ if (issued & CEPH_CAP_FILE_SHARED)
+ atomic_inc(&ci->i_shared_gen);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ doutc(cl, " marking %p NOT complete\n", inode);
__ceph_dir_clear_complete(ci);
}
}
+
+ /* Wipe saved layout if we're losing DIR_CREATE caps */
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ !(issued & CEPH_CAP_DIR_CREATE)) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+}
+
+/**
+ * change_auth_cap_ses - move inode to appropriate lists when auth caps change
+ * @ci: inode to be moved
+ * @session: new auth caps session
+ */
+void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
+{
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
+ return;
+
+ spin_lock(&session->s_mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item))
+ list_move(&ci->i_dirty_item, &session->s_cap_dirty);
+ if (!list_empty(&ci->i_flushing_item))
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ spin_unlock(&session->s_mdsc->cap_dirty_lock);
}
/*
* Add a capability under the given MDS session.
*
- * Caller should hold session snap_rwsem (read) and s_mutex.
+ * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
*
* @fmode is the open file mode, if we are opening a file, otherwise
* it is < 0. (This is so we can atomically add the cap and add an
* open file reference to it.)
*/
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
int mds = session->s_mds;
int actual_wanted;
+ u32 gen;
- dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
- session->s_mds, cap_id, ceph_cap_string(issued), seq);
+ lockdep_assert_held(&ci->i_ceph_lock);
- /*
- * If we are opening the file, include file mode wanted bits
- * in wanted.
- */
- if (fmode >= 0)
- wanted |= ceph_caps_for_mode(fmode);
+ doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
+ ceph_vinop(inode), session->s_mds, cap_id,
+ ceph_cap_string(issued), seq);
+
+ gen = atomic_read(&session->s_cap_gen);
-retry:
- spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- new_cap = get_cap(mdsc, caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
+ cap = *new_cap;
+ *new_cap = NULL;
cap->issued = 0;
cap->implemented = 0;
@@ -553,40 +683,54 @@ retry:
cap->ci = ci;
__insert_cap_node(ci, cap);
- /* clear out old exporting info? (i.e. on cap import) */
- if (ci->i_cap_exporting_mds == mds) {
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
- }
-
/* add to session cap list */
cap->session = session;
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
+ atomic64_inc(&mdsc->metric.total_caps);
spin_unlock(&session->s_cap_lock);
- } else if (new_cap)
- ceph_put_cap(mdsc, new_cap);
+ } else {
+ spin_lock(&session->s_cap_lock);
+ list_move_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ if (cap->cap_gen < gen)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+
+ /*
+ * auth mds of the inode changed. we received the cap export
+ * message, but still haven't received the cap import message.
+ * handle_cap_export() updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+ * a message that was send before the cap import message. So
+ * don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != cap_id);
+ seq = cap->seq;
+ mseq = cap->mseq;
+ issued |= cap->issued;
+ flags |= CEPH_CAP_FLAG_AUTH;
+ }
+ }
- if (!ci->i_snap_realm) {
+ if (!ci->i_snap_realm ||
+ ((flags & CEPH_CAP_FLAG_AUTH) &&
+ realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
/*
* add this inode to the appropriate snap realm
*/
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
- if (realm) {
- ceph_get_snap_realm(mdsc, realm);
- spin_lock(&realm->inodes_with_caps_lock);
- ci->i_snap_realm = realm;
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- spin_unlock(&realm->inodes_with_caps_lock);
- } else {
- pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
- realmino);
- WARN_ON(!realm);
- }
+ if (realm)
+ ceph_change_snap_realm(inode, realm);
+ else
+ WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
+ __func__, realmino, ci->i_vino.ino,
+ ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
}
__check_cap_issue(ci, cap, issued);
@@ -599,47 +743,40 @@ retry:
actual_wanted = __ceph_caps_wanted(ci);
if ((wanted & ~actual_wanted) ||
(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
- dout(" issued %s, mds wanted %s, actual %s, queueing\n",
- ceph_cap_string(issued), ceph_cap_string(wanted),
- ceph_cap_string(actual_wanted));
+ doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
__cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
- if (ci->i_auth_cap == NULL ||
- ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ if (!ci->i_auth_cap ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
+ if (ci->i_auth_cap &&
+ ci->i_auth_cap->session != cap->session)
+ change_auth_cap_ses(ci, cap->session);
ci->i_auth_cap = cap;
- } else if (ci->i_auth_cap == cap) {
- ci->i_auth_cap = NULL;
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n", inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ cap->mds_wanted = wanted;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ } else {
+ WARN_ON(ci->i_auth_cap == cap);
}
- dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
- inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
- ceph_cap_string(issued|cap->issued), seq, mds);
+ doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- if (mseq > cap->mseq)
+ if (ceph_seq_cmp(mseq, cap->mseq) > 0)
cap->mds_wanted = wanted;
else
cap->mds_wanted |= wanted;
cap->seq = seq;
cap->issue_seq = seq;
cap->mseq = mseq;
- cap->cap_gen = session->s_cap_gen;
-
- if (fmode >= 0)
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
+ cap->cap_gen = gen;
wake_up_all(&ci->i_cap_wq);
- return 0;
}
/*
@@ -649,18 +786,18 @@ retry:
*/
static int __cap_is_valid(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
+ struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_gen_ttl_lock);
- gen = cap->session->s_cap_gen;
+ gen = atomic_read(&cap->session->s_cap_gen);
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
- dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
- cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -674,7 +811,9 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -684,8 +823,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
+ ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -728,16 +867,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
*/
static void __touch_cap(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
struct ceph_mds_session *s = cap->session;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
spin_lock(&s->s_cap_lock);
- if (s->s_cap_iterator == NULL) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
- s->s_mds);
+ if (!s->s_cap_iterator) {
+ doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
+ ceph_vinop(inode), cap, s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
- dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
+ inode, ceph_vinop(inode), cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -749,15 +890,16 @@ static void __touch_cap(struct ceph_cap *cap)
*/
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct rb_node *p;
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask %p snap issued %s"
- " (mask %s)\n", &ci->vfs_inode,
- ceph_cap_string(have),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(mask));
return 1;
}
@@ -766,10 +908,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask %p cap %p issued %s"
- " (mask %s)\n", &ci->vfs_inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch)
__touch_cap(cap);
return 1;
@@ -778,10 +920,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask %p combo issued %s"
- " (mask %s)\n", &ci->vfs_inode,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch) {
struct rb_node *q;
@@ -793,7 +935,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
ci_node);
if (!__cap_is_valid(cap))
continue;
- __touch_cap(cap);
+ if (cap->issued & mask)
+ __touch_cap(cap);
}
}
return 1;
@@ -803,6 +946,20 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
return 0;
}
+int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+ int r;
+
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ if (r)
+ ceph_update_cap_hit(&fsc->mdsc->metric);
+ else
+ ceph_update_cap_mis(&fsc->mdsc->metric);
+ return r;
+}
+
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
@@ -814,26 +971,13 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (cap != ocap && __cap_is_valid(cap) &&
+ if (cap != ocap &&
(cap->implemented & ~cap->issued & mask))
return 1;
}
return 0;
}
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
-{
- struct inode *inode = &ci->vfs_inode;
- int ret;
-
- spin_lock(&ci->i_ceph_lock);
- ret = __ceph_caps_revoking_other(ci, NULL, mask);
- spin_unlock(&ci->i_ceph_lock);
- dout("ceph_caps_revoking %p %s = %d\n", inode,
- ceph_cap_string(mask), ret);
- return ret;
-}
-
int __ceph_caps_used(struct ceph_inode_info *ci)
{
int used = 0;
@@ -841,32 +985,104 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_PIN;
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
- if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+ if (ci->i_rdcache_ref ||
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
+ if (ci->i_fx_ref)
+ used |= CEPH_CAP_FILE_EXCL;
return used;
}
+#define FMODE_WAIT_BIAS 1000
+
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+ const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+ const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+ const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
+ unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+ unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ int want = 0;
+
+ /* use used_cutoff here, to keep dir's wanted caps longer */
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+ time_after(ci->i_last_rd, used_cutoff))
+ want |= CEPH_CAP_ANY_SHARED;
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+ time_after(ci->i_last_wr, used_cutoff)) {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ want |= CEPH_CAP_ANY_DIR_OPS;
+ }
+
+ if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+ want |= CEPH_CAP_PIN;
+
+ return want;
+ } else {
+ int bits = 0;
+
+ if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_rd, used_cutoff))
+ bits |= 1 << RD_SHIFT;
+ } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+ bits |= 1 << RD_SHIFT;
+ }
+
+ if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+ if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+ time_after(ci->i_last_wr, used_cutoff))
+ bits |= 1 << WR_SHIFT;
+ } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+ bits |= 1 << WR_SHIFT;
+ }
+
+ /* check lazyio only when read/write is wanted */
+ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+ ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+ bits |= 1 << LAZY_SHIFT;
+
+ return bits ? ceph_caps_for_mode(bits >> 1) : 0;
+ }
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ /* we want EXCL if holding caps of dir ops */
+ if (w & CEPH_CAP_ANY_DIR_OPS)
+ w |= CEPH_CAP_FILE_EXCL;
+ } else {
+ /* we want EXCL if dirty data */
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL;
+ }
+ return w;
}
/*
* Return caps we have registered with the MDS(s) as 'wanted'.
*/
-int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
{
struct ceph_cap *cap;
struct rb_node *p;
@@ -874,19 +1090,26 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (!__cap_is_valid(cap))
+ if (check && !__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+int ceph_is_any_caps(struct inode *inode)
{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_is_any_real_caps(ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ret;
}
/*
@@ -895,324 +1118,490 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
* caller should hold i_ceph_lock.
* caller will not hold session s_mutex if called from destroy_inode.
*/
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_inode_info *ci = cap->ci;
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc;
int removed = 0;
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ /* 'ci' being NULL means the remove have already occurred */
+ if (!ci) {
+ doutc(cl, "inode is NULL\n");
+ return;
+ }
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
+
+ mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
+
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
/* remove from session list */
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
- dout("__ceph_remove_cap delaying %p removal from session %p\n",
- cap, cap->session);
+ doutc(cl, "delaying %p removal from session %p\n", cap,
+ cap->session);
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
+ atomic64_dec(&mdsc->metric.total_caps);
cap->session = NULL;
removed = 1;
}
/* protect backpointer with s_cap_lock: see iterate_session_caps */
cap->ci = NULL;
- spin_unlock(&session->s_cap_lock);
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == atomic_read(&session->s_cap_gen))) {
+ cap->queue_release = 1;
+ if (removed) {
+ __ceph_queue_cap_release(session, cap);
+ removed = 0;
+ }
+ } else {
+ cap->queue_release = 0;
+ }
+ cap->cap_ino = ci->i_vino.ino;
+
+ spin_unlock(&session->s_cap_lock);
if (removed)
ceph_put_cap(mdsc, cap);
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm_counter++;
- ci->i_snap_realm = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
- }
- if (!__ceph_is_any_real_caps(ci))
+ if (!__ceph_is_any_real_caps(ci)) {
+ /* when reconnect denied, we remove session caps forcibly,
+ * i_wr_ref can be non-zero. If there are ongoing write,
+ * keep i_snap_realm.
+ */
+ if (ci->i_wr_ref == 0 && ci->i_snap_realm)
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
+
__cap_delay_cancel(mdsc, ci);
+ }
}
-/*
- * Build and send a cap message to the given MDS.
- *
- * Caller should be holding s_mutex.
- */
-static int send_cap_msg(struct ceph_mds_session *session,
- u64 ino, u64 cid, int op,
- int caps, int wanted, int dirty,
- u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
- u64 size, u64 max_size,
- struct timespec *mtime, struct timespec *atime,
- u64 time_warp_seq,
- kuid_t uid, kgid_t gid, umode_t mode,
- u64 xattr_version,
- struct ceph_buffer *xattrs_buf,
- u64 follows)
+void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release)
{
- struct ceph_mds_caps *fc;
- struct ceph_msg *msg;
-
- dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u mseq %u follows %lld size %llu/%llu"
- " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
- cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
- ceph_cap_string(dirty),
- seq, issue_seq, mseq, follows, size, max_size,
- xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_fs_client *fsc;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
+ /* 'ci' being NULL means the remove have already occurred */
+ if (!ci) {
+ doutc(mdsc->fsc->client, "inode is NULL\n");
+ return;
+ }
- msg->hdr.tid = cpu_to_le64(flush_tid);
+ lockdep_assert_held(&ci->i_ceph_lock);
- fc = msg->front.iov_base;
- memset(fc, 0, sizeof(*fc));
+ fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
+ WARN_ON_ONCE(ci->i_auth_cap == cap &&
+ !list_empty(&ci->i_dirty_item) &&
+ !fsc->blocklisted &&
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
- fc->cap_id = cpu_to_le64(cid);
- fc->op = cpu_to_le32(op);
- fc->seq = cpu_to_le32(seq);
- fc->issue_seq = cpu_to_le32(issue_seq);
- fc->migrate_seq = cpu_to_le32(mseq);
- fc->caps = cpu_to_le32(caps);
- fc->wanted = cpu_to_le32(wanted);
- fc->dirty = cpu_to_le32(dirty);
- fc->ino = cpu_to_le64(ino);
- fc->snap_follows = cpu_to_le64(follows);
-
- fc->size = cpu_to_le64(size);
- fc->max_size = cpu_to_le64(max_size);
- if (mtime)
- ceph_encode_timespec(&fc->mtime, mtime);
- if (atime)
- ceph_encode_timespec(&fc->atime, atime);
- fc->time_warp_seq = cpu_to_le32(time_warp_seq);
-
- fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
- fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
- fc->mode = cpu_to_le32(mode);
-
- fc->xattr_version = cpu_to_le64(xattr_version);
- if (xattrs_buf) {
- msg->middle = ceph_buffer_get(xattrs_buf);
- fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
- }
-
- ceph_con_send(&session->s_con, msg);
- return 0;
+ __ceph_remove_cap(cap, queue_release);
}
-void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
+struct cap_msg_args {
+ struct ceph_mds_session *session;
+ u64 ino, cid, follows;
+ u64 flush_tid, oldest_flush_tid, size, max_size;
+ u64 xattr_version;
+ u64 change_attr;
+ struct ceph_buffer *xattr_buf;
+ struct ceph_buffer *old_xattr_buf;
+ struct timespec64 atime, mtime, ctime, btime;
+ int op, caps, wanted, dirty;
+ u32 seq, issue_seq, mseq, time_warp_seq;
+ u32 flags;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ bool inline_data;
+ bool wake;
+ bool encrypted;
+ u32 fscrypt_auth_len;
+ u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
+};
+
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- struct ceph_mds_cap_item *item;
+ struct ceph_mds_caps *fc;
+ void *p;
+ struct ceph_mds_client *mdsc = arg->session->s_mdsc;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+
+ doutc(mdsc->fsc->client,
+ "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
+ " tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n",
+ ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
+
+ msg->hdr.version = cpu_to_le16(12);
+ msg->hdr.tid = cpu_to_le64(arg->flush_tid);
- spin_lock(&session->s_cap_lock);
- BUG_ON(!session->s_num_cap_releases);
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
-
- dout(" adding %llx release to mds%d msg %p (%d left)\n",
- ino, session->s_mds, msg, session->s_num_cap_releases);
-
- BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
- head = msg->front.iov_base;
- le32_add_cpu(&head->num, 1);
- item = msg->front.iov_base + msg->front.iov_len;
- item->ino = cpu_to_le64(ino);
- item->cap_id = cpu_to_le64(cap_id);
- item->migrate_seq = cpu_to_le32(migrate_seq);
- item->seq = cpu_to_le32(issue_seq);
-
- session->s_num_cap_releases--;
-
- msg->front.iov_len += sizeof(*item);
- if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
- dout(" release msg %p full\n", msg);
- list_move_tail(&msg->list_head, &session->s_cap_releases_done);
- } else {
- dout(" release msg %p at %d/%d (%d)\n", msg,
- (int)le32_to_cpu(head->num),
- (int)CEPH_CAPS_PER_RELEASE,
- (int)msg->front.iov_len);
+ fc = msg->front.iov_base;
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(arg->cid);
+ fc->op = cpu_to_le32(arg->op);
+ fc->seq = cpu_to_le32(arg->seq);
+ fc->issue_seq = cpu_to_le32(arg->issue_seq);
+ fc->migrate_seq = cpu_to_le32(arg->mseq);
+ fc->caps = cpu_to_le32(arg->caps);
+ fc->wanted = cpu_to_le32(arg->wanted);
+ fc->dirty = cpu_to_le32(arg->dirty);
+ fc->ino = cpu_to_le64(arg->ino);
+ fc->snap_follows = cpu_to_le64(arg->follows);
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (arg->encrypted)
+ fc->size = cpu_to_le64(round_up(arg->size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ else
+#endif
+ fc->size = cpu_to_le64(arg->size);
+ fc->max_size = cpu_to_le64(arg->max_size);
+ ceph_encode_timespec64(&fc->mtime, &arg->mtime);
+ ceph_encode_timespec64(&fc->atime, &arg->atime);
+ ceph_encode_timespec64(&fc->ctime, &arg->ctime);
+ fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
+
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
+ fc->mode = cpu_to_le32(arg->mode);
+
+ fc->xattr_version = cpu_to_le64(arg->xattr_version);
+ if (arg->xattr_buf) {
+ msg->middle = ceph_buffer_get(arg->xattr_buf);
+ fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
}
- spin_unlock(&session->s_cap_lock);
+
+ p = fc + 1;
+ /* flock buffer size (version 2) */
+ ceph_encode_32(&p, 0);
+ /* inline version (version 4) */
+ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
+ /* inline data size */
+ ceph_encode_32(&p, 0);
+ /*
+ * osd_epoch_barrier (version 5)
+ * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
+ * case it was recently changed
+ */
+ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
+ /* oldest_flush_tid (version 6) */
+ ceph_encode_64(&p, arg->oldest_flush_tid);
+
+ /*
+ * caller_uid/caller_gid (version 7)
+ *
+ * Currently, we don't properly track which caller dirtied the caps
+ * last, and force a flush of them when there is a conflict. For now,
+ * just set this to 0:0, to emulate how the MDS has worked up to now.
+ */
+ ceph_encode_32(&p, 0);
+ ceph_encode_32(&p, 0);
+
+ /* pool namespace (version 8) (mds always ignores this) */
+ ceph_encode_32(&p, 0);
+
+ /* btime and change_attr (version 9) */
+ ceph_encode_timespec64(p, &arg->btime);
+ p += sizeof(struct ceph_timespec);
+ ceph_encode_64(&p, arg->change_attr);
+
+ /* Advisory flags (version 10) */
+ ceph_encode_32(&p, arg->flags);
+
+ /* dirstats (version 11) - these are r/o on the client */
+ ceph_encode_64(&p, 0);
+ ceph_encode_64(&p, 0);
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ /*
+ * fscrypt_auth and fscrypt_file (version 12)
+ *
+ * fscrypt_auth holds the crypto context (if any). fscrypt_file
+ * tracks the real i_size as an __le64 field (and we use a rounded-up
+ * i_size in the traditional size field).
+ */
+ ceph_encode_32(&p, arg->fscrypt_auth_len);
+ ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
+ ceph_encode_32(&p, sizeof(__le64));
+ ceph_encode_64(&p, arg->size);
+#else /* CONFIG_FS_ENCRYPTION */
+ ceph_encode_32(&p, 0);
+ ceph_encode_32(&p, 0);
+#endif /* CONFIG_FS_ENCRYPTION */
}
/*
- * Queue cap releases when an inode is dropped from our cache. Since
- * inode is about to be destroyed, there is no need for i_ceph_lock.
+ * Queue cap releases when an inode is dropped from our cache.
*/
-void ceph_queue_caps_release(struct inode *inode)
+void __ceph_remove_caps(struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct rb_node *p;
+ /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
+ * may call __ceph_caps_issued_mask() on a freeing inode. */
+ spin_lock(&ci->i_ceph_lock);
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
- struct ceph_mds_session *session = cap->session;
-
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
p = rb_next(p);
- __ceph_remove_cap(cap);
+ ceph_remove_cap(mdsc, cap, true);
}
+ spin_unlock(&ci->i_ceph_lock);
}
/*
- * Send a cap msg on the given inode. Update our caps state, then
- * drop i_ceph_lock and send the message.
+ * Prepare to send a cap message to an MDS. Update the cap state, and populate
+ * the arg struct with the parameters that will need to be sent. This should
+ * be done under the i_ceph_lock to guard against changes to cap state.
*
* Make note of max_size reported/requested from mds, revoked caps
* that have now been implemented.
- *
- * Make half-hearted attempt ot to invalidate page cache if we are
- * dropping RDCACHE. Note that this will leave behind locked pages
- * that we'll then need to deal with elsewhere.
- *
- * Return non-zero if delayed release, or we experienced an error
- * such that the caller should requeue + retry later.
- *
- * called with i_ceph_lock, then drops it.
- * caller should hold snap_rwsem (read), s_mutex.
*/
-static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
- int op, int used, int want, int retain, int flushing,
- unsigned *pflush_tid)
- __releases(cap->ci->i_ceph_lock)
+static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
+ int op, int flags, int used, int want, int retain,
+ int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
- u64 cap_id = cap->cap_id;
- int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
- struct timespec mtime, atime;
- int wake = 0;
- umode_t mode;
- kuid_t uid;
- kgid_t gid;
- struct ceph_mds_session *session;
- u64 xattr_version = 0;
- struct ceph_buffer *xattr_blob = NULL;
- int delayed = 0;
- u64 flush_tid = 0;
- int i;
- int ret;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int held, revoking;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dropping = cap->issued & ~retain;
- dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
- inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
+ inode, ceph_vinop(inode), cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
- session = cap->session;
-
- /* don't release wanted unless we've waited a bit. */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_min)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- want |= cap->mds_wanted;
- retain |= cap->issued;
- delayed = 1;
- }
- ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
- if (cap->implemented & ~cap->issued) {
- /*
- * Wake up any waiters on wanted -> needed transition.
- * This is due to the weird transition from buffered
- * to sync IO... we need to flush dirty pages _before_
- * allowing sync writes to avoid reordering.
- */
- wake = 1;
- }
+ /*
+ * Wake up any waiters on wanted -> needed transition. This is due to
+ * the weird transition from buffered to sync IO... we need to flush
+ * dirty pages _before_ allowing sync writes to avoid reordering.
+ */
+ arg->wake = cap->implemented & ~cap->issued;
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
- if (flushing) {
- /*
- * assign a tid for flush operations so we can avoid
- * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
- * clean type races. track latest tid for every bit
- * so we can handle flush AxFw, flush Fw, and have the
- * first ack clean Ax.
- */
- flush_tid = ++ci->i_cap_flush_last_tid;
- if (pflush_tid)
- *pflush_tid = flush_tid;
- dout(" cap_flush_tid %d\n", (int)flush_tid);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if (flushing & (1 << i))
- ci->i_cap_flush_tid[i] = flush_tid;
-
- follows = ci->i_head_snapc->seq;
- } else {
- follows = 0;
- }
-
- keep = cap->implemented;
- seq = cap->seq;
- issue_seq = cap->issue_seq;
- mseq = cap->mseq;
- size = inode->i_size;
- ci->i_reported_size = size;
- max_size = ci->i_wanted_max_size;
- ci->i_requested_max_size = max_size;
- mtime = inode->i_mtime;
- atime = inode->i_atime;
- time_warp_seq = ci->i_time_warp_seq;
- uid = inode->i_uid;
- gid = inode->i_gid;
- mode = inode->i_mode;
+ arg->session = cap->session;
+ arg->ino = ceph_vino(inode).ino;
+ arg->cid = cap->cap_id;
+ arg->follows = flushing ? ci->i_head_snapc->seq : 0;
+ arg->flush_tid = flush_tid;
+ arg->oldest_flush_tid = oldest_flush_tid;
+ arg->size = i_size_read(inode);
+ ci->i_reported_size = arg->size;
+ arg->max_size = ci->i_wanted_max_size;
+ if (cap == ci->i_auth_cap) {
+ if (want & CEPH_CAP_ANY_FILE_WR)
+ ci->i_requested_max_size = arg->max_size;
+ else
+ ci->i_requested_max_size = 0;
+ }
if (flushing & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- xattr_blob = ci->i_xattrs.blob;
- xattr_version = ci->i_xattrs.version;
+ arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
+ arg->xattr_version = ci->i_xattrs.version;
+ arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
+ } else {
+ arg->xattr_buf = NULL;
+ arg->old_xattr_buf = NULL;
}
- spin_unlock(&ci->i_ceph_lock);
+ arg->mtime = inode_get_mtime(inode);
+ arg->atime = inode_get_atime(inode);
+ arg->ctime = inode_get_ctime(inode);
+ arg->btime = ci->i_btime;
+ arg->change_attr = inode_peek_iversion_raw(inode);
+
+ arg->op = op;
+ arg->caps = cap->implemented;
+ arg->wanted = want;
+ arg->dirty = flushing;
+
+ arg->seq = cap->seq;
+ arg->issue_seq = cap->issue_seq;
+ arg->mseq = cap->mseq;
+ arg->time_warp_seq = ci->i_time_warp_seq;
+
+ arg->uid = inode->i_uid;
+ arg->gid = inode->i_gid;
+ arg->mode = inode->i_mode;
+
+ arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+ if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
+ !list_empty(&ci->i_cap_snaps)) {
+ struct ceph_cap_snap *capsnap;
+ list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->cap_flush.tid)
+ break;
+ if (capsnap->need_flush) {
+ flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
+ break;
+ }
+ }
+ }
+ arg->flags = flags;
+ arg->encrypted = IS_ENCRYPTED(inode);
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (ci->fscrypt_auth_len &&
+ WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
+ /* Don't set this if it's too big */
+ arg->fscrypt_auth_len = 0;
+ } else {
+ arg->fscrypt_auth_len = ci->fscrypt_auth_len;
+ memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
+ min_t(size_t, ci->fscrypt_auth_len,
+ sizeof(arg->fscrypt_auth)));
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
+}
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
- ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, time_warp_seq,
- uid, gid, mode, xattr_version, xattr_blob,
- follows);
- if (ret < 0) {
- dout("error sending cap msg, must requeue %p\n", inode);
- delayed = 1;
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+ return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
+}
+#else
+#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
+
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+ return CAP_MSG_FIXED_FIELDS;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
+
+/*
+ * Send a cap msg on the given inode.
+ *
+ * Caller should hold snap_rwsem (read), s_mutex.
+ */
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
+{
+ struct ceph_msg *msg;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
+ false);
+ if (!msg) {
+ pr_err_client(cl,
+ "error allocating cap msg: ino (%llx.%llx)"
+ " flushing %s tid %llu, requeuing cap.\n",
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
+ arg->flush_tid);
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(arg->session->s_mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ return;
}
- if (wake)
+ encode_cap_msg(msg, arg);
+ ceph_con_send(&arg->session->s_con, msg);
+ ceph_buffer_put(arg->old_xattr_buf);
+ ceph_buffer_put(arg->xattr_buf);
+ if (arg->wake)
wake_up_all(&ci->i_cap_wq);
+}
- return delayed;
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ struct cap_msg_args arg;
+ struct ceph_msg *msg;
+
+ arg.session = session;
+ arg.ino = ceph_vino(inode).ino;
+ arg.cid = 0;
+ arg.follows = capsnap->follows;
+ arg.flush_tid = capsnap->cap_flush.tid;
+ arg.oldest_flush_tid = oldest_flush_tid;
+
+ arg.size = capsnap->size;
+ arg.max_size = 0;
+ arg.xattr_version = capsnap->xattr_version;
+ arg.xattr_buf = capsnap->xattr_blob;
+ arg.old_xattr_buf = NULL;
+
+ arg.atime = capsnap->atime;
+ arg.mtime = capsnap->mtime;
+ arg.ctime = capsnap->ctime;
+ arg.btime = capsnap->btime;
+ arg.change_attr = capsnap->change_attr;
+
+ arg.op = CEPH_CAP_OP_FLUSHSNAP;
+ arg.caps = capsnap->issued;
+ arg.wanted = 0;
+ arg.dirty = capsnap->dirty;
+
+ arg.seq = 0;
+ arg.issue_seq = 0;
+ arg.mseq = mseq;
+ arg.time_warp_seq = capsnap->time_warp_seq;
+
+ arg.uid = capsnap->uid;
+ arg.gid = capsnap->gid;
+ arg.mode = capsnap->mode;
+
+ arg.inline_data = capsnap->inline_data;
+ arg.flags = 0;
+ arg.wake = false;
+ arg.encrypted = IS_ENCRYPTED(inode);
+
+ /* No fscrypt_auth changes from a capsnap.*/
+ arg.fscrypt_auth_len = 0;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
+ GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
+
+ encode_cap_msg(msg, &arg);
+ ceph_con_send(&arg.session->s_con, msg);
+ return 0;
}
/*
@@ -1222,37 +1611,24 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @again is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
- * Called under i_ceph_lock. Takes s_mutex as needed.
+ * Called under i_ceph_lock.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
+ session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1260,98 +1636,142 @@ retry:
if (capsnap->dirty_pages || capsnap->writing)
break;
- /*
- * if cap writeback already occurred, we should have dropped
- * the capsnap in ceph_put_wrbuffer_cap_refs.
- */
- BUG_ON(capsnap->dirty == 0);
-
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
+ /* should be removed by ceph_try_drop_cap_snap() */
+ BUG_ON(!capsnap->need_flush);
/* only flush each capsnap once */
- if (!again && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ doutc(cl, "already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
+
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf = NULL, *iter;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
+ inode, ceph_vinop(inode), cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
+ if (iter->tid >= first_tid) {
+ cf = iter;
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
- atomic_inc(&capsnap->nref);
- if (!list_empty(&capsnap->flushing_item))
- list_del_init(&capsnap->flushing_item);
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
+ first_tid = cf->tid + 1;
+
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
- capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
+ ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err_client(cl, "error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
+ }
+}
+
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_mds_session *session = NULL;
+ bool need_put = false;
+ int mds;
+
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+ if (psession)
+ session = *psession;
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ doutc(cl, " no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ doutc(cl, " no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
+
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ doutc(cl, " oops, wrong session %p mutex\n", session);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
goto retry;
}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+ // make sure flushsnap messages are sent in proper order.
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ __ceph_flush_snaps(ci, session);
out:
+ spin_unlock(&ci->i_ceph_lock);
+
if (psession)
*psession = session;
- else if (session) {
- mutex_unlock(&session->s_mutex);
+ else
ceph_put_mds_session(session);
- }
-}
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ if (!list_empty(&ci->i_snap_flush_item))
+ need_put = true;
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
- spin_unlock(&ci->i_ceph_lock);
+ if (need_put)
+ iput(inode);
}
/*
@@ -1359,36 +1779,54 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
* Caller is then responsible for calling __mark_inode_dirty with the
* returned flags value.
*/
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int was = ci->i_dirty_caps;
int dirty = 0;
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
- ceph_cap_string(mask), ceph_cap_string(was),
- ceph_cap_string(was | mask));
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (!ci->i_auth_cap) {
+ pr_warn_client(cl, "%p %llx.%llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(mask));
+ return 0;
+ }
+
+ doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask),
+ ceph_cap_string(was), ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
- if (!ci->i_head_snapc)
+ struct ceph_mds_session *session = ci->i_auth_cap->session;
+
+ WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+ swap(ci->i_prealloc_cap_flush, *pcf);
+
+ if (!ci->i_head_snapc) {
+ WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ }
+ doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
+ inode, ceph_vinop(inode), ci->i_head_snapc,
+ ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- if (ci->i_auth_cap)
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
- else
- list_add(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ list_add(&ci->i_dirty_item, &session->s_cap_dirty);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
dirty |= I_DIRTY_SYNC;
}
+ } else {
+ WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
}
BUG_ON(list_empty(&ci->i_dirty_item));
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1398,130 +1836,244 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
return dirty;
}
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+ struct ceph_cap_flush *cf;
+
+ cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+ if (!cf)
+ return NULL;
+
+ cf->is_capsnap = false;
+ return cf;
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+ if (cf)
+ kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ return cf->tid;
+ }
+ return 0;
+}
+
+/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del_init(&cf->g_list);
+ return wake;
+}
+
+static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del_init(&cf->i_list);
+ return wake;
+}
+
/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
- * Called under i_ceph_lock.
+ * Called under i_ceph_lock. Returns the flush tid.
*/
-static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session)
+static u64 __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session, bool wake,
+ u64 *oldest_flush_tid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *cf = NULL;
int flushing;
+ lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
+ BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
- dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
+ doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
ci->i_flushing_caps |= flushing;
ci->i_dirty_caps = 0;
- dout(" inode %p now !dirty\n", inode);
+ doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
+
+ swap(cf, ci->i_prealloc_cap_flush);
+ cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+ cf->tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
+ *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing seq %lld\n", inode,
- ci->i_cap_flush_seq);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) seq %lld\n", inode,
- ci->i_cap_flush_seq);
}
spin_unlock(&mdsc->cap_dirty_lock);
- return flushing;
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
+
+ return cf->tid;
}
/*
* try to invalidate mapping pages without blocking.
*/
static int try_nonblocking_invalidate(struct inode *inode)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
+ ceph_fscache_invalidate(inode, false);
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&ci->i_ceph_lock);
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
- dout("try_nonblocking_invalidate %p success\n", inode);
+ doutc(cl, "%p %llx.%llx success\n", inode,
+ ceph_vinop(inode));
/* save any racing async invalidate some trouble */
ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
- dout("try_nonblocking_invalidate %p failed\n", inode);
+ doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
return -1;
}
+bool __ceph_should_report_size(struct ceph_inode_info *ci)
+{
+ loff_t size = i_size_read(&ci->netfs.inode);
+ /* mds will adjust max size according to the reported size */
+ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
+ return false;
+ if (size >= ci->i_max_size)
+ return true;
+ /* half of previous max_size increment has been used */
+ if (ci->i_max_size > ci->i_reported_size &&
+ (size << 1) >= ci->i_max_size + ci->i_reported_size)
+ return true;
+ return false;
+}
+
/*
* Swiss army knife function to examine currently used and wanted
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
- * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- * cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
+ * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without
+ * further delay.
*/
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
+void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
+ u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
- int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
-
- /* if we are unmounting, flush any unused caps immediately. */
- if (mdsc->stopping)
- is_delayed = 1;
+ bool queue_invalidate = false;
+ bool tried_invalidate = false;
+ bool queue_writeback = false;
+ struct ceph_mds_session *session = NULL;
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
+
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ return;
+ }
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
-
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
- goto retry_locked;
retry:
- spin_lock(&ci->i_ceph_lock);
-retry_locked:
+ /* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
+
+ /* Caps which have active references against them */
used = __ceph_caps_used(ci);
- want = file_wanted | used;
+
+ /*
+ * "issued" represents the current caps that the MDS wants us to have.
+ * "implemented" is the set that we have been granted, and includes the
+ * ones that have not yet been returned to the MDS (the "revoking" set,
+ * usually because they have outstanding references).
+ */
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
- retain = want | CEPH_CAP_PIN;
+ want = file_wanted;
+
+ /* The ones we currently want to retain (may be adjusted below) */
+ retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
- if (want) {
+ if (file_wanted) {
retain |= CEPH_CAP_ANY; /* be greedy */
+ } else if (S_ISDIR(inode->i_mode) &&
+ (issued & CEPH_CAP_FILE_SHARED) &&
+ __ceph_dir_is_complete(ci)) {
+ /*
+ * If a directory is complete, we want to keep
+ * the exclusive cap. So that MDS does not end up
+ * revoking the shared cap on every create/unlink
+ * operation.
+ */
+ if (IS_RDONLY(inode)) {
+ want = CEPH_CAP_ANY_SHARED;
+ } else {
+ want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ }
+ retain |= want;
} else {
+
retain |= CEPH_CAP_ANY_SHARED;
/*
* keep RD only if we didn't have the file open RW,
@@ -1533,348 +2085,438 @@ retry_locked:
}
}
- dout("check_caps %p file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", inode,
- ceph_cap_string(file_wanted),
+ doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
+ "flushing %s issued %s revoking %s retain %s %s%s%s%s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "",
+ (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : "");
/*
* If we no longer need to hold onto old our caps, and we may
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!is_delayed || mdsc->stopping) &&
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+ S_ISREG(inode->i_mode) &&
+ !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
+ inode->i_data.nrpages && /* have cached pages */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %p\n", inode);
+ doutc(cl, "trying to invalidate on %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
- if (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO)) {
- dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
- } else {
- dout("check_caps failed to invalidate pages\n");
- /* we failed to invalidate pages. check these
- caps again later. */
- force_requeue = 1;
- __cap_set_timeouts(mdsc, ci);
- }
+ doutc(cl, "queuing invalidate\n");
+ queue_invalidate = true;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
- tried_invalidate = 1;
- goto retry_locked;
+ tried_invalidate = true;
+ goto retry;
}
- num = 0;
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ int mflags = 0;
+ struct cap_msg_args arg;
+
cap = rb_entry(p, struct ceph_cap, ci_node);
- num++;
/* avoid looping forever */
if (mds >= cap->mds ||
((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
continue;
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap->issued),
- ceph_cap_string(cap_used),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
+ doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
+ if (flags & CHECK_CAPS_FLUSH_FORCE) {
+ doutc(cl, "force to flush caps\n");
+ goto ack;
+ }
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
if (ci->i_wanted_max_size > ci->i_max_size &&
ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
+ doutc(cl, "requesting new max_size\n");
goto ack;
}
/* approaching file_max? */
- if ((inode->i_size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size) {
- dout("i_size approaching max_size\n");
+ if (__ceph_should_report_size(ci)) {
+ doutc(cl, "i_size approaching max_size\n");
goto ack;
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
- }
-
- /* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ doutc(cl, "flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ doutc(cl, "flushing snap caps\n");
+ goto ack;
+ }
}
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
- if ((cap->issued & ~retain) == 0 &&
- cap->mds_wanted == want)
+ if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
- if (is_delayed)
- goto ack;
-
- /* delay? */
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
- time_before(jiffies, ci->i_hold_caps_max)) {
- dout(" delaying issued %s -> %s, wanted %s -> %s\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & retain),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(want));
- delayed++;
- continue;
- }
-
ack:
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
- dout(" skipping %p I_NOFLUSH set\n", inode);
- continue;
- }
+ ceph_put_mds_session(session);
+ session = ceph_get_mds_session(cap->session);
- if (session && session != cap->session) {
- dout("oops, wrong session %p mutex\n", session);
- mutex_unlock(&session->s_mutex);
- session = NULL;
- }
- if (!session) {
- session = cap->session;
- if (mutex_trylock(&session->s_mutex) == 0) {
- dout("inverting session/ino locks on %p\n",
- session);
- spin_unlock(&ci->i_ceph_lock);
- if (took_snap_rwsem) {
- up_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 0;
- }
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- }
- /* take snap_rwsem after session mutex */
- if (!took_snap_rwsem) {
- if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
- dout("inverting snap/in locks on %p\n",
- inode);
- spin_unlock(&ci->i_ceph_lock);
- down_read(&mdsc->snap_rwsem);
- took_snap_rwsem = 1;
- goto retry;
- }
- took_snap_rwsem = 1;
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry;
}
- if (cap == ci->i_auth_cap && ci->i_dirty_caps)
- flushing = __mark_caps_flushing(inode, session);
- else
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+ flushing = ci->i_dirty_caps;
+ flush_tid = __mark_caps_flushing(inode, session, false,
+ &oldest_flush_tid);
+ if (flags & CHECK_CAPS_FLUSH &&
+ list_empty(&session->s_cap_dirty))
+ mflags |= CEPH_CLIENT_CAPS_SYNC;
+ } else {
flushing = 0;
+ flush_tid = 0;
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
mds = cap->mds; /* remember mds, so we don't repeat */
- sent++;
- /* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
- want, retain, flushing, NULL);
+ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used,
+ want, retain, flushing, flush_tid, oldest_flush_tid);
+
+ spin_unlock(&ci->i_ceph_lock);
+ __send_cap(&arg, ci);
+ spin_lock(&ci->i_ceph_lock);
+
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
- /*
- * Reschedule delayed caps release if we delayed anything,
- * otherwise cancel.
- */
- if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
- if (!delayed && !is_delayed)
- __cap_delay_cancel(mdsc, ci);
- else if (!is_delayed || force_requeue)
+ /* periodically re-calculate caps wanted by open files */
+ if (__ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list) &&
+ (file_wanted & ~CEPH_CAP_PIN) &&
+ !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
__cap_delay_requeue(mdsc, ci);
+ }
spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ if (queue_writeback)
+ ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
-
- if (session)
- mutex_unlock(&session->s_mutex);
- if (took_snap_rwsem)
- up_read(&mdsc->snap_rwsem);
}
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
int flushing = 0;
+ u64 flush_tid = 0, oldest_flush_tid = 0;
-retry:
spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
- dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
- goto out;
- }
+retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
- int used = __ceph_caps_used(ci);
- int want = __ceph_caps_wanted(ci);
- int delayed;
+ struct cap_msg_args arg;
+ struct ceph_mds_session *session = cap->session;
- if (!session) {
+ if (session->s_state < CEPH_MDS_SESSION_OPEN) {
spin_unlock(&ci->i_ceph_lock);
- session = cap->session;
- mutex_lock(&session->s_mutex);
- goto retry;
- }
- BUG_ON(session != cap->session);
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
+ }
- flushing = __mark_caps_flushing(inode, session);
+ if (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+ goto retry_locked;
+ }
- /* __send_cap drops i_ceph_lock */
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- cap->issued | cap->implemented, flushing,
- flush_tid);
- if (!delayed)
- goto out_unlocked;
+ flushing = ci->i_dirty_caps;
+ flush_tid = __mark_caps_flushing(inode, session, true,
+ &oldest_flush_tid);
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+ __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
+ spin_unlock(&ci->i_ceph_lock);
+
+ __send_cap(&arg, ci);
+ } else {
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
+ flush_tid = cf->tid;
+ }
+ flushing = ci->i_flushing_caps;
+ spin_unlock(&ci->i_ceph_lock);
}
out:
- spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
- if (session && unlock_session)
- mutex_unlock(&session->s_mutex);
+ *ptid = flush_tid;
return flushing;
}
/*
* Return true if we've flushed caps through the given flush_tid.
*/
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int i, ret = 1;
+ int ret = 1;
spin_lock(&ci->i_ceph_lock);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((ci->i_flushing_caps & (1 << i)) &&
- ci->i_cap_flush_tid[i] <= tid) {
- /* still flushing this bit */
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ if (cf->tid <= flush_tid)
ret = 0;
- break;
- }
+ }
spin_unlock(&ci->i_ceph_lock);
return ret;
}
/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
+ * flush the mdlog and wait for any unsafe requests to complete.
*/
-static void sync_write_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- /* set upper bound as _last_ entry in chain */
- req = list_entry(head->prev, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
+ /*
+ * Trigger to flush the journal logs in all the relevant MDSes
+ * manually, or in the worst case we must wait at most 5 seconds
+ * to wait the journal logs to be flushed by the MDSes periodically.
+ */
+ if (req1 || req2) {
+ struct ceph_mds_request *req;
+ struct ceph_mds_session **sessions;
+ struct ceph_mds_session *s;
+ unsigned int max_sessions;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
+ if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
+ err = -ENOMEM;
+ goto out;
+ }
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
+ if (req1) {
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
+ r_unsafe_dir_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ if (req2) {
+ list_for_each_entry(req, &ci->i_unsafe_iops,
+ r_unsafe_target_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ spin_unlock(&ci->i_unsafe_lock);
+
+ /* the auth MDS */
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap) {
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&mdsc->mutex);
+
+ /* send flush mdlog request to MDSes */
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(s);
+ }
+ }
+ kfree(sessions);
+ }
+
+ doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
+ ceph_vinop(inode), req1 ? req1->r_tid : 0ULL,
+ req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
+ if (ret)
+ err = -EIO;
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ }
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_entry(head->next, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
out:
- spin_unlock(&ci->i_unsafe_lock);
+ if (req1)
+ ceph_mdsc_put_request(req1);
+ if (req2)
+ ceph_mdsc_put_request(req2);
+ return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
- int ret;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ u64 flush_tid;
+ int ret, err;
int dirty;
- dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
+ doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "");
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret < 0)
- return ret;
- mutex_lock(&inode->i_mutex);
+ ret = file_write_and_wait_range(file, start, end);
+ if (datasync)
+ goto out;
+
+ ret = ceph_wait_on_async_create(inode);
+ if (ret)
+ goto out;
+
+ dirty = try_flush_caps(inode, &flush_tid);
+ doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
- dirty = try_flush_caps(inode, NULL, &flush_tid);
- dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/*
* only wait on non-file metadata writeback (the mds
* can recover size and mtime, so we don't need to
* wait for that)
*/
- if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
- dout("fsync waiting for flush_tid %u\n", flush_tid);
- ret = wait_event_interruptible(ci->i_cap_wq,
- caps_are_flushed(inode, flush_tid));
+ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+ err = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
}
- dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
- mutex_unlock(&inode->i_mutex);
+ if (err < 0)
+ ret = err;
+
+ err = file_check_and_advance_wb_err(file);
+ if (err < 0)
+ ret = err;
+out:
+ doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "", ret);
return ret;
}
@@ -1887,20 +2529,25 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ u64 flush_tid;
int err = 0;
int dirty;
- int wait = wbc->sync_mode == WB_SYNC_ALL;
+ int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
- dout("write_inode %p wait=%d\n", inode, wait);
+ doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
+ ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
+ dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_fs_client(inode->i_sb)->mdsc;
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
@@ -1910,34 +2557,137 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
+{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_cap *cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+ u64 first_tid = 0;
+ u64 last_snap_flush = 0;
+
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+
+ list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->is_capsnap) {
+ last_snap_flush = cf->tid;
+ break;
+ }
+ }
+
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
+
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
+ break;
+ }
+
+ first_tid = cf->tid + 1;
+
+ if (!cf->is_capsnap) {
+ struct cap_msg_args arg;
+
+ doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), cap, cf->tid,
+ ceph_cap_string(cf->caps));
+ __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
+ (cf->tid < last_snap_flush ?
+ CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ (cap->issued | cap->implemented),
+ cf->caps, cf->tid, oldest_flush_tid);
+ spin_unlock(&ci->i_ceph_lock);
+ __send_cap(&arg, ci);
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ refcount_inc(&capsnap->nref);
+ spin_unlock(&ci->i_ceph_lock);
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err_client(cl, "error sending cap flushsnap,"
+ " %p %llx.%llx tid %llu follows %llu\n",
+ inode, ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
+
+ ceph_put_cap_snap(capsnap);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ }
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
- struct ceph_cap_snap *capsnap;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
+
+ doutc(cl, "mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
+ if (!(cap && cap->session == session)) {
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+
+
+ /*
+ * if flushing caps were revoked, we re-send the cap flush
+ * in client reconnect stage. This guarantees MDS * processes
+ * the cap flush message before issuing the flushing caps to
+ * other client.
+ */
+ if ((cap->issued & ci->i_flushing_caps) !=
+ ci->i_flushing_caps) {
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
} else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
}
+
spin_unlock(&ci->i_ceph_lock);
}
}
@@ -1945,72 +2695,61 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
+
+ lockdep_assert_held(&session->s_mutex);
- kick_flushing_capsnaps(mdsc, session);
+ doutc(cl, "mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
- dout("kick_flushing_caps mds%d\n", session->s_mds);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int delayed = 0;
+ struct inode *inode = &ci->netfs.inode;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p %s\n", inode,
- cap, ceph_cap_string(ci->i_flushing_caps));
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ if (!(cap && cap->session == session)) {
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ continue;
}
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ }
+ spin_unlock(&ci->i_ceph_lock);
}
}
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
- int delayed = 0;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct inode *inode = &ci->netfs.inode;
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ lockdep_assert_held(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 1);
+ doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(ci->i_flushing_caps));
- if (ci->i_flushing_caps) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- spin_unlock(&ci->i_ceph_lock);
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
}
}
@@ -2018,25 +2757,37 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
ci->i_wr_ref++;
+ }
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(inode);
ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2046,60 +2797,75 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* to (when applicable), and check against max_size here as well.
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
+ *
+ * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
+ * or a negative error code. There are 3 special error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
*/
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff, int *check_max, int *err)
+enum {
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
+};
+
+static int try_get_cap_refs(struct inode *inode, int need, int want,
+ loff_t endoff, int flags, int *got)
{
- struct inode *inode = &ci->vfs_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
int have, implemented;
- int file_wanted;
+ bool snap_rwsem_locked = false;
+
+ doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(want));
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
+again:
spin_lock(&ci->i_ceph_lock);
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) == 0) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- *err = -EBADF;
- ret = 1;
- goto out;
+ if ((flags & CHECK_FILELOCK) &&
+ (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
+ doutc(cl, "%p %llx.%llx error filelock\n", inode,
+ ceph_vinop(inode));
+ ret = -EIO;
+ goto out_unlock;
}
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_lock(&inode->i_mutex);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
__ceph_do_pending_vmtruncate(inode);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_unlock(&inode->i_mutex);
spin_lock(&ci->i_ceph_lock);
}
- if (need & CEPH_CAP_FILE_WR) {
+ have = __ceph_caps_issued(ci, &implemented);
+
+ if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
- *check_max = 1;
- ret = 1;
- }
- goto out;
+ doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
+ inode, ceph_vinop(inode), endoff, ci->i_max_size);
+ if (endoff > ci->i_requested_max_size)
+ ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
+ goto out_unlock;
}
/*
* If a sync write is in progress, we must wait, so that we
* can get a final snapshot value for size+mtime.
*/
if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
- goto out;
+ doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
+ ceph_vinop(inode));
+ goto out_unlock;
}
}
- have = __ceph_caps_issued(ci, &implemented);
if ((have & need) == need) {
/*
@@ -2107,25 +2873,96 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
* on transition from wanted -> needed caps. This is needed
* for WRBUFFER|WR -> WR to avoid a new WR sync write from
* going before a prior buffered writeback happens.
+ *
+ * For RDCACHE|RD -> RD, there is not need to wait and we can
+ * just exclude the revoking caps and force to sync read.
*/
int not = want & ~(have & need);
int revoking = implemented & ~have;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
- if ((revoking & not) == 0) {
- *got = need | (have & want);
- __take_cap_refs(ci, *got);
+ int exclude = revoking & not;
+ doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(not), ceph_cap_string(revoking));
+ if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
+ if (!snap_rwsem_locked &&
+ !ci->i_head_snapc &&
+ (need & CEPH_CAP_FILE_WR)) {
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ /*
+ * we can not call down_read() when
+ * task isn't in TASK_RUNNING state
+ */
+ if (flags & NON_BLOCKING) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = true;
+ goto again;
+ }
+ snap_rwsem_locked = true;
+ }
+ if ((have & want) == want)
+ *got = need | (want & ~exclude);
+ else
+ *got = need;
+ ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
- dout("get_cap_refs %p have %s needed %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
+ int session_readonly = false;
+ int mds_wanted;
+ if (ci->i_auth_cap &&
+ (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
+ struct ceph_mds_session *s = ci->i_auth_cap->session;
+ spin_lock(&s->s_cap_lock);
+ session_readonly = s->s_readonly;
+ spin_unlock(&s->s_cap_lock);
+ }
+ if (session_readonly) {
+ doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ci->i_auth_cap->mds);
+ ret = -EROFS;
+ goto out_unlock;
+ }
+
+ if (ceph_inode_is_shutdown(inode)) {
+ doutc(cl, "%p %llx.%llx inode is shutdown\n",
+ inode, ceph_vinop(inode));
+ ret = -ESTALE;
+ goto out_unlock;
+ }
+ mds_wanted = __ceph_caps_mds_wanted(ci, false);
+ if (need & ~mds_wanted) {
+ doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
+ ret = -EUCLEAN;
+ goto out_unlock;
+ }
+
+ doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(need));
}
-out:
+out_unlock:
+
+ __ceph_touch_fmode(ci, mdsc, flags);
+
spin_unlock(&ci->i_ceph_lock);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
+ if (snap_rwsem_locked)
+ up_read(&mdsc->snap_rwsem);
+
+ if (!ret)
+ ceph_update_cap_mis(&mdsc->metric);
+ else if (ret == 1)
+ ceph_update_cap_hit(&mdsc->metric);
+
+ doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
+ ceph_vinop(inode), ret, ceph_cap_string(*got));
return ret;
}
@@ -2137,21 +2974,61 @@ out:
static void check_max_size(struct inode *inode, loff_t endoff)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int check = 0;
/* do we need to explicitly request a larger max_size? */
spin_lock(&ci->i_ceph_lock);
- if ((endoff >= ci->i_max_size ||
- endoff > (inode->i_size << 1)) &&
- endoff > ci->i_wanted_max_size) {
- dout("write %p at large endoff %llu, req max_size\n",
- inode, endoff);
+ if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
+ doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
+ inode, ceph_vinop(inode), endoff);
ci->i_wanted_max_size = endoff;
- check = 1;
}
+ /* duplicate ceph_check_caps()'s logic */
+ if (ci->i_auth_cap &&
+ (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
+ ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size)
+ check = 1;
spin_unlock(&ci->i_ceph_lock);
if (check)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
+}
+
+static inline int get_used_fmode(int caps)
+{
+ int fmode = 0;
+ if (caps & CEPH_CAP_FILE_RD)
+ fmode |= CEPH_FILE_MODE_RD;
+ if (caps & CEPH_CAP_FILE_WR)
+ fmode |= CEPH_FILE_MODE_WR;
+ return fmode;
+}
+
+int ceph_try_get_caps(struct inode *inode, int need, int want,
+ bool nonblock, int *got)
+{
+ int ret, flags;
+
+ BUG_ON(need & ~CEPH_CAP_FILE_RD);
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+ CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_ANY_DIR_OPS));
+ if (need) {
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+ }
+
+ flags = get_used_fmode(need | want);
+ if (nonblock)
+ flags |= NON_BLOCKING;
+
+ ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+ /* three special error codes */
+ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN)
+ ret = 0;
+ return ret;
}
/*
@@ -2159,25 +3036,146 @@ static void check_max_size(struct inode *inode, loff_t endoff)
* due to a small max_size, make sure we check_max_size (and possibly
* ask the mds) so we don't get hung up indefinitely.
*/
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
- loff_t endoff)
+int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
+ int want, loff_t endoff, int *got)
{
- int check_max, ret, err;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ int ret, _got, flags;
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- check_max = 0;
- err = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want,
- got, endoff,
- &check_max, &err));
- if (err)
- ret = err;
- if (check_max)
- goto retry;
- return ret;
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
+ fi->filp_gen != READ_ONCE(fsc->filp_gen))
+ return -EBADF;
+
+ flags = get_used_fmode(need | want);
+
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (vfs_inode_has_locks(inode))
+ flags |= CHECK_FILELOCK;
+ _got = 0;
+ ret = try_get_cap_refs(inode, need, want, endoff,
+ flags, &_got);
+ WARN_ON_ONCE(ret == -EAGAIN);
+ if (!ret) {
+#ifdef CONFIG_DEBUG_FS
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct cap_wait cw;
+#endif
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+#ifdef CONFIG_DEBUG_FS
+ cw.ino = ceph_ino(inode);
+ cw.tgid = current->tgid;
+ cw.need = need;
+ cw.want = want;
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_add(&cw.list, &mdsc->cap_wait_list);
+ spin_unlock(&mdsc->caps_list_lock);
+#endif
+
+ /* make sure used fmode not timeout */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
+ add_wait_queue(&ci->i_cap_wq, &wait);
+
+ flags |= NON_BLOCKING;
+ while (!(ret = try_get_cap_refs(inode, need, want,
+ endoff, flags, &_got))) {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+
+ remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
+
+#ifdef CONFIG_DEBUG_FS
+ spin_lock(&mdsc->caps_list_lock);
+ list_del(&cw.list);
+ spin_unlock(&mdsc->caps_list_lock);
+#endif
+
+ if (ret == -EAGAIN)
+ continue;
+ }
+
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
+ fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
+ if (ret >= 0 && _got)
+ ceph_put_cap_refs(ci, _got);
+ return -EBADF;
+ }
+
+ if (ret < 0) {
+ if (ret == -EFBIG || ret == -EUCLEAN) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
+ if (ret == -EUCLEAN) {
+ /* session was killed, try renew caps */
+ ret = ceph_renew_caps(inode, flags);
+ if (ret == 0)
+ continue;
+ }
+ return ret;
+ }
+
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ ceph_has_inline_data(ci) &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(inode) > 0) {
+ struct page *page =
+ find_get_page(inode->i_mapping, 0);
+ if (page) {
+ bool uptodate = PageUptodate(page);
+
+ put_page(page);
+ if (uptodate)
+ break;
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
+ }
+ *got = _got;
+ return 0;
+}
+
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
+ int *got)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = file_inode(filp);
+
+ return __ceph_get_caps(inode, fi, need, want, endoff, got);
}
/*
@@ -2187,10 +3185,41 @@ retry:
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
+ ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ if (!capsnap->need_flush &&
+ !capsnap->writing && !capsnap->dirty_pages) {
+ doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
+ ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
+ list_del(&capsnap->ci_item);
+ ceph_put_cap_snap(capsnap);
+ return 1;
+ }
+ return 0;
+}
+
+enum put_cap_refs_mode {
+ PUT_CAP_REFS_SYNC = 0,
+ PUT_CAP_REFS_ASYNC,
+};
+
/*
* Release cap refs.
*
@@ -2200,11 +3229,13 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
* If we are releasing a WR cap (from a sync write), finalize any affected
* cap_snap, and wake up any waiters.
*/
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
+ enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int last = 0, put = 0, flushsnaps = 0, wake = 0;
- struct ceph_cap_snap *capsnap;
+ bool check_flushsnaps = false;
spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
@@ -2215,45 +3246,92 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
+ /* put the ref held by ceph_take_cap_refs() */
put++;
+ check_flushsnaps = true;
}
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
}
- if (had & CEPH_CAP_FILE_WR)
+ if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
+ /*
+ * The Fb caps will always be took and released
+ * together with the Fw caps.
+ */
+ WARN_ON_ONCE(ci->i_wb_ref);
+
last++;
- if (!list_empty(&ci->i_cap_snaps)) {
- capsnap = list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- if (capsnap->writing) {
- capsnap->writing = 0;
- flushsnaps =
- __ceph_finish_cap_snap(ci,
- capsnap);
- wake = 1;
- }
+ check_flushsnaps = true;
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
}
+ /* see comment in __ceph_remove_cap() */
+ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
+ ceph_change_snap_realm(inode, NULL);
}
+ }
+ if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(ci, capsnap))
+ /* put the ref held by ceph_queue_cap_snap() */
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
+ }
spin_unlock(&ci->i_ceph_lock);
- dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
- last ? " last" : "", put ? " put" : "");
+ doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
- if (last && !flushsnaps)
- ceph_check_caps(ci, 0, NULL);
- else if (flushsnaps)
- ceph_flush_snaps(ci);
+ switch (mode) {
+ case PUT_CAP_REFS_SYNC:
+ if (last)
+ ceph_check_caps(ci, 0);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci, NULL);
+ break;
+ case PUT_CAP_REFS_ASYNC:
+ if (last)
+ ceph_queue_check_caps(inode);
+ else if (flushsnaps)
+ ceph_queue_flush_snaps(inode);
+ break;
+ default:
+ break;
+ }
if (wake)
wake_up_all(&ci->i_cap_wq);
- if (put)
+ while (put-- > 0)
iput(inode);
}
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
+}
+
+void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
+}
+
/*
* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
* context. Adjust per-snap dirty page accounting as appropriate.
@@ -2264,236 +3342,390 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
- struct ceph_cap_snap *capsnap = NULL;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_cap_snap *capsnap = NULL, *iter;
+ int put = 0;
+ bool last = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
+ doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
+ inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
+ ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
+ ci->i_wrbuffer_ref_head, last ? " LAST" : "");
} else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = 1;
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->context == snapc) {
+ capsnap = iter;
break;
}
}
- BUG_ON(!found);
+
+ if (!capsnap) {
+ /*
+ * The capsnap should already be removed when removing
+ * auth cap in the case of a forced unmount.
+ */
+ WARN_ON_ONCE(ci->i_auth_cap);
+ goto unlock;
+ }
+
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- if (capsnap->dirty == 0)
- /* cap writeback completed before we created
- * the cap_snap; no FLUSHSNAP is needed */
- drop_capsnap = 1;
- }
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "",
- drop_capsnap ? " (drop capsnap)" : "");
- if (drop_capsnap) {
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
}
+ doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ complete_capsnap ? " (complete capsnap)" : "");
}
+unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
+ ceph_check_caps(ci, 0);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
}
- if (drop_capsnap)
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0) {
iput(inode);
+ }
+}
+
+/*
+ * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
+ */
+static void invalidate_aliases(struct inode *inode)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct dentry *dn, *prev = NULL;
+
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+ d_prune_aliases(inode);
+ /*
+ * For non-directory inode, d_find_alias() only returns
+ * hashed dentry. After calling d_invalidate(), the
+ * dentry becomes unhashed.
+ *
+ * For directory inode, d_find_alias() can return
+ * unhashed dentry. But directory inode should have
+ * one alias at most.
+ */
+ while ((dn = d_find_alias(inode))) {
+ if (dn == prev) {
+ dput(dn);
+ break;
+ }
+ d_invalidate(dn);
+ if (prev)
+ dput(prev);
+ prev = dn;
+ }
+ if (prev)
+ dput(prev);
}
+struct cap_extra_info {
+ struct ceph_string *pool_ns;
+ /* inline data */
+ u64 inline_version;
+ void *inline_data;
+ u32 inline_len;
+ /* dirstat */
+ bool dirstat_valid;
+ u64 nfiles;
+ u64 nsubdirs;
+ u64 change_attr;
+ /* currently issued */
+ int issued;
+ struct timespec64 btime;
+ u8 *fscrypt_auth;
+ u32 fscrypt_auth_len;
+ u64 fscrypt_file_size;
+};
+
/*
* Handle a cap GRANT message from the MDS. (Note that a GRANT may
* actually be a revocation if it specifies a smaller cap set.)
*
* caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
*/
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct inode *inode,
struct ceph_mds_session *session,
struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(ci->i_ceph_lock)
+ struct ceph_mds_caps *grant,
+ struct ceph_buffer *xattr_buf,
+ struct cap_extra_info *extra_info)
+ __releases(ci->i_ceph_lock)
+ __releases(session->s_mdsc->snap_rwsem)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
+ int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
- struct timespec mtime, atime, ctime;
- int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int revoked_rdcache = 0;
- int queue_invalidate = 0;
-
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, mds, seq, ceph_cap_string(newcaps));
- dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- inode->i_size);
+ unsigned char check_caps = 0;
+ bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen);
+ bool wake = false;
+ bool writeback = false;
+ bool queue_trunc = false;
+ bool queue_invalidate = false;
+ bool deleted_inode = false;
+ bool fill_inline = false;
+ bool revoke_wait = false;
+ int flags = 0;
+
+ /*
+ * If there is at least one crypto block then we'll trust
+ * fscrypt_file_size. If the real length of the file is 0, then
+ * ignore it (it has probably been truncated down to 0 by the MDS).
+ */
+ if (IS_ENCRYPTED(inode) && size)
+ size = extra_info->fscrypt_file_size;
+
+ doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
+ ceph_vinop(inode), cap, session->s_mds, seq,
+ ceph_cap_string(newcaps));
+ doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
+ max_size, i_size_read(inode));
+
/*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- !ci->i_wrbuffer_ref) {
- if (try_nonblocking_invalidate(inode) == 0) {
- revoked_rdcache = 1;
- } else {
+ !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
+ if (try_nonblocking_invalidate(inode)) {
/* there were locked pages.. invalidate later
in a separate thread. */
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
}
- /* side effects now are allowed */
+ if (was_stale)
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
- cap->cap_gen = session->s_cap_gen;
+ /* side effects now are allowed */
+ cap->cap_gen = atomic_read(&session->s_cap_gen);
+ cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(grant->mode);
+ inode_set_max_iversion_raw(inode, extra_info->change_attr);
+
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ umode_t mode = le32_to_cpu(grant->mode);
+
+ if (inode_wrong_type(inode, mode))
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ else
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ ci->i_btime = extra_info->btime;
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
+ memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
+ ci->fscrypt_auth_len))
+ pr_warn_ratelimited_client(cl,
+ "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+ ci->fscrypt_auth_len,
+ extra_info->fscrypt_auth_len);
+#endif
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ if ((newcaps & CEPH_CAP_LINK_SHARED) &&
+ (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
+ if (inode->i_nlink == 0)
+ deleted_inode = true;
+ }
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+ if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ grant->xattr_len) {
int len = le32_to_cpu(grant->xattr_len);
u64 version = le64_to_cpu(grant->xattr_version);
if (version > ci->i_xattrs.version) {
- dout(" got new xattrs v%llu on %p len %d\n",
- version, inode, len);
+ doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
+ version, inode, ceph_vinop(inode), len);
if (ci->i_xattrs.blob)
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
ci->i_xattrs.version = version;
+ ceph_forget_all_cached_acls(inode);
+ ceph_security_invalidate_secctx(inode);
}
}
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
- /* max size increase? */
- if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ struct timespec64 mtime, atime, ctime;
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec64(&mtime, &grant->mtime);
+ ceph_decode_timespec64(&atime, &grant->atime);
+ ceph_decode_timespec64(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, extra_info->issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
+ ci->i_files = extra_info->nfiles;
+ ci->i_subdirs = extra_info->nsubdirs;
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool ||
+ extra_info->pool_ns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ extra_info->pool_ns = old_ns;
+
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ }
+
+ if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
+ if (max_size != ci->i_max_size) {
+ doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
+ max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = true;
}
- wake = 1;
}
/* check cap bits */
wanted = __ceph_caps_wanted(ci);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout(" my wanted = %s, used = %s, dirty %s\n",
- ceph_cap_string(wanted),
- ceph_cap_string(used),
- ceph_cap_string(dirty));
- if (wanted != le32_to_cpu(grant->wanted)) {
- dout("mds wanted %s -> %s\n",
- ceph_cap_string(le32_to_cpu(grant->wanted)),
- ceph_cap_string(wanted));
- /* imported cap may not have correct mds_wanted */
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
- check_caps = 1;
- }
-
- cap->seq = seq;
+ doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted), ceph_cap_string(used),
+ ceph_cap_string(dirty));
- /* file layout may have changed */
- ci->i_layout = grant->layout;
+ if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
+ (wanted & ~(cap->mds_wanted | newcaps))) {
+ /*
+ * If mds is importing cap, prior cap messages that update
+ * 'wanted' may get dropped by mds (migrate seq mismatch).
+ *
+ * We don't send cap message to update 'wanted' if what we
+ * want are already issued. If mds revokes caps, cap message
+ * that releases caps also tells mds what we want. But if
+ * caps got revoked by mds forcedly (session stale). We may
+ * haven't told mds what we want.
+ */
+ check_caps = 1;
+ }
/* revocation, grant, or no-op? */
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
- dout("revocation: %s -> %s (revoking %s)\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps),
- ceph_cap_string(revoking));
- if (revoking & used & CEPH_CAP_FILE_BUFFER)
- writeback = 1; /* initiate writeback; will delay ack */
- else if (revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
- queue_invalidate)
- ; /* do nothing yet, invalidation will be queued */
- else if (cap == ci->i_auth_cap)
+ doutc(cl, "revocation: %s -> %s (revoking %s)\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
+ ceph_cap_string(revoking));
+ if (S_ISREG(inode->i_mode) &&
+ (revoking & used & CEPH_CAP_FILE_BUFFER)) {
+ writeback = true; /* initiate writeback; will delay ack */
+ revoke_wait = true;
+ } else if (queue_invalidate &&
+ revoking == CEPH_CAP_FILE_CACHE &&
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) {
+ revoke_wait = true; /* do nothing yet, invalidation will be queued */
+ } else if (cap == ci->i_auth_cap) {
check_caps = 1; /* check auth cap only */
- else
+ } else {
check_caps = 2; /* check all caps */
+ }
+ /* If there is new caps, try to wake up the waiters */
+ if (~cap->issued & newcaps)
+ wake = true;
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
- dout("caps unchanged: %s -> %s\n",
- ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ doutc(cl, "caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
} else {
- dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps));
+ doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
/* non-auth MDS is revoking the newly grant caps ? */
if (cap == ci->i_auth_cap &&
__ceph_caps_revoking_other(ci, cap, newcaps))
@@ -2503,11 +3735,53 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
* pending revocation */
- wake = 1;
+ wake = true;
}
BUG_ON(cap->issued & ~cap->implemented);
+ /* don't let check_caps skip sending a response to MDS for revoke msgs */
+ if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
+ cap->mds_wanted = 0;
+ flags |= CHECK_CAPS_FLUSH_FORCE;
+ if (cap == ci->i_auth_cap)
+ check_caps = 1; /* check auth cap only */
+ else
+ check_caps = 2; /* check all caps */
+ }
+
+ if (extra_info->inline_version > 0 &&
+ extra_info->inline_version >= ci->i_inline_version) {
+ ci->i_inline_version = extra_info->inline_version;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
+ fill_inline = true;
+ }
+
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
+
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
+
+ ceph_kick_flushing_inode_caps(session, ci);
+ }
+ up_read(&session->s_mdsc->snap_rwsem);
+ }
spin_unlock(&ci->i_ceph_lock);
+
+ if (fill_inline)
+ ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
+ extra_info->inline_len);
+
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2517,16 +3791,16 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
+ if (deleted_inode)
+ invalidate_aliases(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
+ mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
- session);
+ ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
- else
- mutex_unlock(&session->s_mutex);
+ ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL);
}
/*
@@ -2540,47 +3814,83 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_cap_flush *cf, *tmp_cf;
+ LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
- int drop = 0;
- int i;
+ bool drop = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((dirty & (1 << i)) &&
- flush_tid == ci->i_cap_flush_tid[i])
- cleaned |= 1 << i;
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
+ /* Is this the one that was flushed? */
+ if (cf->tid == flush_tid)
+ cleaned = cf->caps;
- dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
- " flushing %s -> %s\n",
- inode, session->s_mds, seq, ceph_cap_string(dirty),
- ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+ /* Is this a capsnap? */
+ if (cf->is_capsnap)
+ continue;
+
+ if (cf->tid <= flush_tid) {
+ /*
+ * An earlier or current tid. The FLUSH_ACK should
+ * represent a superset of this flush's caps.
+ */
+ wake_ci |= __detach_cap_flush_from_ci(ci, cf);
+ list_add_tail(&cf->i_list, &to_remove);
+ } else {
+ /*
+ * This is a later one. Any caps in it are still dirty
+ * so don't count them as cleaned.
+ */
+ cleaned &= ~cf->caps;
+ if (!cleaned)
+ break;
+ }
+ }
+
+ doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
+ inode, ceph_vinop(inode), session->s_mds, seq,
+ ceph_cap_string(dirty), ceph_cap_string(cleaned),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
- if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ if (list_empty(&to_remove) && !cleaned)
goto out;
ci->i_flushing_caps &= ~cleaned;
spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, i_list)
+ wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
+
if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ struct inode *inode =
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->netfs.inode;
+ doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
+ session->s_mds, inode, ceph_vinop(inode));
+ }
+ }
mdsc->num_cap_flushing--;
- wake_up_all(&mdsc->cap_flushing_wq);
- dout(" inode %p now !flushing\n", inode);
+ doutc(cl, " %p %llx.%llx now !flushing\n", inode,
+ ceph_vinop(inode));
if (ci->i_dirty_caps == 0) {
- dout(" inode %p now clean\n", inode);
+ doutc(cl, " %p %llx.%llx now clean\n", inode,
+ ceph_vinop(inode));
BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
- if (ci->i_wrbuffer_ref_head == 0) {
+ drop = true;
+ if (ci->i_wr_ref == 0 &&
+ ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2590,14 +3900,65 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&ci->i_ceph_lock);
+
+ while (!list_empty(&to_remove)) {
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, i_list);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
if (drop)
iput(inode);
}
+void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ bool ret;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
+ inode, ceph_vinop(inode), ci);
+
+ list_del_init(&capsnap->ci_item);
+ ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
+ if (wake_ci)
+ *wake_ci = ret;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
+ if (wake_mdsc)
+ *wake_mdsc = ret;
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
+ __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
+}
+
/*
* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
* throw away our cap_snap.
@@ -2609,39 +3970,46 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- int drop = 0;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
- dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
- inode, ci, session->s_mds, follows);
+ doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
+ ceph_vinop(inode), ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
- dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->follows == follows) {
+ if (iter->cap_flush.tid != flush_tid) {
+ doutc(cl, " cap_snap %p follows %lld "
+ "tid %lld != %lld\n", iter,
+ follows, flush_tid,
+ iter->cap_flush.tid);
break;
}
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- drop = 1;
+ capsnap = iter;
break;
} else {
- dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
+ doutc(cl, " skipping cap_snap %p follows %lld\n",
+ iter, iter->follows);
}
}
+ if (capsnap)
+ ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
- if (drop)
+
+ if (capsnap) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
iput(inode);
+ }
}
/*
@@ -2649,12 +4017,13 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
*
* caller hold s_mutex.
*/
-static void handle_cap_trunc(struct inode *inode,
+static bool handle_cap_trunc(struct inode *inode,
struct ceph_mds_caps *trunc,
- struct ceph_mds_session *session)
- __releases(ci->i_ceph_lock)
+ struct ceph_mds_session *session,
+ struct cap_extra_info *extra_info)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(trunc->seq);
u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
@@ -2663,18 +4032,25 @@ static void handle_cap_trunc(struct inode *inode,
int implemented = 0;
int dirty = __ceph_caps_dirty(ci);
int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
- int queue_trunc = 0;
+ bool queue_trunc = false;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
issued |= implemented | dirty;
- dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
- inode, mds, seq, truncate_size, truncate_seq);
+ /*
+ * If there is at least one crypto block then we'll trust
+ * fscrypt_file_size. If the real length of the file is 0, then
+ * ignore it (it has probably been truncated down to 0 by the MDS).
+ */
+ if (IS_ENCRYPTED(inode) && size)
+ size = extra_info->fscrypt_file_size;
+
+ doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
+ inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
- spin_unlock(&ci->i_ceph_lock);
-
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
+ return queue_trunc;
}
/*
@@ -2686,124 +4062,261 @@ static void handle_cap_trunc(struct inode *inode,
* caller holds s_mutex
*/
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
- struct ceph_mds_session *session,
- int *open_target_sessions)
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_session *tsession = NULL;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 t_cap_id;
+ u32 t_issue_seq, t_mseq;
+ int target, issued;
int mds = session->s_mds;
- unsigned mseq = le32_to_cpu(ex->migrate_seq);
- struct ceph_cap *cap = NULL, *t;
- struct rb_node *p;
- int remember = 1;
- dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (ph) {
+ t_cap_id = le64_to_cpu(ph->cap_id);
+ t_issue_seq = le32_to_cpu(ph->issue_seq);
+ t_mseq = le32_to_cpu(ph->mseq);
+ target = le32_to_cpu(ph->mds);
+ } else {
+ t_cap_id = t_issue_seq = t_mseq = 0;
+ target = -1;
+ }
+ doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n",
+ ceph_vinop(inode), target, t_issue_seq, t_mseq);
+retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+ goto out_unlock;
- /* make sure we haven't seen a higher mseq */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- t = rb_entry(p, struct ceph_cap, ci_node);
- if (ceph_seq_cmp(t->mseq, mseq) > 0) {
- dout(" higher mseq on cap from mds%d\n",
- t->session->s_mds);
- remember = 0;
- }
- if (t->session->s_mds == mds)
- cap = t;
+ if (target < 0) {
+ ceph_remove_cap(mdsc, cap, false);
+ goto out_unlock;
}
- if (cap) {
- if (remember) {
- /* make note */
- ci->i_cap_exporting_mds = mds;
- ci->i_cap_exporting_mseq = mseq;
- ci->i_cap_exporting_issued = cap->issued;
-
- /*
- * make sure we have open sessions with all possible
- * export targets, so that we get the matching IMPORT
- */
- *open_target_sessions = 1;
+ /*
+ * now we know we haven't received the cap import message yet
+ * because the exported cap still exist.
+ */
- /*
- * we can't flush dirty caps that we've seen the
- * EXPORT but no IMPORT for
- */
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n",
- inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ issued = cap->issued;
+ if (issued != cap->implemented)
+ pr_err_ratelimited_client(cl, "issued != implemented: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " issued %s implemented %s\n",
+ inode, ceph_vinop(inode), mds,
+ cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
+
+
+ tcap = __get_cap_for_mds(ci, target);
+ if (tcap) {
+ /* already have caps from the target */
+ if (tcap->cap_id == t_cap_id &&
+ ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) {
+ doutc(cl, " updating import cap %p mds%d\n", tcap,
+ target);
+ tcap->cap_id = t_cap_id;
+ tcap->seq = t_issue_seq - 1;
+ tcap->issue_seq = t_issue_seq - 1;
+ tcap->issued |= issued;
+ tcap->implemented |= issued;
+ if (cap == ci->i_auth_cap) {
+ ci->i_auth_cap = tcap;
+ change_auth_cap_ses(ci, tcap->session);
}
+ }
+ ceph_remove_cap(mdsc, cap, false);
+ goto out_unlock;
+ } else if (tsession) {
+ /* add placeholder for the export target */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ tcap = new_cap;
+ ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
+ t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
spin_unlock(&mdsc->cap_dirty_lock);
}
- __ceph_remove_cap(cap);
+
+ ceph_remove_cap(mdsc, cap, false);
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
+ mutex_unlock(&session->s_mutex);
+
+ /* open target session */
+ tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+ if (!IS_ERR(tsession)) {
+ if (mds > target) {
+ mutex_lock(&session->s_mutex);
+ mutex_lock_nested(&tsession->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&tsession->s_mutex);
+ mutex_lock_nested(&session->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ }
+ new_cap = ceph_get_cap(mdsc, NULL);
+ } else {
+ WARN_ON(1);
+ tsession = NULL;
+ target = -1;
+ mutex_lock(&session->s_mutex);
}
- /* else, we already released it */
+ goto retry;
+out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
+ mutex_unlock(&session->s_mutex);
+ if (tsession) {
+ mutex_unlock(&tsession->s_mutex);
+ ceph_put_mds_session(tsession);
+ }
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
}
/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
*
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
*/
static void handle_cap_import(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
+ struct ceph_cap **target_cap, int *old_issued)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
unsigned wanted = le32_to_cpu(im->wanted);
unsigned seq = le32_to_cpu(im->seq);
unsigned mseq = le32_to_cpu(im->migrate_seq);
u64 realmino = le64_to_cpu(im->realm);
u64 cap_id = le64_to_cpu(im->cap_id);
+ u64 p_cap_id;
+ u32 piseq = 0;
+ u32 pmseq = 0;
+ int peer;
+
+ if (ph) {
+ p_cap_id = le64_to_cpu(ph->cap_id);
+ peer = le32_to_cpu(ph->mds);
+ piseq = le32_to_cpu(ph->issue_seq);
+ pmseq = le32_to_cpu(ph->mseq);
+ } else {
+ p_cap_id = 0;
+ peer = -1;
+ }
- if (ci->i_cap_exporting_mds >= 0 &&
- ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d"
- " - cleared exporting from mds%d\n",
- inode, ci, mds, mseq,
- ci->i_cap_exporting_mds);
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
-
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p back to cap_dirty\n", inode);
- list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+ doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n",
+ ceph_vinop(inode), peer, piseq, pmseq);
+retry:
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ spin_lock(&ci->i_ceph_lock);
+ goto retry;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ cap = new_cap;
} else {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
- }
-
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
+ }
- /* make sure we re-request max_size, if necessary */
- spin_lock(&ci->i_ceph_lock);
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
+ doutc(cl, " remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
+ if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+ (ocap->seq != piseq ||
+ ocap->mseq != pmseq)) {
+ pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " importer mds%d has peer seq %d mseq %d\n",
+ inode, ceph_vinop(inode), peer,
+ ocap->seq, ocap->mseq, mds, piseq, pmseq);
+ }
+ ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ }
+
+ *old_issued = issued;
+ *target_cap = cap;
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+static int parse_fscrypt_fields(void **p, void *end,
+ struct cap_extra_info *extra)
+{
+ u32 len;
+
+ ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
+ if (extra->fscrypt_auth_len) {
+ ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
+ extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!extra->fscrypt_auth)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
+ extra->fscrypt_auth_len, bad);
+ }
+
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len >= sizeof(u64)) {
+ ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
+ len -= sizeof(u64);
+ }
+ ceph_decode_skip_n(p, end, len, bad);
+ return 0;
+bad:
+ return -EIO;
}
+#else
+static int parse_fscrypt_fields(void **p, void *end,
+ struct cap_extra_info *extra)
+{
+ u32 len;
+
+ /* Don't care about these fields unless we're encryption-capable */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len)
+ ceph_decode_skip_n(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len)
+ ceph_decode_skip_n(p, end, len, bad);
+ return 0;
+bad:
+ return -EIO;
+}
+#endif
/*
* Handle a caps message from the MDS.
@@ -2815,99 +4328,200 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
- struct super_block *sb = mdsc->fsc->sb;
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
- int mds = session->s_mds;
+ struct ceph_mds_cap_peer *peer = NULL;
+ struct ceph_snap_realm *realm = NULL;
int op;
- u32 seq, mseq;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ u32 seq, mseq, issue_seq;
struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
- u64 tid;
void *snaptrace;
size_t snaptrace_len;
- void *flock;
- u32 flock_len;
- int open_target_sessions = 0;
+ void *p, *end;
+ struct cap_extra_info extra_info = {};
+ bool queue_trunc;
+ bool close_sessions = false;
+ bool do_cap_release = false;
- dout("handle_caps from mds%d\n", mds);
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
/* decode */
- tid = le64_to_cpu(msg->hdr.tid);
+ end = msg->front.iov_base + msg->front.iov_len;
if (msg->front.iov_len < sizeof(*h))
goto bad;
h = msg->front.iov_base;
op = le32_to_cpu(h->op);
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
+ issue_seq = le32_to_cpu(h->issue_seq);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
+ p = snaptrace + snaptrace_len;
- if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p, *end;
-
- p = snaptrace + snaptrace_len;
- end = msg->front.iov_base + msg->front.iov_len;
+ if (msg_version >= 2) {
+ u32 flock_len;
ceph_decode_32_safe(&p, end, flock_len, bad);
- flock = p;
- } else {
- flock = NULL;
- flock_len = 0;
+ if (p + flock_len > end)
+ goto bad;
+ p += flock_len;
}
- mutex_lock(&session->s_mutex);
- session->s_seq++;
- dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
- (unsigned)seq);
+ if (msg_version >= 3) {
+ if (op == CEPH_CAP_OP_IMPORT) {
+ if (p + sizeof(*peer) > end)
+ goto bad;
+ peer = p;
+ p += sizeof(*peer);
+ } else if (op == CEPH_CAP_OP_EXPORT) {
+ /* recorded in unused fields */
+ peer = (void *)&h->size;
+ }
+ }
+
+ if (msg_version >= 4) {
+ ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
+ ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
+ if (p + extra_info.inline_len > end)
+ goto bad;
+ extra_info.inline_data = p;
+ p += extra_info.inline_len;
+ }
+
+ if (msg_version >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
+ if (msg_version >= 8) {
+ u32 pool_ns_len;
+
+ /* version >= 6 */
+ ceph_decode_skip_64(&p, end, bad); // flush_tid
+ /* version >= 7 */
+ ceph_decode_skip_32(&p, end, bad); // caller_uid
+ ceph_decode_skip_32(&p, end, bad); // caller_gid
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ extra_info.pool_ns =
+ ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
+ }
+
+ if (msg_version >= 9) {
+ struct ceph_timespec *btime;
+
+ if (p + sizeof(*btime) > end)
+ goto bad;
+ btime = p;
+ ceph_decode_timespec64(&extra_info.btime, btime);
+ p += sizeof(*btime);
+ ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
+ }
+
+ if (msg_version >= 11) {
+ /* version >= 10 */
+ ceph_decode_skip_32(&p, end, bad); // flags
+ /* version >= 11 */
+ extra_info.dirstat_valid = true;
+ ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
+ ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
+ }
- if (op == CEPH_CAP_OP_IMPORT)
- ceph_add_cap_releases(mdsc, session);
+ if (msg_version >= 12) {
+ if (parse_fscrypt_fields(&p, end, &extra_info))
+ goto bad;
+ }
/* lookup ino */
- inode = ceph_find_inode(sb, vino);
- ci = ceph_inode(inode);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
+ inode = ceph_find_inode(mdsc->fsc->sb, vino);
+ doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n",
+ session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode,
+ seq, issue_seq, mseq);
+
+ mutex_lock(&session->s_mutex);
+
if (!inode) {
- dout(" i don't have ino %llx\n", vino.ino);
+ doutc(cl, " i don't have ino %llx\n", vino.ino);
- if (op == CEPH_CAP_OP_IMPORT)
- __queue_cap_release(session, vino.ino, cap_id,
- mseq, seq);
+ switch (op) {
+ case CEPH_CAP_OP_IMPORT:
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ do_cap_release = true;
+ break;
+ default:
+ break;
+ }
goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
case CEPH_CAP_OP_FLUSHSNAP_ACK:
- handle_cap_flushsnap_ack(inode, tid, h, session);
+ handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
+ h, session);
goto done;
case CEPH_CAP_OP_EXPORT:
- handle_cap_export(inode, h, session, &open_target_sessions);
- goto done;
+ handle_cap_export(inode, h, peer, session);
+ goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
- handle_cap_import(mdsc, inode, h, session,
- snaptrace, snaptrace_len);
+ realm = NULL;
+ if (snaptrace_len) {
+ down_write(&mdsc->snap_rwsem);
+ if (ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len,
+ false, &realm)) {
+ up_write(&mdsc->snap_rwsem);
+ close_sessions = true;
+ goto done;
+ }
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
+ spin_lock(&ci->i_ceph_lock);
+ handle_cap_import(mdsc, inode, h, peer, session,
+ &cap, &extra_info.issued);
+ handle_cap_grant(inode, session, cap,
+ h, msg->middle, &extra_info);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+ goto done_unlocked;
}
/* the rest require a cap */
spin_lock(&ci->i_ceph_lock);
- cap = __get_cap_for_mds(ceph_inode(inode), mds);
+ cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
if (!cap) {
- dout(" no cap on %p ino %llx.%llx from mds%d\n",
- inode, ceph_ino(inode), ceph_snap(inode), mds);
+ doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
+ inode, ceph_ino(inode), ceph_snap(inode),
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ do_cap_release = true;
+ break;
+ default:
+ break;
+ }
goto flush_cap_releases;
}
@@ -2915,100 +4529,228 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
- case CEPH_CAP_OP_IMPORT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
+ __ceph_caps_issued(ci, &extra_info.issued);
+ extra_info.issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(inode, session, cap,
+ h, msg->middle, &extra_info);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
- handle_cap_flush_ack(inode, tid, h, session, cap);
+ handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
+ h, session, cap);
break;
case CEPH_CAP_OP_TRUNC:
- handle_cap_trunc(inode, h, session);
+ queue_trunc = handle_cap_trunc(inode, h, session,
+ &extra_info);
+ spin_unlock(&ci->i_ceph_lock);
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
break;
default:
spin_unlock(&ci->i_ceph_lock);
- pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
- ceph_cap_op_name(op));
+ pr_err_client(cl, "unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
}
- goto done;
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ iput(inode);
+out:
+ ceph_dec_mds_stopping_blocker(mdsc);
+
+ ceph_put_string(extra_info.pool_ns);
+
+ /* Defer closing the sessions after s_mutex lock being released */
+ if (close_sessions)
+ ceph_mdsc_close_sessions(mdsc);
+
+ kfree(extra_info.fscrypt_auth);
+ return;
flush_cap_releases:
/*
- * send any full release message to try to move things
+ * send any cap release message to try to move things
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_add_cap_releases(mdsc, session);
- ceph_send_cap_releases(mdsc, session);
-
-done:
- mutex_unlock(&session->s_mutex);
-done_unlocked:
- if (inode)
- iput(inode);
- if (open_target_sessions)
- ceph_mdsc_open_export_target_sessions(mdsc, session);
- return;
+ if (do_cap_release) {
+ cap = ceph_get_cap(mdsc, NULL);
+ cap->cap_ino = vino.ino;
+ cap->queue_release = 1;
+ cap->cap_id = le64_to_cpu(h->cap_id);
+ cap->mseq = mseq;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ spin_lock(&session->s_cap_lock);
+ __ceph_queue_cap_release(session, cap);
+ spin_unlock(&session->s_cap_lock);
+ }
+ ceph_flush_session_cap_releases(mdsc, session);
+ goto done;
bad:
- pr_err("ceph_handle_caps: corrupt message\n");
+ pr_err_client(cl, "corrupt message\n");
ceph_msg_dump(msg);
- return;
+ goto out;
}
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, these won't get
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
+ * returned so that the work can be scheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct inode *inode;
struct ceph_inode_info *ci;
- int flags = CHECK_CAPS_NODELAY;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
- dout("check_delayed_caps\n");
- while (1) {
- spin_lock(&mdsc->cap_delay_lock);
- if (list_empty(&mdsc->cap_delay_list))
- break;
+ doutc(cl, "begin\n");
+ spin_lock(&mdsc->cap_delay_lock);
+ while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ doutc(cl, "caps added recently. Exiting loop");
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
list_del_init(&ci->i_cap_delay_list);
- spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", &ci->vfs_inode);
- ceph_check_caps(ci, flags, NULL);
+
+ inode = igrab(&ci->netfs.inode);
+ if (inode) {
+ spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ ceph_check_caps(ci, 0);
+ iput(inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ }
+
+ /*
+ * Make sure too many dirty caps or general
+ * slowness doesn't block mdsc delayed work,
+ * preventing send_renew_caps() from running.
+ */
+ if (time_after_eq(jiffies, loop_start + 5 * HZ))
+ break;
}
spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "done\n");
+
+ return delay;
}
/*
* Flush all dirty caps to the mds
*/
-void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+static void flush_dirty_session_caps(struct ceph_mds_session *s)
{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
- dout("flush_dirty_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_dirty_lock);
- while (!list_empty(&mdsc->cap_dirty)) {
- ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ while (!list_empty(&s->s_cap_dirty)) {
+ ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
- dout("flush_dirty_caps %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
- ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+ ceph_wait_on_async_create(inode);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("flush_dirty_caps done\n");
+ doutc(cl, "done\n");
+}
+
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
+}
+
+/*
+ * Flush all cap releases to the mds
+ */
+static void flush_cap_releases(struct ceph_mds_session *s)
+{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&s->s_cap_lock);
+ if (s->s_num_cap_releases)
+ ceph_flush_session_cap_releases(mdsc, s);
+ spin_unlock(&s->s_cap_lock);
+ doutc(cl, "done\n");
+
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true);
+}
+
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode)
+{
+ unsigned long now = jiffies;
+ if (fmode & CEPH_FILE_MODE_RD)
+ ci->i_last_rd = now;
+ if (fmode & CEPH_FILE_MODE_WR)
+ ci->i_last_wr = now;
+ /* queue periodic check */
+ if (fmode &&
+ __ceph_is_any_real_caps(ci) &&
+ list_empty(&ci->i_cap_delay_list))
+ __cap_delay_requeue(mdsc, ci);
+}
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
+ int bits = (fmode << 1) | 1;
+ bool already_opened = false;
+ int i;
+
+ if (count == 1)
+ atomic64_inc(&mdsc->metric.opened_files);
+
+ spin_lock(&ci->i_ceph_lock);
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ /*
+ * If any of the mode ref is larger than 0,
+ * that means it has been already opened by
+ * others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ already_opened = true;
+
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i] += count;
+ }
+
+ if (!already_opened)
+ percpu_counter_inc(&mdsc->metric.opened_inodes);
+ spin_unlock(&ci->i_ceph_lock);
}
/*
@@ -3016,21 +4758,75 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
+ int bits = (fmode << 1) | 1;
+ bool is_closed = true;
+ int i;
+
+ if (count == 1)
+ atomic64_dec(&mdsc->metric.opened_files);
spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] < count);
+ ci->i_nr_by_mode[i] -= count;
+ }
+
+ /*
+ * If any of the mode ref is not 0 after
+ * decreased, that means it is still opened
+ * by others. Just skip checking the PIN ref.
+ */
+ if (i && ci->i_nr_by_mode[i])
+ is_closed = false;
+ }
+
+ if (is_closed)
+ percpu_counter_dec(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
+}
- if (last && ci->i_vino.snap == CEPH_NOSNAP)
- ceph_check_caps(ci, 0, NULL);
+/*
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+int ceph_drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+
+ if (__ceph_caps_dirty(ci)) {
+ struct ceph_mds_client *mdsc =
+ ceph_inode_to_fs_client(inode)->mdsc;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add_tail(&ci->i_cap_delay_list,
+ &mdsc->cap_unlink_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+
+ /*
+ * Fire the work immediately, because the MDS maybe
+ * waiting for caps release.
+ */
+ ceph_queue_cap_unlink_work(mdsc);
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return drop;
}
/*
@@ -3045,6 +4841,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
int used, dirty;
@@ -3054,68 +4851,96 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
- inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
- ceph_cap_string(unless));
+ doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n",
+ inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty),
+ ceph_cap_string(drop), ceph_cap_string(unless));
/* only drop unused, clean caps */
drop &= ~(used | dirty);
cap = __get_cap_for_mds(ci, mds);
if (cap && __cap_is_valid(cap)) {
- if (force ||
- ((cap->issued & drop) &&
- (cap->issued & unless) == 0)) {
- if ((cap->issued & drop) &&
- (cap->issued & unless) == 0) {
+ unless &= cap->issued;
+ if (unless) {
+ if (unless & CEPH_CAP_AUTH_EXCL)
+ drop &= ~CEPH_CAP_AUTH_SHARED;
+ if (unless & CEPH_CAP_LINK_EXCL)
+ drop &= ~CEPH_CAP_LINK_SHARED;
+ if (unless & CEPH_CAP_XATTR_EXCL)
+ drop &= ~CEPH_CAP_XATTR_SHARED;
+ if (unless & CEPH_CAP_FILE_EXCL)
+ drop &= ~CEPH_CAP_FILE_SHARED;
+ }
+
+ if (force || (cap->issued & drop)) {
+ if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
- wanted |= cap->mds_wanted;
- dout("encode_inode_release %p cap %p "
- "%s -> %s, wanted %s -> %s\n", inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
+ "wanted %s -> %s\n", inode,
+ ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
cap->issued &= ~drop;
cap->implemented &= ~drop;
cap->mds_wanted = wanted;
+ if (cap == ci->i_auth_cap &&
+ !(wanted & CEPH_CAP_ANY_FILE_WR))
+ ci->i_requested_max_size = 0;
} else {
- dout("encode_inode_release %p cap %p %s"
- " (force)\n", inode, cap,
- ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
rel->ino = cpu_to_le64(ceph_ino(inode));
rel->cap_id = cpu_to_le64(cap->cap_id);
rel->seq = cpu_to_le32(cap->seq);
- rel->issue_seq = cpu_to_le32(cap->issue_seq),
+ rel->issue_seq = cpu_to_le32(cap->issue_seq);
rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
+ rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
rel->dname_len = 0;
rel->dname_seq = 0;
*p += sizeof(*rel);
ret = 1;
} else {
- dout("encode_inode_release %p cap %p %s\n",
- inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
}
spin_unlock(&ci->i_ceph_lock);
return ret;
}
+/**
+ * ceph_encode_dentry_release - encode a dentry release into an outgoing request
+ * @p: outgoing request buffer
+ * @dentry: dentry to release
+ * @dir: dir to release it from
+ * @mds: mds that we're speaking to
+ * @drop: caps being dropped
+ * @unless: unless we have these caps
+ *
+ * Encode a dentry release into an outgoing request buffer. Returns 1 if the
+ * thing was released, or a negative error code otherwise.
+ */
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+ struct inode *dir,
int mds, int drop, int unless)
{
- struct inode *dir = dentry->d_parent->d_inode;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_client *cl;
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -3129,16 +4954,145 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+ cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
- dout("encode_dentry_release %p mds%d seq %d\n",
- dentry, mds, (int)di->lease_seq);
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
+ int len = dentry->d_name.len;
+ doutc(cl, "%p mds%d seq %d\n", dentry, mds,
+ (int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
+ memcpy(*p, dentry->d_name.name, len);
+ spin_unlock(&dentry->d_lock);
+ if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
+ len = ceph_encode_encrypted_dname(dir, *p, len);
+ if (len < 0)
+ return len;
+ }
+ rel->dname_len = cpu_to_le32(len);
+ *p += len;
+ } else {
+ spin_unlock(&dentry->d_lock);
}
- spin_unlock(&dentry->d_lock);
return ret;
}
+
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_cap_snap *capsnap;
+ int capsnap_release = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
+ ci, inode, ceph_vinop(inode));
+
+ while (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ __ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ capsnap_release++;
+ }
+ wake_up_all(&ci->i_cap_wq);
+ wake_up_all(&mdsc->cap_flushing_wq);
+ return capsnap_release;
+}
+
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_auth;
+ bool dirty_dropped = false;
+ int iputs = 0;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
+ cap, ci, inode, ceph_vinop(inode));
+
+ is_auth = (cap == ci->i_auth_cap);
+ __ceph_remove_cap(cap, false);
+ if (is_auth) {
+ struct ceph_cap_flush *cf;
+
+ if (ceph_inode_is_shutdown(inode)) {
+ if (inode->i_data.nrpages > 0)
+ *invalidate = true;
+ if (ci->i_wrbuffer_ref > 0)
+ mapping_set_error(&inode->i_data, -EIO);
+ }
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ /* trash all of the cap flushes for this inode */
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del_init(&cf->g_list);
+ list_del_init(&cf->i_list);
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_dirty_item)) {
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty %s state for %p %llx.%llx\n",
+ ceph_cap_string(ci->i_dirty_caps),
+ inode, ceph_vinop(inode));
+ ci->i_dirty_caps = 0;
+ list_del_init(&ci->i_dirty_item);
+ dirty_dropped = true;
+ }
+ if (!list_empty(&ci->i_flushing_item)) {
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty+flushing %s state for %p %llx.%llx\n",
+ ceph_cap_string(ci->i_flushing_caps),
+ inode, ceph_vinop(inode));
+ ci->i_flushing_caps = 0;
+ list_del_init(&ci->i_flushing_item);
+ mdsc->num_cap_flushing--;
+ dirty_dropped = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (dirty_dropped) {
+ mapping_set_error(inode->i_mapping, -EIO);
+
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ }
+
+ if (atomic_read(&ci->i_filelock_ref) > 0) {
+ /* make further file lock syscall return -EIO */
+ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+ pr_warn_ratelimited_client(cl,
+ " dropping file locks for %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
+ }
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ cf = ci->i_prealloc_cap_flush;
+ ci->i_prealloc_cap_flush = NULL;
+ if (!cf->is_capsnap)
+ ceph_free_cap_flush(cf);
+ }
+
+ if (!list_empty(&ci->i_cap_snaps))
+ iputs = remove_capsnaps(mdsc, inode);
+ }
+ if (dirty_dropped)
+ ++iputs;
+ return iputs;
+}
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index bdce8b1fbd06..6f67d5b884a0 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Ceph 'frag' type
*/
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..0ea4db650f85
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,604 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The base64 encode/decode code was copied from fscrypt:
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ * Written by Uday Savagaonkar, 2014.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#include <linux/ceph/ceph_debug.h>
+#include <linux/xattr.h>
+#include <linux/fscrypt.h>
+#include <linux/ceph/striper.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "crypto.h"
+
+static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth;
+ u32 ctxlen;
+
+ /* Non existent or too short? */
+ if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1)))
+ return -ENOBUFS;
+
+ /* Some format we don't recognize? */
+ if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION)
+ return -ENOBUFS;
+
+ ctxlen = le32_to_cpu(cfa->cfa_blob_len);
+ if (len < ctxlen)
+ return -ERANGE;
+
+ memcpy(ctx, cfa->cfa_blob, ctxlen);
+ return ctxlen;
+}
+
+static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
+ size_t len, void *fs_data)
+{
+ int ret;
+ struct iattr attr = { };
+ struct ceph_iattr cia = { };
+ struct ceph_fscrypt_auth *cfa;
+
+ WARN_ON_ONCE(fs_data);
+
+ if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE)
+ return -EINVAL;
+
+ cfa = kzalloc(sizeof(*cfa), GFP_KERNEL);
+ if (!cfa)
+ return -ENOMEM;
+
+ cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION);
+ cfa->cfa_blob_len = cpu_to_le32(len);
+ memcpy(cfa->cfa_blob, ctx, len);
+
+ cia.fscrypt_auth = cfa;
+
+ ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia);
+ if (ret == 0)
+ inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+ kfree(cia.fscrypt_auth);
+ return ret;
+}
+
+static bool ceph_crypt_empty_dir(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return ci->i_rsubdirs + ci->i_rfiles == 1;
+}
+
+static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
+{
+ return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy;
+}
+
+static struct fscrypt_operations ceph_fscrypt_ops = {
+ .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) -
+ (int)offsetof(struct ceph_inode_info, netfs.inode),
+ .needs_bounce_pages = 1,
+ .get_context = ceph_crypt_get_context,
+ .set_context = ceph_crypt_set_context,
+ .get_dummy_policy = ceph_get_dummy_policy,
+ .empty_dir = ceph_crypt_empty_dir,
+};
+
+void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+ fscrypt_set_ops(sb, &ceph_fscrypt_ops);
+}
+
+void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc)
+{
+ fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy);
+}
+
+int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
+ struct ceph_acl_sec_ctx *as)
+{
+ int ret, ctxsize;
+ bool encrypted = false;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ret = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (ret)
+ return ret;
+ if (!encrypted)
+ return 0;
+
+ as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL);
+ if (!as->fscrypt_auth)
+ return -ENOMEM;
+
+ ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob,
+ inode);
+ if (ctxsize < 0)
+ return ctxsize;
+
+ as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION);
+ as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize);
+
+ WARN_ON_ONCE(ci->fscrypt_auth);
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth);
+ ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!ci->fscrypt_auth)
+ return -ENOMEM;
+
+ inode->i_flags |= S_ENCRYPTED;
+
+ return 0;
+}
+
+void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as)
+{
+ swap(req->r_fscrypt_auth, as->fscrypt_auth);
+}
+
+/*
+ * User-created snapshots can't start with '_'. Snapshots that start with this
+ * character are special (hint: there aren't real snapshots) and use the
+ * following format:
+ *
+ * _<SNAPSHOT-NAME>_<INODE-NUMBER>
+ *
+ * where:
+ * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted,
+ * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot
+ *
+ * This function parses these snapshot names and returns the inode
+ * <INODE-NUMBER>. 'name_len' will also bet set with the <SNAPSHOT-NAME>
+ * length.
+ */
+static struct inode *parse_longname(const struct inode *parent,
+ const char *name, int *name_len)
+{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
+ struct inode *dir = NULL;
+ struct ceph_vino vino = { .snap = CEPH_NOSNAP };
+ char *name_end, *inode_number;
+ int ret = -EIO;
+ /* NUL-terminate */
+ char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL);
+ if (!str)
+ return ERR_PTR(-ENOMEM);
+ /* Skip initial '_' */
+ str++;
+ name_end = strrchr(str, '_');
+ if (!name_end) {
+ doutc(cl, "failed to parse long snapshot name: %s\n", str);
+ return ERR_PTR(-EIO);
+ }
+ *name_len = (name_end - str);
+ if (*name_len <= 0) {
+ pr_err_client(cl, "failed to parse long snapshot name\n");
+ return ERR_PTR(-EIO);
+ }
+
+ /* Get the inode number */
+ inode_number = name_end + 1;
+ ret = kstrtou64(inode_number, 10, &vino.ino);
+ if (ret) {
+ doutc(cl, "failed to parse inode number: %s\n", str);
+ return ERR_PTR(ret);
+ }
+
+ /* And finally the inode */
+ dir = ceph_find_inode(parent->i_sb, vino);
+ if (!dir) {
+ /* This can happen if we're not mounting cephfs on the root */
+ dir = ceph_get_inode(parent->i_sb, vino, NULL);
+ if (IS_ERR(dir))
+ doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
+ }
+ return dir;
+}
+
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
+{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
+ struct inode *dir = parent;
+ char *p = buf;
+ u32 len;
+ int name_len = elen;
+ int ret;
+ u8 *cryptbuf = NULL;
+
+ /* Handle the special case of snapshot names that start with '_' */
+ if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') {
+ dir = parse_longname(parent, p, &name_len);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ p++; /* skip initial '_' */
+ }
+
+ if (!fscrypt_has_encryption_key(dir))
+ goto out;
+
+ /*
+ * Convert cleartext d_name to ciphertext. If result is longer than
+ * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes
+ *
+ * See: fscrypt_setup_filename
+ */
+ if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) {
+ elen = -ENAMETOOLONG;
+ goto out;
+ }
+
+ /* Allocate a buffer appropriate to hold the result */
+ cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? NAME_MAX : len,
+ GFP_KERNEL);
+ if (!cryptbuf) {
+ elen = -ENOMEM;
+ goto out;
+ }
+
+ ret = fscrypt_fname_encrypt(dir,
+ &(struct qstr)QSTR_INIT(p, name_len),
+ cryptbuf, len);
+ if (ret) {
+ elen = ret;
+ goto out;
+ }
+
+ /* hash the end if the name is long enough */
+ if (len > CEPH_NOHASH_NAME_MAX) {
+ u8 hash[SHA256_DIGEST_SIZE];
+ u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX;
+
+ /*
+ * hash the extra bytes and overwrite crypttext beyond that
+ * point with it
+ */
+ sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash);
+ memcpy(extra, hash, SHA256_DIGEST_SIZE);
+ len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE;
+ }
+
+ /* base64 encode the encrypted name */
+ elen = base64_encode(cryptbuf, len, p, false, BASE64_IMAP);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p);
+
+ /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
+ WARN_ON(elen > 240);
+ if (dir != parent) // leading _ is already there; append _<inum>
+ elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino);
+
+out:
+ kfree(cryptbuf);
+ if (dir != parent) {
+ if ((inode_state_read_once(dir) & I_NEW))
+ discard_new_inode(dir);
+ else
+ iput(dir);
+ }
+ return elen;
+}
+
+/**
+ * ceph_fname_to_usr - convert a filename for userland presentation
+ * @fname: ceph_fname to be converted
+ * @tname: temporary name buffer to use for conversion (may be NULL)
+ * @oname: where converted name should be placed
+ * @is_nokey: set to true if key wasn't available during conversion (may be NULL)
+ *
+ * Given a filename (usually from the MDS), format it for presentation to
+ * userland. If @parent is not encrypted, just pass it back as-is.
+ *
+ * Otherwise, base64 decode the string, and then ask fscrypt to format it
+ * for userland presentation.
+ *
+ * Returns 0 on success or negative error code on error.
+ */
+int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey)
+{
+ struct inode *dir = fname->dir;
+ struct fscrypt_str _tname = FSTR_INIT(NULL, 0);
+ struct fscrypt_str iname;
+ char *name = fname->name;
+ int name_len = fname->name_len;
+ int ret;
+
+ /* Sanity check that the resulting name will fit in the buffer */
+ if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX)
+ return -EIO;
+
+ /* Handle the special case of snapshot names that start with '_' */
+ if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
+ (name[0] == '_')) {
+ dir = parse_longname(dir, name, &name_len);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ name++; /* skip initial '_' */
+ }
+
+ if (!IS_ENCRYPTED(dir)) {
+ oname->name = fname->name;
+ oname->len = fname->name_len;
+ ret = 0;
+ goto out_inode;
+ }
+
+ ret = ceph_fscrypt_prepare_readdir(dir);
+ if (ret)
+ goto out_inode;
+
+ /*
+ * Use the raw dentry name as sent by the MDS instead of
+ * generating a nokey name via fscrypt.
+ */
+ if (!fscrypt_has_encryption_key(dir)) {
+ if (fname->no_copy)
+ oname->name = fname->name;
+ else
+ memcpy(oname->name, fname->name, fname->name_len);
+ oname->len = fname->name_len;
+ if (is_nokey)
+ *is_nokey = true;
+ ret = 0;
+ goto out_inode;
+ }
+
+ if (fname->ctext_len == 0) {
+ int declen;
+
+ if (!tname) {
+ ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname);
+ if (ret)
+ goto out_inode;
+ tname = &_tname;
+ }
+
+ declen = base64_decode(name, name_len,
+ tname->name, false, BASE64_IMAP);
+ if (declen <= 0) {
+ ret = -EIO;
+ goto out;
+ }
+ iname.name = tname->name;
+ iname.len = declen;
+ } else {
+ iname.name = fname->ctext;
+ iname.len = fname->ctext_len;
+ }
+
+ ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname);
+ if (!ret && (dir != fname->dir)) {
+ char tmp_buf[BASE64_CHARS(NAME_MAX)];
+
+ name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
+ oname->len, oname->name, dir->i_ino);
+ memcpy(oname->name, tmp_buf, name_len);
+ oname->len = name_len;
+ }
+
+out:
+ fscrypt_fname_free_buffer(&_tname);
+out_inode:
+ if (dir != fname->dir) {
+ if ((inode_state_read_once(dir) & I_NEW))
+ discard_new_inode(dir);
+ else
+ iput(dir);
+ }
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper
+ * @dir: directory inode for readdir prep
+ *
+ * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as
+ * non-complete if this call results in having the directory unlocked.
+ *
+ * Returns:
+ * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked)
+ * 0 - if directory is still locked
+ * < 0 - if __fscrypt_prepare_readdir() fails
+ */
+int ceph_fscrypt_prepare_readdir(struct inode *dir)
+{
+ bool had_key = fscrypt_has_encryption_key(dir);
+ int err;
+
+ if (!IS_ENCRYPTED(dir))
+ return 0;
+
+ err = __fscrypt_prepare_readdir(dir);
+ if (err)
+ return err;
+ if (!had_key && fscrypt_has_encryption_key(dir)) {
+ /* directory just got unlocked, mark it as not complete */
+ ceph_dir_clear_complete(dir);
+ return 1;
+ }
+ return 0;
+}
+
+int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
+ return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num);
+}
+
+int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
+ return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num);
+}
+
+/**
+ * ceph_fscrypt_decrypt_pages - decrypt an array of pages
+ * @inode: pointer to inode associated with these pages
+ * @page: pointer to page array
+ * @off: offset into the file that the read data starts
+ * @len: max length to decrypt
+ *
+ * Decrypt an array of fscrypt'ed pages and return the amount of
+ * data decrypted. Any data in the page prior to the start of the
+ * first complete block in the read is ignored. Any incomplete
+ * crypto blocks at the end of the array are ignored (and should
+ * probably be zeroed by the caller).
+ *
+ * Returns the length of the decrypted data or a negative errno.
+ */
+int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
+ u64 off, int len)
+{
+ int i, num_blocks;
+ u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ int ret = 0;
+
+ /*
+ * We can't deal with partial blocks on an encrypted file, so mask off
+ * the last bit.
+ */
+ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK);
+
+ /* Decrypt each block */
+ for (i = 0; i < num_blocks; ++i) {
+ int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT;
+ int pgidx = blkoff >> PAGE_SHIFT;
+ unsigned int pgoffs = offset_in_page(blkoff);
+ int fret;
+
+ fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx],
+ CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
+ baseblk + i);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret += CEPH_FSCRYPT_BLOCK_SIZE;
+ }
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer
+ * @inode: inode associated with pages being decrypted
+ * @page: pointer to page array
+ * @off: offset into the file that the data in page[0] starts
+ * @map: pointer to extent array
+ * @ext_cnt: length of extent array
+ *
+ * Given an extent map and a page array, decrypt the received data in-place,
+ * skipping holes. Returns the offset into buffer of end of last decrypted
+ * block.
+ */
+int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
+ u64 off, struct ceph_sparse_extent *map,
+ u32 ext_cnt)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int i, ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Nothing to do for empty array */
+ if (ext_cnt == 0) {
+ doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode,
+ ceph_vinop(inode));
+ return 0;
+ }
+
+ ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len,
+ &objno, &objoff, &xlen);
+
+ for (i = 0; i < ext_cnt; ++i) {
+ struct ceph_sparse_extent *ext = &map[i];
+ int pgsoff = ext->off - objoff;
+ int pgidx = pgsoff >> PAGE_SHIFT;
+ int fret;
+
+ if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) {
+ pr_warn_client(cl,
+ "%p %llx.%llx bad encrypted sparse extent "
+ "idx %d off %llx len %llx\n",
+ inode, ceph_vinop(inode), i, ext->off,
+ ext->len);
+ return -EIO;
+ }
+ fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx],
+ off + pgsoff, ext->len);
+ doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode,
+ ceph_vinop(inode), i, ext->off, ext->len, fret);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret = pgsoff + fret;
+ }
+ doutc(cl, "ret %d\n", ret);
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_encrypt_pages - encrypt an array of pages
+ * @inode: pointer to inode associated with these pages
+ * @page: pointer to page array
+ * @off: offset into the file that the data starts
+ * @len: max length to encrypt
+ *
+ * Encrypt an array of cleartext pages and return the amount of
+ * data encrypted. Any data in the page prior to the start of the
+ * first complete block in the read is ignored. Any incomplete
+ * crypto blocks at the end of the array are ignored.
+ *
+ * Returns the length of the encrypted data or a negative errno.
+ */
+int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
+ int len)
+{
+ int i, num_blocks;
+ u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ int ret = 0;
+
+ /*
+ * We can't deal with partial blocks on an encrypted file, so mask off
+ * the last bit.
+ */
+ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK);
+
+ /* Encrypt each block */
+ for (i = 0; i < num_blocks; ++i) {
+ int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT;
+ int pgidx = blkoff >> PAGE_SHIFT;
+ unsigned int pgoffs = offset_in_page(blkoff);
+ int fret;
+
+ fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx],
+ CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
+ baseblk + i);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret += CEPH_FSCRYPT_BLOCK_SIZE;
+ }
+ return ret;
+}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..b748e2060bc9
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ceph fscrypt functionality
+ */
+
+#ifndef _CEPH_CRYPTO_H
+#define _CEPH_CRYPTO_H
+
+#include <crypto/sha2.h>
+#include <linux/fscrypt.h>
+#include <linux/base64.h>
+
+#define CEPH_FSCRYPT_BLOCK_SHIFT 12
+#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT)
+#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1))
+
+struct ceph_fs_client;
+struct ceph_acl_sec_ctx;
+struct ceph_mds_request;
+
+struct ceph_fname {
+ struct inode *dir;
+ char *name; // b64 encoded, possibly hashed
+ unsigned char *ctext; // binary crypttext (if any)
+ u32 name_len; // length of name buffer
+ u32 ctext_len; // length of crypttext
+ bool no_copy;
+};
+
+/*
+ * Header for the encrypted file when truncating the size, this
+ * will be sent to MDS, and the MDS will update the encrypted
+ * last block and then truncate the size.
+ */
+struct ceph_fscrypt_truncate_size_header {
+ __u8 ver;
+ __u8 compat;
+
+ /*
+ * It will be sizeof(assert_ver + file_offset + block_size)
+ * if the last block is empty when it's located in a file
+ * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE.
+ */
+ __le32 data_len;
+
+ __le64 change_attr;
+ __le64 file_offset;
+ __le32 block_size;
+} __packed;
+
+struct ceph_fscrypt_auth {
+ __le32 cfa_version;
+ __le32 cfa_blob_len;
+ u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE];
+} __packed;
+
+#define CEPH_FSCRYPT_AUTH_VERSION 1
+static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa)
+{
+ u32 ctxsize = le32_to_cpu(fa->cfa_blob_len);
+
+ return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize;
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+/*
+ * We want to encrypt filenames when creating them, but the encrypted
+ * versions of those names may have illegal characters in them. To mitigate
+ * that, we base64 encode them, but that gives us a result that can exceed
+ * NAME_MAX.
+ *
+ * Follow a similar scheme to fscrypt itself, and cap the filename to a
+ * smaller size. If the ciphertext name is longer than the value below, then
+ * sha256 hash the remaining bytes.
+ *
+ * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph
+ * so the corresponding struct will be:
+ *
+ * struct fscrypt_ceph_nokey_name {
+ * u8 bytes[157];
+ * u8 sha256[SHA256_DIGEST_SIZE];
+ * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255)
+ *
+ * (240 bytes is the maximum size allowed for snapshot names to take into
+ * account the format: '_<SNAPSHOT-NAME>_<INODE-NUMBER>'.)
+ *
+ * Note that for long names that end up having their tail portion hashed, we
+ * must also store the full encrypted name (in the dentry's alternate_name
+ * field).
+ */
+#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE)
+
+void ceph_fscrypt_set_ops(struct super_block *sb);
+
+void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc);
+
+int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
+ struct ceph_acl_sec_ctx *as);
+void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as);
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len);
+
+static inline int ceph_fname_alloc_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ if (!IS_ENCRYPTED(parent))
+ return 0;
+ return fscrypt_fname_alloc_buffer(NAME_MAX, fname);
+}
+
+static inline void ceph_fname_free_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ if (IS_ENCRYPTED(parent))
+ fscrypt_fname_free_buffer(fname);
+}
+
+int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey);
+int ceph_fscrypt_prepare_readdir(struct inode *dir);
+
+static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len)
+{
+ /* crypto blocks cannot span more than one page */
+ BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT);
+
+ return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) -
+ (off >> CEPH_FSCRYPT_BLOCK_SHIFT);
+}
+
+/*
+ * If we have an encrypted inode then we must adjust the offset and
+ * range of the on-the-wire read to cover an entire encryption block.
+ * The copy will be done using the original offset and length, after
+ * we've decrypted the result.
+ */
+static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode,
+ u64 *off, u64 *len)
+{
+ if (IS_ENCRYPTED(inode)) {
+ *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE;
+ *off &= CEPH_FSCRYPT_BLOCK_MASK;
+ }
+}
+
+int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num);
+int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num);
+int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
+ u64 off, int len);
+int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
+ u64 off, struct ceph_sparse_extent *map,
+ u32 ext_cnt);
+int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
+ int len);
+
+static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
+{
+ return fscrypt_is_bounce_page(page) ? fscrypt_pagecache_page(page) : page;
+}
+
+#else /* CONFIG_FS_ENCRYPTION */
+
+static inline void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+}
+
+static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc)
+{
+}
+
+static inline int ceph_fscrypt_prepare_context(struct inode *dir,
+ struct inode *inode,
+ struct ceph_acl_sec_ctx *as)
+{
+ if (IS_ENCRYPTED(dir))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+}
+
+static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf,
+ int len)
+{
+ return len;
+}
+
+static inline int ceph_fname_alloc_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ return 0;
+}
+
+static inline void ceph_fname_free_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+}
+
+static inline int ceph_fname_to_usr(const struct ceph_fname *fname,
+ struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey)
+{
+ oname->name = fname->name;
+ oname->len = fname->name_len;
+ return 0;
+}
+
+static inline int ceph_fscrypt_prepare_readdir(struct inode *dir)
+{
+ return 0;
+}
+
+static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode,
+ u64 *off, u64 *len)
+{
+}
+
+static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_decrypt_pages(struct inode *inode,
+ struct page **page, u64 off,
+ int len)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_decrypt_extents(struct inode *inode,
+ struct page **page, u64 off,
+ struct ceph_sparse_extent *map,
+ u32 ext_cnt)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_encrypt_pages(struct inode *inode,
+ struct page **page, u64 off,
+ int len)
+{
+ return 0;
+}
+
+static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
+{
+ return page;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
+
+static inline loff_t ceph_fscrypt_page_offset(struct page *page)
+{
+ return page_offset(ceph_fscrypt_pagecache_page(page));
+}
+
+#endif /* _CEPH_CRYPTO_H */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/device.h>
@@ -6,6 +7,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/math64.h>
+#include <linux/ktime.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/mon_client.h>
@@ -17,27 +20,27 @@
#ifdef CONFIG_DEBUG_FS
#include "mds_client.h"
+#include "metric.h"
static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_fs_client *fsc = s->private;
+ struct ceph_mdsmap *mdsmap;
- if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
+ if (!fsc->mdsc || !fsc->mdsc->mdsmap)
return 0;
- seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
- seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
- seq_printf(s, "session_timeout %d\n",
- fsc->mdsc->mdsmap->m_session_timeout);
- seq_printf(s, "session_autoclose %d\n",
- fsc->mdsc->mdsmap->m_session_autoclose);
- for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
- struct ceph_entity_addr *addr =
- &fsc->mdsc->mdsmap->m_info[i].addr;
- int state = fsc->mdsc->mdsmap->m_info[i].state;
-
+ mdsmap = fsc->mdsc->mdsmap;
+ seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", mdsmap->m_root);
+ seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
+ seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
+ for (i = 0; i < mdsmap->possible_max_rank; i++) {
+ struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
+ int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
- ceph_pr_addr(&addr->in_addr),
+ ceph_pr_addr(addr),
ceph_mds_state_name(state));
}
return 0;
@@ -52,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -70,45 +71,46 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
- if (req->r_got_unsafe)
- seq_printf(s, "\t(unsafe)");
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ seq_puts(s, "\t(unsafe)");
else
- seq_printf(s, "\t");
+ seq_puts(s, "\t");
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_dentry->d_parent->d_inode),
- req->r_dentry->d_name.len,
- req->r_dentry->d_name.name,
+ seq_printf(s, " #%llx/%pd (%s)",
+ ceph_ino(d_inode(req->r_dentry->d_parent)),
+ req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- kfree(path);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
- seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
+ seq_printf(s, " #%llx/%pd (%s)",
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
+ req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- kfree(path);
- } else if (req->r_path2) {
+ ceph_mdsc_free_path_info(&path_info);
+ } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
req->r_path2);
@@ -116,49 +118,257 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, " %s", req->r_path2);
}
- seq_printf(s, "\n");
+ seq_puts(s, "\n");
}
mutex_unlock(&mdsc->mutex);
return 0;
}
+#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \
+ s64 _total, _avg, _min, _max, _sq, _st; \
+ _avg = ktime_to_us(avg); \
+ _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \
+ _max = ktime_to_us(max); \
+ _total = total - 1; \
+ _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \
+ _st = int_sqrt64(_sq); \
+ _st = ktime_to_us(_st); \
+ seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \
+ name, total, _avg, _min, _max, _st); \
+}
+
+#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \
+ u64 _min = min == U64_MAX ? 0 : min; \
+ seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \
+ name, total, avg, _min, max, sum); \
+}
+
+static int metrics_file_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
+
+ seq_printf(s, "item total\n");
+ seq_printf(s, "------------------------------------------\n");
+ seq_printf(s, "%-35s%lld\n", "total inodes",
+ percpu_counter_sum(&m->total_inodes));
+ seq_printf(s, "%-35s%lld\n", "opened files",
+ atomic64_read(&m->opened_files));
+ seq_printf(s, "%-35s%lld\n", "pinned i_caps",
+ atomic64_read(&m->total_caps));
+ seq_printf(s, "%-35s%lld\n", "opened inodes",
+ percpu_counter_sum(&m->opened_inodes));
+ return 0;
+}
+
+static const char * const metric_str[] = {
+ "read",
+ "write",
+ "metadata",
+ "copyfrom"
+};
+static int metrics_latency_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total, avg, min, max, sq;
+ int i;
+
+ seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
+ seq_printf(s, "-----------------------------------------------------------------------------------\n");
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ m = &cm->metric[i];
+ spin_lock(&m->lock);
+ total = m->total;
+ avg = m->latency_avg;
+ min = m->latency_min;
+ max = m->latency_max;
+ sq = m->latency_sq_sum;
+ spin_unlock(&m->lock);
+ CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq);
+ }
+
+ return 0;
+}
+
+static int metrics_size_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *cm = &fsc->mdsc->metric;
+ struct ceph_metric *m;
+ s64 total;
+ u64 sum, avg, min, max;
+ int i;
+
+ seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n");
+ seq_printf(s, "----------------------------------------------------------------------------------------\n");
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ /* skip 'metadata' as it doesn't use the size metric */
+ if (i == METRIC_METADATA)
+ continue;
+ m = &cm->metric[i];
+ spin_lock(&m->lock);
+ total = m->total;
+ sum = m->size_sum;
+ avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ min = m->size_min;
+ max = m->size_max;
+ spin_unlock(&m->lock);
+ CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum);
+ }
+
+ return 0;
+}
+
+static int metrics_caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_client_metric *m = &fsc->mdsc->metric;
+ int nr_caps = 0;
+
+ seq_printf(s, "item total miss hit\n");
+ seq_printf(s, "-------------------------------------------------\n");
+
+ seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease",
+ atomic64_read(&m->total_dentries),
+ percpu_counter_sum(&m->d_lease_mis),
+ percpu_counter_sum(&m->d_lease_hit));
+
+ nr_caps = atomic64_read(&m->total_caps);
+ seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps,
+ percpu_counter_sum(&m->i_caps_mis),
+ percpu_counter_sum(&m->i_caps_hit));
+
+ return 0;
+}
+
+static int caps_show_cb(struct inode *inode, int mds, void *p)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct seq_file *s = p;
+ struct ceph_cap *cap;
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap)
+ seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+ cap->session->s_mds,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented));
+ spin_unlock(&ci->i_ceph_lock);
+ return 0;
+}
+
static int caps_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
- int total, avail, used, reserved, min;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ int total, avail, used, reserved, min, i;
+ struct cap_wait *cw;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
"avail\t\t%d\n"
"used\t\t%d\n"
"reserved\t%d\n"
- "min\t%d\n",
+ "min\t\t%d\n\n",
total, avail, used, reserved, min);
+ seq_printf(s, "ino mds issued implemented\n");
+ seq_printf(s, "--------------------------------------------------\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *session;
+
+ session = __ceph_lookup_mds_session(mdsc, i);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ ceph_iterate_session_caps(session, caps_show_cb, s);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ seq_printf(s, "\n\nWaiters:\n--------\n");
+ seq_printf(s, "tgid ino need want\n");
+ seq_printf(s, "-----------------------------------------------------\n");
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
+ seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
+ ceph_cap_string(cw->need),
+ ceph_cap_string(cw->want));
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
return 0;
}
-static int dentry_lru_show(struct seq_file *s, void *ptr)
+static int mds_sessions_show(struct seq_file *s, void *ptr)
{
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
- struct ceph_dentry_info *di;
+ struct ceph_auth_client *ac = fsc->client->monc.auth;
+ struct ceph_options *opt = fsc->client->options;
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+
+ /* The 'num' portion of an 'entity name' */
+ seq_printf(s, "global_id %llu\n", ac->global_id);
- spin_lock(&mdsc->dentry_lru_lock);
- list_for_each_entry(di, &mdsc->dentry_lru, lru) {
- struct dentry *dentry = di->dentry;
- seq_printf(s, "%p %p\t%.*s\n",
- di, dentry, dentry->d_name.len, dentry->d_name.name);
+ /* The -o name mount argument */
+ seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
+
+ /* The list of MDS session rank+state */
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session =
+ __ceph_lookup_mds_session(mdsc, mds);
+ if (!session) {
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "mds.%d %s\n",
+ session->s_mds,
+ ceph_session_state_name(session->s_state));
+
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
}
- spin_unlock(&mdsc->dentry_lru_lock);
+ mutex_unlock(&mdsc->mutex);
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+static int status_show(struct seq_file *s, void *p)
+{
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_entity_inst *inst = &fsc->client->msgr.inst;
+ struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client);
+
+ seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name),
+ ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce));
+ seq_printf(s, "blocklisted: %s\n", str_true_false(fsc->blocklisted));
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
+DEFINE_SHOW_ATTRIBUTE(status);
+DEFINE_SHOW_ATTRIBUTE(metrics_file);
+DEFINE_SHOW_ATTRIBUTE(metrics_latency);
+DEFINE_SHOW_ATTRIBUTE(metrics_size);
+DEFINE_SHOW_ATTRIBUTE(metrics_caps);
/*
@@ -186,85 +396,86 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- dout("ceph_fs_debugfs_cleanup\n");
+ doutc(fsc->client, "begin\n");
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
+ debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
+ debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
- debugfs_remove(fsc->debugfs_dentry_lru);
+ debugfs_remove_recursive(fsc->debugfs_metrics_dir);
+ doutc(fsc->client, "done\n");
}
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- char name[100];
- int err = -ENOMEM;
+ char name[NAME_MAX];
- dout("ceph_fs_debugfs_init\n");
- BUG_ON(!fsc->client->debugfs_dir);
+ doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
fsc->client->debugfs_dir,
fsc,
&congestion_kb_fops);
- if (!fsc->debugfs_congestion_kb)
- goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
+ bdi_dev_name(fsc->sb->s_bdi));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
name);
- if (!fsc->debugfs_bdi)
- goto out;
fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
- 0600,
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mdsmap_fops);
+
+ fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
+ 0400,
fsc->client->debugfs_dir,
fsc,
- &mdsmap_show_fops);
- if (!fsc->debugfs_mdsmap)
- goto out;
+ &mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
- 0600,
+ 0400,
fsc->client->debugfs_dir,
fsc,
- &mdsc_show_fops);
- if (!fsc->debugfs_mdsc)
- goto out;
+ &mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
- 0400,
- fsc->client->debugfs_dir,
- fsc,
- &caps_show_fops);
- if (!fsc->debugfs_caps)
- goto out;
-
- fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0600,
- fsc->client->debugfs_dir,
- fsc,
- &dentry_lru_show_fops);
- if (!fsc->debugfs_dentry_lru)
- goto out;
-
- return 0;
-
-out:
- ceph_fs_debugfs_cleanup(fsc);
- return err;
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &caps_fops);
+
+ fsc->debugfs_status = debugfs_create_file("status",
+ 0400,
+ fsc->client->debugfs_dir,
+ fsc,
+ &status_fops);
+
+ fsc->debugfs_metrics_dir = debugfs_create_dir("metrics",
+ fsc->client->debugfs_dir);
+
+ debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_file_fops);
+ debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_latency_fops);
+ debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_size_fops);
+ debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
+ &metrics_caps_fops);
+ doutc(fsc->client, "done\n");
}
#else /* CONFIG_DEBUG_FS */
-int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
+void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- return 0;
}
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a40ceda47a32..86d7aa594ea9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,13 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/spinlock.h>
-#include <linux/fs_struct.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/xattr.h>
#include "super.h"
#include "mds_client.h"
+#include "crypto.h"
/*
* Directory operations: readdir, lookup, create, link, unlink,
@@ -26,84 +28,154 @@
* point by name.
*/
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
const struct dentry_operations ceph_dentry_ops;
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
+static int __dir_lease_try_check(const struct dentry *dentry);
+
/*
* Initialize ceph dentry state.
*/
-int ceph_init_dentry(struct dentry *dentry)
+static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
- if (dentry->d_fsdata)
- return 0;
-
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+ di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
return -ENOMEM; /* oh well */
- spin_lock(&dentry->d_lock);
- if (dentry->d_fsdata) {
- /* lost a race */
- kmem_cache_free(ceph_dentry_cachep, di);
- goto out_unlock;
- }
-
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
- d_set_d_op(dentry, &ceph_dentry_ops);
- else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
- d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
- else
- d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_time = jiffies;
- /* avoid reordering d_fsdata setup so that the check above is safe */
- smp_mb();
+ di->time = jiffies;
dentry->d_fsdata = di;
- ceph_dentry_lru_add(dentry);
-out_unlock:
- spin_unlock(&dentry->d_lock);
+ INIT_LIST_HEAD(&di->lease_list);
+
+ atomic64_inc(&mdsc->metric.total_dentries);
+
return 0;
}
-struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+/*
+ * for f_pos for readdir:
+ * - hash order:
+ * (0xff << 52) | ((24 bits hash) << 28) |
+ * (the nth entry has hash collision);
+ * - frag+name order;
+ * ((frag value) << 28) | (the nth entry in frag);
+ */
+#define OFFSET_BITS 28
+#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
{
- struct inode *inode = NULL;
+ loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+ if (hash_order)
+ fpos |= HASH_ORDER;
+ return fpos;
+}
- if (!dentry)
- return NULL;
+static bool is_hash_order(loff_t p)
+{
+ return (p & HASH_ORDER) == HASH_ORDER;
+}
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- inode = dentry->d_parent->d_inode;
- ihold(inode);
- }
- spin_unlock(&dentry->d_lock);
- return inode;
+static unsigned fpos_frag(loff_t p)
+{
+ return p >> OFFSET_BITS;
+}
+
+static unsigned fpos_hash(loff_t p)
+{
+ return ceph_frag_value(fpos_frag(p));
+}
+
+static unsigned fpos_off(loff_t p)
+{
+ return p & OFFSET_MASK;
}
+static int fpos_cmp(loff_t l, loff_t r)
+{
+ int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+ if (v)
+ return v;
+ return (int)(fpos_off(l) - fpos_off(r));
+}
/*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
*/
-static unsigned fpos_frag(loff_t p)
+static int note_last_dentry(struct ceph_fs_client *fsc,
+ struct ceph_dir_file_info *dfi,
+ const char *name,
+ int len, unsigned next_offset)
{
- return p >> 32;
+ char *buf = kmalloc(len+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ kfree(dfi->last_name);
+ dfi->last_name = buf;
+ memcpy(dfi->last_name, name, len);
+ dfi->last_name[len] = 0;
+ dfi->next_offset = next_offset;
+ doutc(fsc->client, "'%s'\n", dfi->last_name);
+ return 0;
}
-static unsigned fpos_off(loff_t p)
+
+
+static struct dentry *
+__dcache_find_get_entry(struct dentry *parent, u64 idx,
+ struct ceph_readdir_cache_control *cache_ctl)
{
- return p & 0xffffffff;
+ struct inode *dir = d_inode(parent);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
+ struct dentry *dentry;
+ unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
+ loff_t ptr_pos = idx * sizeof(struct dentry *);
+ pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
+
+ if (ptr_pos >= i_size_read(dir))
+ return NULL;
+
+ if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) {
+ ceph_readdir_cache_release(cache_ctl);
+ cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff);
+ if (IS_ERR(cache_ctl->folio)) {
+ cache_ctl->folio = NULL;
+ doutc(cl, " folio %lu not found\n", ptr_pgoff);
+ return ERR_PTR(-EAGAIN);
+ }
+ /* reading/filling the cache are serialized by
+ i_rwsem, no need to use folio lock */
+ folio_unlock(cache_ctl->folio);
+ cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0);
+ }
+
+ cache_ctl->index = idx & idx_mask;
+
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+ /* check i_size again here, because empty directory can be
+ * marked as complete while not holding the i_rwsem. */
+ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
+ dentry = cache_ctl->dentries[cache_ctl->index];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ return dentry ? : ERR_PTR(-EAGAIN);
}
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
- * d_u.d_child when we initially get results back from the MDS, and
+ * d_children when we initially get results back from the MDS, and
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
@@ -111,419 +183,563 @@ static unsigned fpos_off(loff_t p)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ int shared_gen)
{
- struct ceph_file_info *fi = file->private_data;
- struct dentry *parent = file->f_dentry;
- struct inode *dir = parent->d_inode;
- struct list_head *p;
- struct dentry *dentry, *last;
+ struct ceph_dir_file_info *dfi = file->private_data;
+ struct dentry *parent = file->f_path.dentry;
+ struct inode *dir = d_inode(parent);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
+ struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
+ struct ceph_readdir_cache_control cache_ctl = {};
+ u64 idx = 0;
int err = 0;
- /* claim ref on last dentry we returned */
- last = fi->dentry;
- fi->dentry = NULL;
+ doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir),
+ (unsigned)shared_gen, ctx->pos);
+
+ /* search start position */
+ if (ctx->pos > 2) {
+ u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
+ while (count > 0) {
+ u64 step = count >> 1;
+ dentry = __dcache_find_get_entry(parent, idx + step,
+ &cache_ctl);
+ if (!dentry) {
+ /* use linear search */
+ idx = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
+ if (fpos_cmp(di->offset, ctx->pos) < 0) {
+ idx += step + 1;
+ count -= step + 1;
+ } else {
+ count = step;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
- last);
+ doutc(cl, "%p %llx.%llx cache idx %llu\n", dir,
+ ceph_vinop(dir), idx);
+ }
- spin_lock(&parent->d_lock);
- /* start at beginning? */
- if (ctx->pos == 2 || last == NULL ||
- ctx->pos < ceph_dentry(last)->offset) {
- if (list_empty(&parent->d_subdirs))
- goto out_unlock;
- p = parent->d_subdirs.prev;
- dout(" initial p %p/%p\n", p->prev, p->next);
- } else {
- p = last->d_u.d_child.prev;
- }
+ for (;;) {
+ bool emit_dentry = false;
+ dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
+ if (!dentry) {
+ dfi->file_info.flags |= CEPH_F_ATEND;
+ err = 0;
+ break;
+ }
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out;
+ }
-more:
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- while (1) {
- dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
- d_unhashed(dentry) ? "!hashed" : "hashed",
- parent->d_subdirs.prev, parent->d_subdirs.next);
- if (p == &parent->d_subdirs) {
- fi->flags |= CEPH_F_ATEND;
- goto out_unlock;
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (d_unhashed(dentry) ||
+ d_really_is_negative(dentry) ||
+ di->lease_shared_gen != shared_gen ||
+ ((dentry->d_flags & DCACHE_NOKEY_NAME) &&
+ fscrypt_has_encryption_key(dir))) {
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+ if (fpos_cmp(ctx->pos, di->offset) <= 0) {
+ __ceph_dentry_dir_lease_touch(di);
+ emit_dentry = true;
}
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
- ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
- ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- ctx->pos <= di->offset)
- break;
- dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
- dentry->d_name.len, dentry->d_name.name, di->offset,
- ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
- !dentry->d_inode ? " null" : "");
spin_unlock(&dentry->d_lock);
- p = p->prev;
- dentry = list_entry(p, struct dentry, d_u.d_child);
- di = ceph_dentry(dentry);
- }
- dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
+ if (emit_dentry) {
+ doutc(cl, " %llx dentry %p %pd %p\n", di->offset,
+ dentry, dentry, d_inode(dentry));
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
+ d_inode(dentry)->i_mode >> 12)) {
+ dput(dentry);
+ err = 0;
+ break;
+ }
+ ctx->pos++;
- dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
- dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- ctx->pos = di->offset;
- if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12)) {
- if (last) {
- /* remember our position */
- fi->dentry = last;
- fi->next_offset = di->offset;
+ if (last)
+ dput(last);
+ last = dentry;
+ } else {
+ dput(dentry);
}
- dput(dentry);
- return 0;
- }
-
- if (last)
- dput(last);
- last = dentry;
-
- ctx->pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
}
-
- spin_lock(&parent->d_lock);
- p = p->prev; /* advance to next dentry */
- goto more;
-
-out_unlock:
- spin_unlock(&parent->d_lock);
out:
- if (last)
+ ceph_readdir_cache_release(&cache_ctl);
+ if (last) {
+ int ret;
+ di = ceph_dentry(last);
+ ret = note_last_dentry(fsc, dfi, last->d_name.name,
+ last->d_name.len,
+ fpos_off(di->offset) + 1);
+ if (ret < 0)
+ err = ret;
dput(last);
+ /* last_name no longer match cache index */
+ if (dfi->readdir_cache_idx >= 0) {
+ dfi->readdir_cache_idx = -1;
+ dfi->dir_release_count = 0;
+ }
+ }
return err;
}
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
- int len)
+static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
{
- kfree(fi->last_name);
- fi->last_name = kmalloc(len+1, GFP_NOFS);
- if (!fi->last_name)
- return -ENOMEM;
- memcpy(fi->last_name, name, len);
- fi->last_name[len] = 0;
- dout("note_last_dentry '%s'\n", fi->last_name);
- return 0;
+ if (!dfi->last_readdir)
+ return true;
+ if (is_hash_order(pos))
+ return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos));
+ else
+ return dfi->frag != fpos_frag(pos);
}
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
- unsigned frag = fpos_frag(ctx->pos);
- int off = fpos_off(ctx->pos);
+ struct ceph_client *cl = fsc->client;
+ int i;
int err;
- u32 ftype;
+ unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
- dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
- if (fi->flags & CEPH_F_ATEND)
+ doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode,
+ ceph_vinop(inode), file, ctx->pos);
+ if (dfi->file_info.flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
if (ctx->pos == 0) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = atomic_read(&ci->i_release_count);
-
- dout("readdir off 0 -> '.'\n");
- if (!dir_emit(ctx, ".", 1,
- ceph_translate_ino(inode->i_sb, inode->i_ino),
+ doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode,
+ ceph_vinop(inode));
+ if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
ctx->pos = 1;
- off = 1;
}
if (ctx->pos == 1) {
- ino_t ino = parent_ino(file->f_dentry);
- dout("readdir off 1 -> '..'\n");
- if (!dir_emit(ctx, "..", 2,
- ceph_translate_ino(inode->i_sb, ino),
- inode->i_mode >> 12))
+ u64 ino;
+ struct dentry *dentry = file->f_path.dentry;
+
+ spin_lock(&dentry->d_lock);
+ ino = ceph_present_inode(dentry->d_parent->d_inode);
+ spin_unlock(&dentry->d_lock);
+
+ doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode,
+ ceph_vinop(inode));
+ if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
- off = 2;
}
- /* can we use the dcache? */
+ err = ceph_fscrypt_prepare_readdir(inode);
+ if (err < 0)
+ return err;
+
spin_lock(&ci->i_ceph_lock);
- if ((ctx->pos == 2 || fi->dentry) &&
+ /* request Fx cap. if have Fx, we don't need to release Fs cap
+ * for later create/unlink. */
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+ /* can we use the dcache? */
+ if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- __ceph_dir_is_complete(ci) &&
- __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ __ceph_dir_is_complete_ordered(ci) &&
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ int shared_gen = atomic_read(&ci->i_shared_gen);
+
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(file, ctx);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
} else {
spin_unlock(&ci->i_ceph_lock);
}
- if (fi->dentry) {
- err = note_last_dentry(fi, fi->dentry->d_name.name,
- fi->dentry->d_name.len);
- if (err)
- return err;
- dput(fi->dentry);
- fi->dentry = NULL;
- }
/* proceed with a normal readdir */
-
more:
/* do we have the correct frag content buffered? */
- if (fi->frag != frag || fi->last_readdir == NULL) {
+ if (need_send_readdir(dfi, ctx->pos)) {
struct ceph_mds_request *req;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
/* discard old result, if any */
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+ if (is_hash_order(ctx->pos)) {
+ /* fragtree isn't always accurate. choose frag
+ * based on previous reply when possible. */
+ if (frag == (unsigned)-1)
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
+ } else {
+ frag = fpos_frag(ctx->pos);
+ }
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, fi->last_name);
+ doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n",
+ inode, ceph_vinop(inode), frag, dfi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_dentry = dget(file->f_dentry);
+
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
- req->r_direct_hash = ceph_frag_value(frag);
- req->r_direct_is_hash = true;
- req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
- req->r_readdir_offset = fi->next_offset;
+ if (op == CEPH_MDS_OP_READDIR) {
+ req->r_direct_hash = ceph_frag_value(frag);
+ __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ req->r_inode_drop = CEPH_CAP_FILE_EXCL;
+ }
+ if (dfi->last_name) {
+ int len = strlen(dfi->last_name);
+
+ req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ return -ENOMEM;
+ }
+ memcpy(req->r_path2, dfi->last_name, len);
+
+ err = ceph_encode_encrypted_dname(inode, req->r_path2, len);
+ if (err < 0) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ } else if (is_hash_order(ctx->pos)) {
+ req->r_args.readdir.offset_hash =
+ cpu_to_le32(fpos_hash(ctx->pos));
+ }
+
+ req->r_dir_release_cnt = dfi->dir_release_count;
+ req->r_dir_ordered_cnt = dfi->dir_ordered_count;
+ req->r_readdir_cache_idx = dfi->readdir_cache_idx;
+ req->r_readdir_offset = dfi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
+ req->r_args.readdir.flags =
+ cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_dentry = dget(file->f_path.dentry);
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d"
- " on frag %x, end=%d, complete=%d\n", err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete);
+ doutc(cl, "%p %llx.%llx got and parsed readdir result=%d"
+ "on frag %x, end=%d, complete=%d, hash_order=%d\n",
+ inode, ceph_vinop(inode), err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
+
+ rinfo = &req->r_reply_info;
+ if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (!rinfo->hash_order) {
+ dfi->next_offset = req->r_readdir_offset;
+ /* adjust ctx->pos to beginning of frag */
+ ctx->pos = ceph_make_fpos(frag,
+ dfi->next_offset,
+ false);
+ }
+ }
- if (!req->r_did_prepopulate) {
- dout("readdir !did_prepopulate");
+ dfi->frag = frag;
+ dfi->last_readdir = req;
+
+ if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
+ dfi->readdir_cache_idx = req->r_readdir_cache_idx;
+ if (dfi->readdir_cache_idx < 0) {
+ /* preclude from marking dir ordered */
+ dfi->dir_ordered_count = 0;
+ } else if (ceph_frag_is_leftmost(frag) &&
+ dfi->next_offset == 2) {
+ /* note dir version at start of readdir so
+ * we can tell if any dentries get dropped */
+ dfi->dir_release_count = req->r_dir_release_cnt;
+ dfi->dir_ordered_count = req->r_dir_ordered_cnt;
+ }
+ } else {
+ doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode,
+ ceph_vinop(inode));
+ /* disable readdir cache */
+ dfi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
- fi->dir_release_count--;
+ dfi->dir_release_count = 0;
}
/* note next offset and last dentry name */
- fi->offset = fi->next_offset;
- fi->last_readdir = req;
-
- if (req->r_reply_info.dir_end) {
- kfree(fi->last_name);
- fi->last_name = NULL;
- if (ceph_frag_is_rightmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- } else {
- rinfo = &req->r_reply_info;
- err = note_last_dentry(fi,
- rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1]);
- if (err)
+ if (rinfo->dir_nr > 0) {
+ struct ceph_mds_reply_dir_entry *rde =
+ rinfo->dir_entries + (rinfo->dir_nr-1);
+ unsigned next_offset = req->r_reply_info.dir_end ?
+ 2 : (fpos_off(rde->offset) + 1);
+ err = note_last_dentry(fsc, dfi, rde->name,
+ rde->name_len, next_offset);
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
- fi->next_offset += rinfo->dir_nr;
+ }
+ } else if (req->r_reply_info.dir_end) {
+ dfi->next_offset = 2;
+ /* keep last name */
}
}
- rinfo = &fi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
- rinfo->dir_nr, off, fi->offset);
-
- ctx->pos = ceph_make_fpos(frag, off);
- while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
- struct ceph_mds_reply_inode *in =
- rinfo->dir_in[off - fi->offset].in;
- struct ceph_vino vino;
- ino_t ino;
-
- dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
- off, off - fi->offset, rinfo->dir_nr, ctx->pos,
- rinfo->dir_dname_len[off - fi->offset],
- rinfo->dir_dname[off - fi->offset], in);
- BUG_ON(!in);
- ftype = le32_to_cpu(in->mode) >> 12;
- vino.ino = le64_to_cpu(in->ino);
- vino.snap = le64_to_cpu(in->snapid);
- ino = ceph_vino_to_ino(vino);
- if (!dir_emit(ctx,
- rinfo->dir_dname[off - fi->offset],
- rinfo->dir_dname_len[off - fi->offset],
- ceph_translate_ino(inode->i_sb, ino), ftype)) {
- dout("filldir stopping us...\n");
+ rinfo = &dfi->last_readdir->r_reply_info;
+ doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n",
+ inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+
+ i = 0;
+ /* search start position */
+ if (rinfo->dir_nr > 0) {
+ int step, nr = rinfo->dir_nr;
+ while (nr > 0) {
+ step = nr >> 1;
+ if (rinfo->dir_entries[i + step].offset < ctx->pos) {
+ i += step + 1;
+ nr -= step + 1;
+ } else {
+ nr = step;
+ }
+ }
+ }
+ for (; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
+
+ if (rde->offset < ctx->pos) {
+ pr_warn_client(cl,
+ "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n",
+ inode, ceph_vinop(inode), rde->offset, ctx->pos);
+ return -EIO;
+ }
+
+ if (WARN_ON_ONCE(!rde->inode.in))
+ return -EIO;
+
+ ctx->pos = rde->offset;
+ doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode,
+ ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
+
+ if (!dir_emit(ctx, rde->name, rde->name_len,
+ ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+ le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: Here no need to put the 'dfi->last_readdir',
+ * because when dir_emit stops us it's most likely
+ * doesn't have enough memory, etc. So for next readdir
+ * it will continue.
+ */
+ doutc(cl, "filldir stopping us...\n");
return 0;
}
- off++;
+
+ /* Reset the lengths to their original allocated vals */
ctx->pos++;
}
- if (fi->last_name) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
+
+ if (dfi->next_offset > 2) {
+ frag = dfi->frag;
goto more;
}
/* more frags? */
- if (!ceph_frag_is_rightmost(frag)) {
- frag = ceph_frag_next(frag);
- off = 0;
- ctx->pos = ceph_make_fpos(frag, off);
- dout("readdir next frag is %x\n", frag);
+ if (!ceph_frag_is_rightmost(dfi->frag)) {
+ frag = ceph_frag_next(dfi->frag);
+ if (is_hash_order(ctx->pos)) {
+ loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
+ dfi->next_offset, true);
+ if (new_pos > ctx->pos)
+ ctx->pos = new_pos;
+ /* keep last_name */
+ } else {
+ ctx->pos = ceph_make_fpos(frag, dfi->next_offset,
+ false);
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
+ }
+ doutc(cl, "%p %llx.%llx next frag is %x\n", inode,
+ ceph_vinop(inode), frag);
goto more;
}
- fi->flags |= CEPH_F_ATEND;
+ dfi->file_info.flags |= CEPH_F_ATEND;
/*
* if dir_release_count still matches the dir, no dentries
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- spin_lock(&ci->i_ceph_lock);
- if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
- dout(" marking %p complete\n", inode);
- __ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = ctx->pos;
+ if (atomic64_read(&ci->i_release_count) ==
+ dfi->dir_release_count) {
+ spin_lock(&ci->i_ceph_lock);
+ if (dfi->dir_ordered_count ==
+ atomic64_read(&ci->i_ordered_count)) {
+ doutc(cl, " marking %p %llx.%llx complete and ordered\n",
+ inode, ceph_vinop(inode));
+ /* use i_size to track number of entries in
+ * readdir cache */
+ BUG_ON(dfi->readdir_cache_idx < 0);
+ i_size_write(inode, dfi->readdir_cache_idx *
+ sizeof(struct dentry*));
+ } else {
+ doutc(cl, " marking %llx.%llx complete\n",
+ ceph_vinop(inode));
+ }
+ __ceph_dir_set_complete(ci, dfi->dir_release_count,
+ dfi->dir_ordered_count);
+ spin_unlock(&ci->i_ceph_lock);
}
- spin_unlock(&ci->i_ceph_lock);
-
- dout("readdir %p file %p done.\n", inode, file);
+ doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode),
+ file);
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_dir_file_info *dfi)
{
- if (fi->last_readdir) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
- }
- kfree(fi->last_name);
- fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
- if (fi->dentry) {
- dput(fi->dentry);
- fi->dentry = NULL;
- }
- fi->flags &= ~CEPH_F_ATEND;
+ if (dfi->last_readdir) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
+ }
+ kfree(dfi->last_name);
+ dfi->last_name = NULL;
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
+ dfi->next_offset = 2; /* compensate for . and .. */
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
+}
+
+/*
+ * discard buffered readdir content on seekdir(0), or seek to new frag,
+ * or seek prior to current chunk
+ */
+static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
+{
+ struct ceph_mds_reply_info_parsed *rinfo;
+ loff_t chunk_offset;
+ if (new_pos == 0)
+ return true;
+ if (is_hash_order(new_pos)) {
+ /* no need to reset last_name for a forward seek when
+ * dentries are sorted in hash order */
+ } else if (dfi->frag != fpos_frag(new_pos)) {
+ return true;
+ }
+ rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
+ if (!rinfo || !rinfo->dir_nr)
+ return true;
+ chunk_offset = rinfo->dir_entries[0].offset;
+ return new_pos < chunk_offset ||
+ is_hash_order(new_pos) != is_hash_order(chunk_offset);
}
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
- struct ceph_file_info *fi = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
loff_t retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = -EINVAL;
switch (whence) {
- case SEEK_END:
- offset += inode->i_size + 2; /* FIXME */
- break;
case SEEK_CUR:
offset += file->f_pos;
+ break;
case SEEK_SET:
break;
+ case SEEK_END:
+ retval = -EOPNOTSUPP;
+ goto out;
default:
goto out;
}
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset >= 0) {
+ if (need_reset_readdir(dfi, offset)) {
+ doutc(cl, "%p %llx.%llx dropping %p content\n",
+ inode, ceph_vinop(inode), file);
+ reset_readdir(dfi);
+ } else if (is_hash_order(offset) && offset > file->f_pos) {
+ /* for hash offset, we don't know if a forward seek
+ * is within same frag */
+ dfi->dir_release_count = 0;
+ dfi->readdir_cache_idx = -1;
+ }
+
if (offset != file->f_pos) {
file->f_pos = offset;
- file->f_version = 0;
- fi->flags &= ~CEPH_F_ATEND;
+ dfi->file_info.flags &= ~CEPH_F_ATEND;
}
retval = offset;
-
- /*
- * discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk.
- */
- if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
- fpos_off(offset) < fi->offset) {
- dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
- }
-
- /* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
- fi->dir_release_count--;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
/*
* Handle lookups for the hidden .snap directory.
*/
-int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
+struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
+ struct ceph_client *cl = ceph_inode_to_client(parent);
/* .snap dir? */
- if (err == -ENOENT &&
- ceph_snap(parent) == CEPH_NOSNAP &&
- strcmp(dentry->d_name.name,
- fsc->mount_options->snapdir_name) == 0) {
+ if (ceph_snap(parent) == CEPH_NOSNAP &&
+ strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
+ struct dentry *res;
struct inode *inode = ceph_get_snapdir(parent);
- dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
- dentry, dentry->d_name.len, dentry->d_name.name, inode);
- BUG_ON(!d_unhashed(dentry));
- d_add(dentry, inode);
- err = 0;
+
+ res = d_splice_alias(inode, dentry);
+ doutc(cl, "ENOENT on snapdir %p '%pd', linking to "
+ "snapdir %p %llx.%llx. Spliced dentry %p\n",
+ dentry, dentry, inode, ceph_vinop(inode), res);
+ if (res)
+ dentry = res;
}
- return err;
+ return dentry;
}
/*
@@ -540,13 +756,16 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+
if (err == -ENOENT) {
/* no trace? */
err = 0;
if (!req->r_reply_info.head->is_dentry) {
- dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, dentry->d_inode);
- if (dentry->d_inode) {
+ doutc(cl,
+ "ENOENT and no trace, dentry %p inode %llx.%llx\n",
+ dentry, ceph_vinop(d_inode(dentry)));
+ if (d_really_is_positive(dentry)) {
d_drop(dentry);
err = -ENOENT;
} else {
@@ -563,7 +782,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
return dentry;
}
-static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
{
return ceph_ino(inode) == CEPH_INO_ROOT &&
strncmp(dentry->d_name.name, ".ceph", 5) == 0;
@@ -576,39 +795,53 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_request *req;
int op;
+ int mask;
int err;
- dout("lookup %p dentry %p '%.*s'\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir),
+ dentry, dentry);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- err = ceph_init_dentry(dentry);
- if (err < 0)
- return ERR_PTR(err);
+ if (IS_ENCRYPTED(dir)) {
+ bool had_key = fscrypt_has_encryption_key(dir);
+
+ err = fscrypt_prepare_lookup_partial(dir, dentry);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* mark directory as incomplete if it has been unlocked */
+ if (!had_key && fscrypt_has_encryption_key(dir))
+ ceph_dir_clear_complete(dir);
+ }
/* can we conclude ENOENT locally? */
- if (dentry->d_inode == NULL) {
+ if (d_really_is_negative(dentry)) {
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ doutc(cl, " dir %llx.%llx flags are 0x%lx\n",
+ ceph_vinop(dir), ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
+ ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
- (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
+ doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
+ ceph_vinop(dir));
d_add(dentry, NULL);
- di->lease_shared_gen = ci->i_shared_gen;
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
return NULL;
}
spin_unlock(&ci->i_ceph_lock);
@@ -621,14 +854,30 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- /* we only need inode linkage */
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
- req->r_locked_dir = dir;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ ihold(dir);
+ req->r_parent = dir;
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
- err = ceph_handle_snapdir(req, dentry, err);
+ if (err == -ENOENT) {
+ struct dentry *res;
+
+ res = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(res)) {
+ err = PTR_ERR(res);
+ } else {
+ dentry = res;
+ err = 0;
+ }
+ }
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
+ doutc(cl, "result=%p\n", dentry);
return dentry;
}
@@ -644,143 +893,322 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
/*
* We created the item, then did a lookup, and found
* it was already linked to another inode we already
- * had in our cache (and thus got spliced). Link our
- * dentry to that inode, but don't hash it, just in
- * case the VFS wants to dereference it.
+ * had in our cache (and thus got spliced). To not
+ * confuse VFS (especially when inode is a directory),
+ * we don't link our dentry to that inode, return an
+ * error instead.
+ *
+ * This event should be rare and it happens only when
+ * we talk to old MDS. Recent MDS does not send traceless
+ * reply for request that creates new inode.
*/
- BUG_ON(!result->d_inode);
- d_instantiate(dentry, result->d_inode);
- return 0;
+ d_drop(result);
+ return -ESTALE;
}
return PTR_ERR(result);
}
-static int ceph_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
+static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
+ struct ceph_acl_sec_ctx as_ctx = {};
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
- dir, dentry, mode, rdev);
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
+ if (ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n",
+ dir, ceph_vinop(dir), dentry, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ err = PTR_ERR(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
}
+
+ if (S_ISREG(mode) && IS_ENCRYPTED(dir))
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ ihold(dir);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+out_req:
ceph_mdsc_put_request(req);
- if (err)
+out:
+ if (!err)
+ ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ else
d_drop(dentry);
+ ceph_release_acl_sec_ctx(&as_ctx);
return err;
}
-static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int ceph_create(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return ceph_mknod(dir, dentry, mode, 0);
+ return ceph_mknod(idmap, dir, dentry, mode, 0);
}
-static int ceph_symlink(struct inode *dir, struct dentry *dentry,
- const char *dest)
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
+ const char *dest)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ int err;
+ int len = strlen(dest);
+ struct fscrypt_str osd_link = FSTR_INIT(NULL, 0);
+
+ err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX,
+ &osd_link);
+ if (err)
+ goto out;
+
+ err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link);
+ if (err)
+ goto out;
+
+ req->r_path2 = kmalloc(BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ len = base64_encode(osd_link.name, osd_link.len,
+ req->r_path2, false, BASE64_IMAP);
+ req->r_path2[len] = '\0';
+out:
+ fscrypt_fname_free_buffer(&osd_link);
+ return err;
+}
+#else
+static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
+ const char *dest)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const char *dest)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
+ struct ceph_acl_sec_ctx as_ctx = {};
+ umode_t mode = S_IFLNK | 0777;
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
+ if (ceph_quota_is_max_files_exceeded(dir)) {
+ err = -EDQUOT;
+ goto out;
+ }
+
+ doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry,
+ dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
- d_drop(dentry);
- return PTR_ERR(req);
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ err = PTR_ERR(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
+ }
+
+ req->r_parent = dir;
+ ihold(dir);
+
+ if (IS_ENCRYPTED(req->r_new_inode)) {
+ err = prep_encrypted_symlink_target(req, dest);
+ if (err)
+ goto out_req;
+ } else {
+ req->r_path2 = kstrdup(dest, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto out_req;
+ }
}
+
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_path2 = kstrdup(dest, GFP_NOFS);
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+out_req:
ceph_mdsc_put_request(req);
+out:
if (err)
d_drop(dentry);
+ ceph_release_acl_sec_ctx(&as_ctx);
return err;
}
-static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
- int err = -EROFS;
+ struct ceph_acl_sec_ctx as_ctx = {};
+ struct dentry *ret;
+ int err;
int op;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return ERR_PTR(err);
+
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
- dentry->d_name.len, dentry->d_name.name, dentry);
+ doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n",
+ ceph_vinop(dir), dentry, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+ doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n",
+ ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
+ ret = ERR_PTR(-EROFS);
+ goto out;
+ }
+
+ if (op == CEPH_MDS_OP_MKDIR &&
+ ceph_quota_is_max_files_exceeded(dir)) {
+ ret = ERR_PTR(-EDQUOT);
+ goto out;
+ }
+ if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
+ !fscrypt_has_encryption_key(dir)) {
+ ret = ERR_PTR(-ENOKEY);
goto out;
}
+
+
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
- err = PTR_ERR(req);
+ ret = ERR_CAST(req);
goto out;
}
+ mode |= S_IFDIR;
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ ret = ERR_CAST(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
+ }
+
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
+ req->r_parent = dir;
+ ihold(dir);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ if (op == CEPH_MDS_OP_MKDIR)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mkdir.mode = cpu_to_le32(mode);
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
+ if (!err &&
+ !req->r_reply_info.head->is_target &&
+ !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+ ret = ERR_PTR(err);
+out_req:
+ if (!IS_ERR(ret) && req->r_dentry != dentry)
+ /* Some other dentry was spliced in */
+ ret = dget(req->r_dentry);
ceph_mdsc_put_request(req);
out:
- if (err < 0)
+ if (!IS_ERR(ret)) {
+ if (ret)
+ dentry = ret;
+ ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+ } else {
d_drop(dentry);
- return err;
+ }
+ ceph_release_acl_sec_ctx(&as_ctx);
+ return ret;
}
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
+ if (dentry->d_flags & DCACHE_DISCONNECTED)
+ return -EINVAL;
+
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
- dout("link in dir %p old_dentry %p dentry %p\n", dir,
- old_dentry, dentry);
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
+ doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir),
+ old_dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
d_drop(dentry);
@@ -788,40 +1216,118 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry = dget(old_dentry);
+ /*
+ * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we
+ * will just pass the ino# to MDSs.
+ */
+ if (old_dentry->d_flags & DCACHE_DISCONNECTED)
+ req->r_ino2 = ceph_vino(d_inode(old_dentry));
+ req->r_parent = dir;
+ ihold(dir);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_SHARED on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
err = ceph_mdsc_do_request(mdsc, dir, req);
if (err) {
d_drop(dentry);
} else if (!req->r_reply_info.head->is_dentry) {
- ihold(old_dentry->d_inode);
- d_instantiate(dentry, old_dentry->d_inode);
+ ihold(d_inode(old_dentry));
+ d_instantiate(dentry, d_inode(old_dentry));
}
ceph_mdsc_put_request(req);
return err;
}
-/*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
- * looks like the link count will hit 0, drop any other caps (other
- * than PIN) we don't specifically want (due to the file still being
- * open).
- */
-static int drop_caps_for_unlink(struct inode *inode)
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ struct dentry *dentry = req->r_dentry;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn_client(cl,
+ "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags);
+ spin_unlock(&dentry->d_lock);
+
+ synchronize_rcu();
+
+ if (result == -EJUKEBOX)
+ goto out;
+
+ /* If op failed, mark everyone involved for errors */
+ if (result) {
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+
+ /* mark error on parent + clear complete */
+ mapping_set_error(req->r_parent->i_mapping, result);
+ ceph_dir_clear_complete(req->r_parent);
+
+ /* drop the dentry -- we don't know its status */
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ /* mark inode itself for an error (since metadata is bogus) */
+ mapping_set_error(req->r_old_inode->i_mapping, result);
+
+ pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
+ }
+out:
+ iput(req->r_old_inode);
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di;
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
spin_lock(&ci->i_ceph_lock);
- if (inode->i_nlink == 1) {
- drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
- ci->i_ceph_flags |= CEPH_I_NODELAY;
+ if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+ ceph_take_cap_refs(ci, want, false);
+ got = want;
}
spin_unlock(&ci->i_ceph_lock);
- return drop;
+
+ /* If we didn't get anything, return 0 */
+ if (!got)
+ return 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ /*
+ * - We are holding Fx, which implies Fs caps.
+ * - Only support async unlink for primary linkage
+ */
+ if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+ !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+ want = 0;
+ spin_unlock(&dentry->d_lock);
+
+ /* Do we still want what we've got? */
+ if (want == got)
+ return got;
+
+ ceph_put_cap_refs(ci, got);
+ return 0;
}
/*
@@ -829,25 +1335,55 @@ static int drop_caps_for_unlink(struct inode *inode)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
+ struct dentry *dn;
int err = -EROFS;
int op;
+ char *path;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
- dentry->d_name.name, dentry);
+ doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir),
+ dentry);
op = CEPH_MDS_OP_RMSNAP;
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("unlink/rmdir dir %p dn %p inode %p\n",
- dir, dentry, inode);
- op = S_ISDIR(dentry->d_inode->i_mode) ?
+ doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n",
+ ceph_vinop(dir), dentry, ceph_vinop(inode));
+ op = d_is_dir(dentry) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
+
+ dn = d_find_alias(dir);
+ if (!dn) {
+ try_async = false;
+ } else {
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+ ceph_mdsc_free_path_info(&path_info);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ try_async = false;
+ err = 0;
+ }
+ }
+
+retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -855,49 +1391,127 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_locked_dir = dir;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_parent = dir;
+ ihold(dir);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_inode_drop = drop_caps_for_unlink(inode);
- err = ceph_mdsc_do_request(mdsc, dir, req);
- if (!err && !req->r_reply_info.head->is_dentry)
- d_delete(dentry);
+ req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
+
+ if (try_async && op == CEPH_MDS_OP_UNLINK &&
+ (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s",
+ ceph_vinop(dir), dentry,
+ ceph_cap_string(req->r_dir_caps));
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_callback = ceph_async_unlink_cb;
+ req->r_old_inode = d_inode(dentry);
+ ihold(req->r_old_inode);
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+ dentry->d_name.hash);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ /*
+ * We have enough caps, so we assume that the unlink
+ * will succeed. Fix up the target inode and dcache.
+ */
+ drop_nlink(inode);
+ d_delete(dentry);
+ } else {
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
+ }
+ } else {
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ }
+
ceph_mdsc_put_request(req);
out:
return err;
}
-static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
+ int op = CEPH_MDS_OP_RENAME;
int err;
+ if (flags)
+ return -EINVAL;
+
if (ceph_snap(old_dir) != ceph_snap(new_dir))
return -EXDEV;
- if (ceph_snap(old_dir) != CEPH_NOSNAP ||
- ceph_snap(new_dir) != CEPH_NOSNAP)
- return -EROFS;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+ if (ceph_snap(old_dir) != CEPH_NOSNAP) {
+ if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
+ op = CEPH_MDS_OP_RENAMESNAP;
+ else
+ return -EROFS;
+ }
+ /* don't allow cross-quota renames */
+ if ((old_dir != new_dir) &&
+ (!ceph_quota_is_same_realm(old_dir, new_dir)))
+ return -EXDEV;
+
+ err = ceph_wait_on_conflict_unlink(new_dentry);
+ if (err)
+ return err;
+
+ err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+ if (err)
+ return err;
+
+ doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n",
+ ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir),
+ new_dentry);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
- req->r_locked_dir = new_dir;
- req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry_dir = old_dir;
+ req->r_parent = new_dir;
+ ihold(new_dir);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
/* release LINK_RDCACHE on source inode (mds will lock it) */
- req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
- if (new_dentry->d_inode)
- req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+ if (d_really_is_positive(new_dentry)) {
+ req->r_inode_drop =
+ ceph_drop_caps_for_unlink(d_inode(new_dentry));
+ }
err = ceph_mdsc_do_request(mdsc, old_dir, req);
if (!err && !req->r_reply_info.head->is_dentry) {
/*
@@ -905,28 +1519,289 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* do_request, above). If there is no trace, we need
* to do it here.
*/
-
- /* d_move screws up d_subdirs order */
- ceph_dir_clear_complete(new_dir);
-
d_move(old_dentry, new_dentry);
-
- /* ensure target dentry is invalidated, despite
- rehashing bug in vfs_rename_dir */
- ceph_invalidate_dentry_lease(new_dentry);
}
ceph_mdsc_put_request(req);
return err;
}
/*
+ * Move dentry to tail of mdsc->dentry_leases list when lease is updated.
+ * Leases at front of the list will expire first. (Assume all leases have
+ * similar duration)
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "%p %p '%pd'\n", di, dn, dn);
+
+ di->flags |= CEPH_DENTRY_LEASE_LIST;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+
+ spin_lock(&mdsc->dentry_list_lock);
+ list_move_tail(&di->lease_list, &mdsc->dentry_leases);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
+ struct ceph_dentry_info *di)
+{
+ di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED);
+ di->lease_gen = 0;
+ di->time = jiffies;
+ list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases);
+}
+
+/*
+ * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
+ * list if it's not in the list, otherwise set 'referenced' flag.
+ *
+ * Called under dentry->d_lock.
+ */
+void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
+{
+ struct dentry *dn = di->dentry;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset);
+
+ if (!list_empty(&di->lease_list)) {
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ /* don't remove dentry from dentry lease list
+ * if its lease is valid */
+ if (__dentry_lease_is_valid(di))
+ return;
+ } else {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ return;
+ }
+ }
+
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
+ di->flags |= CEPH_DENTRY_REFERENCED;
+ di->flags &= ~CEPH_DENTRY_LEASE_LIST;
+ return;
+ }
+
+ spin_lock(&mdsc->dentry_list_lock);
+ __dentry_dir_lease_touch(mdsc, di);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+static void __dentry_lease_unlist(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_client *mdsc;
+ if (di->flags & CEPH_DENTRY_SHRINK_LIST)
+ return;
+ if (list_empty(&di->lease_list))
+ return;
+
+ mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_del_init(&di->lease_list);
+ spin_unlock(&mdsc->dentry_list_lock);
+}
+
+enum {
+ KEEP = 0,
+ DELETE = 1,
+ TOUCH = 2,
+ STOP = 4,
+};
+
+struct ceph_lease_walk_control {
+ bool dir_lease;
+ bool expire_dir_lease;
+ unsigned long nr_to_scan;
+ unsigned long dir_lease_ttl;
+};
+
+static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *);
+static int __dentry_lease_check(const struct dentry *);
+
+static unsigned long
+__dentry_leases_walk(struct ceph_mds_client *mdsc,
+ struct ceph_lease_walk_control *lwc)
+{
+ struct ceph_dentry_info *di, *tmp;
+ struct dentry *dentry, *last = NULL;
+ struct list_head* list;
+ LIST_HEAD(dispose);
+ unsigned long freed = 0;
+ int ret = 0;
+
+ list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
+ spin_lock(&mdsc->dentry_list_lock);
+ list_for_each_entry_safe(di, tmp, list, lease_list) {
+ if (!lwc->nr_to_scan)
+ break;
+ --lwc->nr_to_scan;
+
+ dentry = di->dentry;
+ if (last == dentry)
+ break;
+
+ if (!spin_trylock(&dentry->d_lock))
+ continue;
+
+ if (__lockref_is_dead(&dentry->d_lockref)) {
+ list_del_init(&di->lease_list);
+ goto next;
+ }
+
+ if (lwc->dir_lease)
+ ret = __dir_lease_check(dentry, lwc);
+ else
+ ret = __dentry_lease_check(dentry);
+ if (ret & TOUCH) {
+ /* move it into tail of dir lease list */
+ __dentry_dir_lease_touch(mdsc, di);
+ if (!last)
+ last = dentry;
+ }
+ if (ret & DELETE) {
+ /* stale lease */
+ di->flags &= ~CEPH_DENTRY_REFERENCED;
+ if (dentry->d_lockref.count > 0) {
+ /* update_dentry_lease() will re-add
+ * it to lease list, or
+ * ceph_d_delete() will return 1 when
+ * last reference is dropped */
+ list_del_init(&di->lease_list);
+ } else {
+ di->flags |= CEPH_DENTRY_SHRINK_LIST;
+ list_move_tail(&di->lease_list, &dispose);
+ dget_dlock(dentry);
+ }
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ if (ret & STOP)
+ break;
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+
+ while (!list_empty(&dispose)) {
+ di = list_first_entry(&dispose, struct ceph_dentry_info,
+ lease_list);
+ dentry = di->dentry;
+ spin_lock(&dentry->d_lock);
+
+ list_del_init(&di->lease_list);
+ di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
+ if (di->flags & CEPH_DENTRY_REFERENCED) {
+ spin_lock(&mdsc->dentry_list_lock);
+ if (di->flags & CEPH_DENTRY_LEASE_LIST) {
+ list_add_tail(&di->lease_list,
+ &mdsc->dentry_leases);
+ } else {
+ __dentry_dir_lease_touch(mdsc, di);
+ }
+ spin_unlock(&mdsc->dentry_list_lock);
+ } else {
+ freed++;
+ }
+
+ spin_unlock(&dentry->d_lock);
+ /* ceph_d_delete() does the trick */
+ dput(dentry);
+ }
+ return freed;
+}
+
+static int __dentry_lease_check(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int ret;
+
+ if (__dentry_lease_is_valid(di))
+ return STOP;
+ ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0)
+ return TOUCH;
+ return DELETE;
+}
+
+static int __dir_lease_check(const struct dentry *dentry,
+ struct ceph_lease_walk_control *lwc)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ int ret = __dir_lease_try_check(dentry);
+ if (ret == -EBUSY)
+ return KEEP;
+ if (ret > 0) {
+ if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
+ return STOP;
+ /* Move dentry to tail of dir lease list if we don't want
+ * to delete it. So dentries in the list are checked in a
+ * round robin manner */
+ if (!lwc->expire_dir_lease)
+ return TOUCH;
+ if (dentry->d_lockref.count > 0 ||
+ (di->flags & CEPH_DENTRY_REFERENCED))
+ return TOUCH;
+ /* invalidate dir lease */
+ di->lease_shared_gen = 0;
+ }
+ return DELETE;
+}
+
+int ceph_trim_dentries(struct ceph_mds_client *mdsc)
+{
+ struct ceph_lease_walk_control lwc;
+ unsigned long count;
+ unsigned long freed;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_use_max > 0 &&
+ mdsc->caps_use_count > mdsc->caps_use_max)
+ count = mdsc->caps_use_count - mdsc->caps_use_max;
+ else
+ count = 0;
+ spin_unlock(&mdsc->caps_list_lock);
+
+ lwc.dir_lease = false;
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
+ freed = __dentry_leases_walk(mdsc, &lwc);
+ if (!lwc.nr_to_scan) /* more invalid leases */
+ return -EAGAIN;
+
+ if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
+ lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
+
+ lwc.dir_lease = true;
+ lwc.expire_dir_lease = freed < count;
+ lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
+ freed +=__dentry_leases_walk(mdsc, &lwc);
+ if (!lwc.nr_to_scan) /* more to check */
+ return -EAGAIN;
+
+ return freed > 0 ? 1 : 0;
+}
+
+/*
* Ensure a dentry lease will no longer revalidate.
*/
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
- ceph_dentry(dentry)->lease_shared_gen = 0;
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+ __dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
@@ -934,35 +1809,55 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
-static int dentry_lease_is_valid(struct dentry *dentry)
+static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
+{
+ struct ceph_mds_session *session;
+
+ if (!di->lease_gen)
+ return false;
+
+ session = di->lease_session;
+ if (session) {
+ u32 gen;
+ unsigned long ttl;
+
+ gen = atomic_read(&session->s_cap_gen);
+ ttl = session->s_cap_ttl;
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, ttl) &&
+ time_before(jiffies, di->time))
+ return true;
+ }
+ di->lease_gen = 0;
+ return false;
+}
+
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
struct ceph_dentry_info *di;
- struct ceph_mds_session *s;
- int valid = 0;
- u32 gen;
- unsigned long ttl;
struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u32 seq = 0;
+ int valid = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di->lease_session) {
- s = di->lease_session;
- spin_lock(&s->s_gen_ttl_lock);
- gen = s->s_cap_gen;
- ttl = s->s_cap_ttl;
- spin_unlock(&s->s_gen_ttl_lock);
+ if (di && __dentry_lease_is_valid(di)) {
+ valid = 1;
- if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
- time_before(jiffies, ttl)) {
- valid = 1;
- if (di->lease_renew_after &&
- time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = dentry->d_parent->d_inode;
- session = ceph_get_mds_session(s);
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(di->lease_session);
seq = di->lease_seq;
di->lease_renew_after = 0;
di->lease_renew_from = jiffies;
@@ -972,95 +1867,221 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_unlock(&dentry->d_lock);
if (session) {
- ceph_mdsc_lease_send_msg(session, dir, dentry,
+ ceph_mdsc_lease_send_msg(session, dentry,
CEPH_MDS_LEASE_RENEW, seq);
ceph_put_mds_session(session);
}
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ doutc(cl, "dentry %p = %d\n", dentry, valid);
return valid;
}
/*
- * Check if directory-wide content lease/cap is valid.
+ * Called under dentry->d_lock.
*/
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int __dir_lease_try_check(const struct dentry *dentry)
{
- struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *dir;
+ struct ceph_inode_info *ci;
int valid = 0;
+ if (!di->lease_shared_gen)
+ return 0;
+ if (IS_ROOT(dentry))
+ return 0;
+
+ dir = d_inode(dentry->d_parent);
+ ci = ceph_inode(dir);
+
+ if (spin_trylock(&ci->i_ceph_lock)) {
+ if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0))
+ valid = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ } else {
+ valid = -EBUSY;
+ }
+
+ if (!valid)
+ di->lease_shared_gen = 0;
+ return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_client *cl = mdsc->fsc->client;
+ int valid;
+ int shared_gen;
+
spin_lock(&ci->i_ceph_lock);
- if (ci->i_shared_gen == di->lease_shared_gen)
- valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ if (valid) {
+ __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+ shared_gen = atomic_read(&ci->i_shared_gen);
+ }
spin_unlock(&ci->i_ceph_lock);
- dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
- dir, (unsigned)ci->i_shared_gen, dentry,
- (unsigned)di->lease_shared_gen, valid);
+ if (valid) {
+ struct ceph_dentry_info *di;
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (dir == d_inode(dentry->d_parent) &&
+ di && di->lease_shared_gen == shared_gen)
+ __ceph_dentry_dir_lease_touch(di);
+ else
+ valid = 0;
+ spin_unlock(&dentry->d_lock);
+ }
+ doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir,
+ ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen),
+ dentry, dentry, valid);
return valid;
}
/*
* Check if cached dentry can be trusted.
*/
-static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
+static int ceph_d_revalidate(struct inode *dir, const struct qstr *name,
+ struct dentry *dentry, unsigned int flags)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int valid = 0;
- struct inode *dir;
+ struct inode *inode;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
+ valid = fscrypt_d_revalidate(dir, name, dentry, flags);
+ if (valid <= 0)
+ return valid;
- dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
- ceph_dentry(dentry)->offset);
+ inode = d_inode_rcu(dentry);
- dir = ceph_get_dentry_parent_inode(dentry);
+ doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n",
+ dentry, dentry, inode, ceph_dentry(dentry)->offset,
+ !!(dentry->d_flags & DCACHE_NOKEY_NAME));
+
+ mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc;
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- valid = 1;
- } else if (dentry->d_inode &&
- ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+ doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, inode);
valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
+ } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
+ if (inode)
+ valid = ceph_is_any_caps(inode);
+ else
+ valid = 1;
+ }
}
- dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid)
- ceph_dentry_lru_touch(dentry);
- else
- d_drop(dentry);
- iput(dir);
+ if (!valid) {
+ struct ceph_mds_request *req;
+ int op, err;
+ u32 mask;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ percpu_counter_inc(&mdsc->metric.d_lease_mis);
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (!IS_ERR(req)) {
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_parent = dir;
+ ihold(dir);
+
+ req->r_dname = name;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ switch (err) {
+ case 0:
+ if (d_really_is_positive(dentry) &&
+ d_inode(dentry) == req->r_target_inode)
+ valid = 1;
+ break;
+ case -ENOENT:
+ if (d_really_is_negative(dentry))
+ valid = 1;
+ fallthrough;
+ default:
+ break;
+ }
+ ceph_mdsc_put_request(req);
+ doutc(cl, "%p '%pd', lookup result=%d\n", dentry,
+ dentry, err);
+ }
+ } else {
+ percpu_counter_inc(&mdsc->metric.d_lease_hit);
+ }
+
+ doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid");
+ if (!valid)
+ ceph_dir_clear_complete(dir);
return valid;
}
/*
+ * Delete unused dentry that doesn't have valid lease
+ *
+ * Called under dentry->d_lock.
+ */
+static int ceph_d_delete(const struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ /* won't release caps */
+ if (d_really_is_negative(dentry))
+ return 0;
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
+ return 0;
+ /* valid lease? */
+ di = ceph_dentry(dentry);
+ if (di) {
+ if (__dentry_lease_is_valid(di))
+ return 0;
+ if (__dir_lease_try_check(dentry))
+ return 0;
+ }
+ return 1;
+}
+
+/*
* Release our ceph_dentry_info.
*/
static void ceph_d_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
- dout("d_release %p\n", dentry);
- ceph_dentry_lru_del(dentry);
- if (di->lease_session)
- ceph_put_mds_session(di->lease_session);
- kmem_cache_free(ceph_dentry_cachep, di);
+ doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry);
+
+ atomic64_dec(&fsc->mdsc->metric.total_dentries);
+
+ spin_lock(&dentry->d_lock);
+ __dentry_lease_unlist(di);
dentry->d_fsdata = NULL;
-}
+ spin_unlock(&dentry->d_lock);
-static int ceph_snapdir_d_revalidate(struct dentry *dentry,
- unsigned int flags)
-{
- /*
- * Eventually, we'll want to revalidate snapped metadata
- * too... probably...
- */
- return 1;
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
}
/*
@@ -1071,21 +2092,39 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
*/
static void ceph_d_prune(struct dentry *dentry)
{
- dout("ceph_d_prune %p\n", dentry);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_inode_info *dir_ci;
+ struct ceph_dentry_info *di;
+
+ doutc(cl, "dentry %p '%pd'\n", dentry, dentry);
/* do we have a valid parent? */
if (IS_ROOT(dentry))
return;
- /* if we are not hashed, we don't affect dir's completeness */
- if (d_unhashed(dentry))
+ /* we hold d_lock, so d_parent is stable */
+ dir_ci = ceph_inode(d_inode(dentry->d_parent));
+ if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
return;
- /*
- * we hold d_lock, so d_parent is stable, and d_fsdata is never
- * cleared until d_release
- */
- ceph_dir_clear_complete(dentry->d_parent->d_inode);
+ /* who calls d_delete() should also disable dcache readdir */
+ if (d_really_is_negative(dentry))
+ return;
+
+ /* d_fsdata does not get cleared until d_release */
+ if (!d_unhashed(dentry)) {
+ __ceph_dir_clear_complete(dir_ci);
+ return;
+ }
+
+ /* Disable dcache readdir just in case that someone called d_drop()
+ * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
+ * properly (dcache readdir is still enabled) */
+ di = ceph_dentry(dentry);
+ if (di->offset > 0 &&
+ di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen))
+ __ceph_dir_clear_ordered(dir_ci);
}
/*
@@ -1095,21 +2134,21 @@ static void ceph_d_prune(struct dentry *dentry)
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
- struct ceph_file_info *cf = file->private_data;
+ struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
const int bufsize = 1024;
- if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT))
return -EISDIR;
- if (!cf->dir_info) {
- cf->dir_info = kmalloc(bufsize, GFP_NOFS);
- if (!cf->dir_info)
+ if (!dfi->dir_info) {
+ dfi->dir_info = kmalloc(bufsize, GFP_KERNEL);
+ if (!dfi->dir_info)
return -ENOMEM;
- cf->dir_info_len =
- snprintf(cf->dir_info, bufsize,
+ dfi->dir_info_len =
+ snprintf(dfi->dir_info, bufsize,
"entries: %20lld\n"
" files: %20lld\n"
" subdirs: %20lld\n"
@@ -1117,7 +2156,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
" rfiles: %20lld\n"
" rsubdirs: %20lld\n"
"rbytes: %20lld\n"
- "rctime: %10ld.%09ld\n",
+ "rctime: %ptSp\n",
ci->i_files + ci->i_subdirs,
ci->i_files,
ci->i_subdirs,
@@ -1125,124 +2164,20 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
ci->i_rfiles,
ci->i_rsubdirs,
ci->i_rbytes,
- (long)ci->i_rctime.tv_sec,
- (long)ci->i_rctime.tv_nsec);
+ &ci->i_rctime);
}
- if (*ppos >= cf->dir_info_len)
+ if (*ppos >= dfi->dir_info_len)
return 0;
- size = min_t(unsigned, size, cf->dir_info_len-*ppos);
- left = copy_to_user(buf, cf->dir_info + *ppos, size);
+ size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
+ left = copy_to_user(buf, dfi->dir_info + *ppos, size);
if (left == size)
return -EFAULT;
*ppos += (size - left);
return size - left;
}
-/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- dout("dir_fsync %p\n", inode);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_entry(head->prev,
- struct ceph_mds_request, r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
-
- dout("dir_fsync %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- if (req->r_timeout) {
- ret = wait_for_completion_timeout(
- &req->r_safe_completion, req->r_timeout);
- if (ret > 0)
- ret = 0;
- else if (ret == 0)
- ret = -EIO; /* timed out */
- } else {
- wait_for_completion(&req->r_safe_completion);
- }
- ceph_mdsc_put_request(req);
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_entry(head->next,
- struct ceph_mds_request, r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-/*
- * We maintain a private dentry LRU.
- *
- * FIXME: this needs to be changed to a per-mds lru to be useful.
- */
-void ceph_dentry_lru_add(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
- dn->d_name.len, dn->d_name.name);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_add_tail(&di->lru, &mdsc->dentry_lru);
- mdsc->num_dentry++;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_touch(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
- dn->d_name.len, dn->d_name.name, di->offset);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_move_tail(&di->lru, &mdsc->dentry_lru);
- spin_unlock(&mdsc->dentry_lru_lock);
-}
-
-void ceph_dentry_lru_del(struct dentry *dn)
-{
- struct ceph_dentry_info *di = ceph_dentry(dn);
- struct ceph_mds_client *mdsc;
-
- dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
- dn->d_name.len, dn->d_name.name);
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
- spin_lock(&mdsc->dentry_lru_lock);
- list_del_init(&di->lru);
- mdsc->num_dentry--;
- spin_unlock(&mdsc->dentry_lru_lock);
-}
/*
* Return name hash for a given dentry. This is dependent on
@@ -1251,6 +2186,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
struct ceph_inode_info *dci = ceph_inode(dir);
+ unsigned hash;
switch (dci->i_dir_layout.dl_dir_hash) {
case 0: /* for backward compat */
@@ -1258,19 +2194,33 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
return dn->d_name.hash;
default:
- return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ spin_lock(&dn->d_lock);
+ hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
dn->d_name.name, dn->d_name.len);
+ spin_unlock(&dn->d_lock);
+ return hash;
}
}
+WRAP_DIR_ITER(ceph_readdir) // FIXME!
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
- .iterate = ceph_readdir,
+ .iterate_shared = shared_ceph_readdir,
.llseek = ceph_dir_llseek,
.open = ceph_open,
.release = ceph_release,
.unlocked_ioctl = ceph_ioctl,
- .fsync = ceph_dir_fsync,
+ .compat_ioctl = compat_ptr_ioctl,
+ .fsync = ceph_fsync,
+ .lock = ceph_lock,
+ .flock = ceph_flock,
+};
+
+const struct file_operations ceph_snapdir_fops = {
+ .iterate_shared = shared_ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
};
const struct inode_operations ceph_dir_iops = {
@@ -1278,10 +2228,9 @@ const struct inode_operations ceph_dir_iops = {
.permission = ceph_permission,
.getattr = ceph_getattr,
.setattr = ceph_setattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+ .get_inode_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
.mkdir = ceph_mkdir,
@@ -1293,18 +2242,19 @@ const struct inode_operations ceph_dir_iops = {
.atomic_open = ceph_atomic_open,
};
-const struct dentry_operations ceph_dentry_ops = {
- .d_revalidate = ceph_d_revalidate,
- .d_release = ceph_d_release,
- .d_prune = ceph_d_prune,
-};
-
-const struct dentry_operations ceph_snapdir_dentry_ops = {
- .d_revalidate = ceph_snapdir_d_revalidate,
- .d_release = ceph_d_release,
+const struct inode_operations ceph_snapdir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .mkdir = ceph_mkdir,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
};
-const struct dentry_operations ceph_snap_dentry_ops = {
+const struct dentry_operations ceph_dentry_ops = {
+ .d_revalidate = ceph_d_revalidate,
+ .d_delete = ceph_d_delete,
.d_release = ceph_d_release,
.d_prune = ceph_d_prune,
+ .d_init = ceph_d_init,
};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..b2f2af104679 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,28 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "super.h"
#include "mds_client.h"
-
-/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
+#include "crypto.h"
/*
* Basic fh
@@ -32,100 +17,145 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
+ * fh for snapped inode
*/
+struct ceph_nfs_snapfh {
+ u64 ino;
+ u64 snapid;
+ u64 parent_ino;
+ u32 hash;
+} __attribute__ ((packed));
+
+#define BYTES_PER_U32 (sizeof(u32))
+#define CEPH_FH_BASIC_SIZE \
+ (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32)
+#define CEPH_FH_WITH_PARENT_SIZE \
+ (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32)
+#define CEPH_FH_SNAPPED_INODE_SIZE \
+ (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32)
+
+static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
+ struct inode *parent_inode)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE;
+ struct ceph_nfs_snapfh *sfh = (void *)rawfh;
+ u64 snapid = ceph_snap(inode);
+ int ret;
+ bool no_parent = true;
+
+ if (*max_len < snap_handle_length) {
+ *max_len = snap_handle_length;
+ ret = FILEID_INVALID;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (snapid != CEPH_SNAPDIR) {
+ struct inode *dir;
+ struct dentry *dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
+
+ rcu_read_lock();
+ dir = d_inode_rcu(dentry->d_parent);
+ if (ceph_snap(dir) != CEPH_SNAPDIR) {
+ sfh->parent_ino = ceph_ino(dir);
+ sfh->hash = ceph_dentry_hash(dir, dentry);
+ no_parent = false;
+ }
+ rcu_read_unlock();
+ dput(dentry);
+ }
+
+ if (no_parent) {
+ if (!S_ISDIR(inode->i_mode))
+ goto out;
+ sfh->parent_ino = sfh->ino;
+ sfh->hash = 0;
+ }
+ sfh->ino = ceph_ino(inode);
+ sfh->snapid = snapid;
+
+ *max_len = snap_handle_length;
+ ret = FILEID_BTRFS_WITH_PARENT;
+out:
+ doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret);
+ return ret;
+}
+
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ static const int handle_length = CEPH_FH_BASIC_SIZE;
+ static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE;
int type;
- struct ceph_nfs_fh *fh = (void *)rawfh;
- struct ceph_nfs_confh *cfh = (void *)rawfh;
- int connected_handle_length = sizeof(*cfh)/4;
- int handle_length = sizeof(*fh)/4;
- struct dentry *dentry;
- struct dentry *parent;
- /* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EINVAL;
+ return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode);
- dentry = d_find_alias(inode);
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
- /* if we found an alias, generate a connectable fh */
- if (*max_len >= connected_handle_length && dentry) {
- dout("encode_fh %p connectable\n", dentry);
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ if (parent_inode) {
+ struct ceph_nfs_confh *cfh = (void *)rawfh;
+ doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode,
+ ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode));
cfh->ino = ceph_ino(inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
- dentry);
+ cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
- type = 2;
- spin_unlock(&dentry->d_lock);
- } else if (*max_len >= handle_length) {
- if (parent_inode) {
- /* nfsd wants connectable */
- *max_len = connected_handle_length;
- type = FILEID_INVALID;
- } else {
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(inode);
- *max_len = handle_length;
- type = 1;
- }
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ struct ceph_nfs_fh *fh = (void *)rawfh;
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- type = FILEID_INVALID;
+ type = FILEID_INO32_GEN;
}
- if (dentry)
- dput(dentry);
return type;
}
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh, int fh_len)
+static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct inode *inode;
- struct dentry *dentry;
struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*fh) / 4)
+ vino.ino = ino;
+ vino.snap = CEPH_NOSNAP;
+
+ if (ceph_vino_is_reserved(vino))
return ERR_PTR(-ESTALE);
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
- vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
+ int mask;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.lookupino.mask = cpu_to_le32(mask);
+
req->r_ino1 = vino;
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -134,144 +164,455 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
ihold(inode);
ceph_mdsc_put_request(req);
if (!inode)
+ return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
+ } else {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
return ERR_PTR(-ESTALE);
+ }
}
+ return inode;
+}
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
+{
+ struct inode *inode = __lookup_inode(sb, ino);
+ if (IS_ERR(inode))
+ return inode;
+ if (inode->i_nlink == 0) {
iput(inode);
- return dentry;
+ return ERR_PTR(-ESTALE);
}
- err = ceph_init_dentry(dentry);
- if (err < 0) {
+ return inode;
+}
+
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+ struct inode *inode = __lookup_inode(sb, ino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err;
+
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ /* We need LINK caps to reliably check i_nlink */
+ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
+ if (err) {
iput(inode);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
- return dentry;
+ /* -ESTALE if inode as been unlinked and no file is open */
+ if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ return d_obtain_alias(inode);
}
-/*
- * convert connectable fh to dentry
- */
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh, int fh_len)
+static struct dentry *__snapfh_to_dentry(struct super_block *sb,
+ struct ceph_nfs_snapfh *sfh,
+ bool want_parent)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_request *req;
struct inode *inode;
- struct dentry *dentry;
struct ceph_vino vino;
+ int mask;
int err;
+ bool unlinked = false;
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ if (want_parent) {
+ vino.ino = sfh->parent_ino;
+ if (sfh->snapid == CEPH_SNAPDIR)
+ vino.snap = CEPH_NOSNAP;
+ else if (sfh->ino == sfh->parent_ino)
+ vino.snap = CEPH_SNAPDIR;
+ else
+ vino.snap = sfh->snapid;
+ } else {
+ vino.ino = sfh->ino;
+ vino.snap = sfh->snapid;
+ }
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-ESTALE);
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
+ if (inode) {
+ if (ceph_inode_is_shutdown(inode)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ return d_obtain_alias(inode);
+ }
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- inode = req->r_target_inode;
- if (inode)
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.lookupino.mask = cpu_to_le32(mask);
+ if (vino.snap < CEPH_NOSNAP) {
+ req->r_args.lookupino.snapid = cpu_to_le64(vino.snap);
+ if (!want_parent && sfh->ino != sfh->parent_ino) {
+ req->r_args.lookupino.parent =
+ cpu_to_le64(sfh->parent_ino);
+ req->r_args.lookupino.hash =
+ cpu_to_le32(sfh->hash);
+ }
+ }
+
+ req->r_ino1 = vino;
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode) {
+ if (vino.snap == CEPH_SNAPDIR) {
+ if (inode->i_nlink == 0)
+ unlinked = true;
+ inode = ceph_get_snapdir(inode);
+ } else if (ceph_snap(inode) == vino.snap) {
ihold(inode);
- ceph_mdsc_put_request(req);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ } else {
+ /* mds does not support lookup snapped inode */
+ inode = ERR_PTR(-EOPNOTSUPP);
+ }
+ } else {
+ inode = ERR_PTR(-ESTALE);
}
+ ceph_mdsc_put_request(req);
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
+ if (want_parent) {
+ doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err);
+ } else {
+ doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
+ vino.snap, sfh->parent_ino, sfh->hash, err);
}
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
+ /* see comments in ceph_get_parent() */
+ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
+}
+
+/*
+ * convert regular fh to dentry
+ */
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+ return __snapfh_to_dentry(sb, sfh, false);
+ }
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / BYTES_PER_U32)
+ return NULL;
+
+ doutc(fsc->client, "%llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct inode *inode;
+ int mask;
+ int err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ if (child) {
+ req->r_inode = d_inode(child);
+ ihold(d_inode(child));
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
+ }
+
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ return d_obtain_alias(inode);
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
- fh_len);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
- fh_len);
+ struct inode *inode = d_inode(child);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct dentry *dn;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ struct inode* dir;
+ bool unlinked = false;
+ /* do not support non-directory */
+ if (!d_is_dir(child)) {
+ dn = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ dir = __lookup_inode(inode->i_sb, ceph_ino(inode));
+ if (IS_ERR(dir)) {
+ dn = ERR_CAST(dir);
+ goto out;
+ }
+ /* There can be multiple paths to access snapped inode.
+ * For simplicity, treat snapdir of head inode as parent */
+ if (ceph_snap(inode) != CEPH_SNAPDIR) {
+ struct inode *snapdir = ceph_get_snapdir(dir);
+ if (dir->i_nlink == 0)
+ unlinked = true;
+ iput(dir);
+ if (IS_ERR(snapdir)) {
+ dn = ERR_CAST(snapdir);
+ goto out;
+ }
+ dir = snapdir;
+ }
+ /* If directory has already been deleted, further get_parent
+ * will fail. Do not mark snapdir dentry as disconnected,
+ * this prevents exportfs from doing further get_parent. */
+ if (unlinked)
+ dn = d_obtain_root(dir);
+ else
+ dn = d_obtain_alias(dir);
+ } else {
+ dn = __get_parent(child->d_sb, child, 0);
+ }
+out:
+ doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode,
+ ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
+ return dn;
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+ struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+ return __snapfh_to_dentry(sb, sfh, true);
+ }
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*cfh) / BYTES_PER_U32)
+ return NULL;
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+ doutc(fsc->client, "%llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (unlikely(dentry == ERR_PTR(-ENOENT)))
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
+static int __get_snap_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct inode *inode = d_inode(child);
+ struct inode *dir = d_inode(parent);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_request *req = NULL;
+ char *last_name = NULL;
+ unsigned next_offset = 2;
+ int err = -EINVAL;
+
+ if (ceph_ino(inode) != ceph_ino(dir))
+ goto out;
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ if (ceph_snap(dir) == CEPH_NOSNAP) {
+ /*
+ * .get_name() from struct export_operations
+ * assumes that its 'name' parameter is pointing
+ * to a NAME_MAX+1 sized buffer
+ */
+ strscpy(name, fsc->mount_options->snapdir_name,
+ NAME_MAX + 1);
+ err = 0;
+ }
+ goto out;
}
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ if (ceph_snap(dir) != CEPH_SNAPDIR)
+ goto out;
+
+ while (1) {
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_dir_entry *rde;
+ int i;
+
+ req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP,
+ USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ req = NULL;
+ goto out;
+ }
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err)
+ goto out;
+
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_readdir_offset = next_offset;
+ req->r_args.readdir.flags =
+ cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+ if (last_name) {
+ req->r_path2 = last_name;
+ last_name = NULL;
+ }
+
+ req->r_inode = dir;
+ ihold(dir);
+ req->r_dentry = dget(parent);
+
+ inode_lock(dir);
+ err = ceph_mdsc_do_request(fsc->mdsc, NULL, req);
+ inode_unlock(dir);
+
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ rde = rinfo->dir_entries + i;
+ BUG_ON(!rde->inode.in);
+ if (ceph_snap(inode) ==
+ le64_to_cpu(rde->inode.in->snapid)) {
+ memcpy(name, rde->name, rde->name_len);
+ name[rde->name_len] = '\0';
+ err = 0;
+ goto out;
+ }
+ }
+
+ if (rinfo->dir_end)
+ break;
+
+ BUG_ON(rinfo->dir_nr <= 0);
+ rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
+ next_offset += rinfo->dir_nr;
+ last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
+ if (!last_name) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ceph_mdsc_put_request(req);
+ req = NULL;
}
- dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+ err = -ENOENT;
+out:
+ if (req)
+ ceph_mdsc_put_request(req);
+ kfree(last_name);
+ doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child,
+ inode, ceph_vinop(inode), err);
+ return err;
+}
+
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ struct inode *dir = d_inode(parent);
+ struct inode *inode = d_inode(child);
+ struct ceph_mds_reply_info_parsed *rinfo;
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return __get_snap_name(parent, name, child);
+
+ mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ inode_lock(dir);
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_ino2 = ceph_vino(d_inode(parent));
+ req->r_parent = dir;
+ ihold(dir);
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode_unlock(dir);
+
+ if (err)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ if (!IS_ENCRYPTED(dir)) {
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ } else {
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname = { .dir = dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+ .ctext_len = rinfo->altname_len };
+
+ err = ceph_fname_alloc_buffer(dir, &oname);
+ if (err < 0)
+ goto out;
+
+ err = ceph_fname_to_usr(&fname, NULL, &oname, NULL);
+ if (!err) {
+ memcpy(name, oname.name, oname.len);
+ name[oname.len] = 0;
+ }
+ ceph_fname_free_buffer(dir, &oname);
+ }
+out:
+ doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n",
+ child, inode, ceph_vinop(inode), err, err ? "" : "name ",
+ err ? "" : name);
+ ceph_mdsc_put_request(req);
+ return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2ddf061c1c4a..983390069f73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,4 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/striper.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -7,10 +9,51 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
+#include <linux/falloc.h>
+#include <linux/iversion.h>
+#include <linux/ktime.h>
+#include <linux/splice.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include "io.h"
+#include "metric.h"
+
+static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ u32 wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+
+ flags &= ~O_ACCMODE;
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ if (flags)
+ doutc(cl, "unused open flags: %x\n", flags);
+
+ return cpu_to_le32(wire_flags);
+}
/*
* Ceph file operations
@@ -33,6 +76,101 @@
* need to wait for MDS acknowledgement.
*/
+/*
+ * How many pages to get in one call to iov_iter_get_pages(). This
+ * determines the size of the on-stack array used as a buffer.
+ */
+#define ITER_GET_BVECS_PAGES 64
+
+static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec *bvecs)
+{
+ size_t size = 0;
+ int bvec_idx = 0;
+
+ if (maxsize > iov_iter_count(iter))
+ maxsize = iov_iter_count(iter);
+
+ while (size < maxsize) {
+ struct page *pages[ITER_GET_BVECS_PAGES];
+ ssize_t bytes;
+ size_t start;
+ int idx = 0;
+
+ bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
+ ITER_GET_BVECS_PAGES, &start);
+ if (bytes < 0)
+ return size ?: bytes;
+
+ size += bytes;
+
+ for ( ; bytes; idx++, bvec_idx++) {
+ int len = min_t(int, bytes, PAGE_SIZE - start);
+
+ bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start);
+ bytes -= len;
+ start = 0;
+ }
+ }
+
+ return size;
+}
+
+/*
+ * iov_iter_get_pages() only considers one iov_iter segment, no matter
+ * what maxsize or maxpages are given. For ITER_BVEC that is a single
+ * page.
+ *
+ * Attempt to get up to @maxsize bytes worth of pages from @iter.
+ * Return the number of bytes in the created bio_vec array, or an error.
+ */
+static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
+ struct bio_vec **bvecs, int *num_bvecs)
+{
+ struct bio_vec *bv;
+ size_t orig_count = iov_iter_count(iter);
+ ssize_t bytes;
+ int npages;
+
+ iov_iter_truncate(iter, maxsize);
+ npages = iov_iter_npages(iter, INT_MAX);
+ iov_iter_reexpand(iter, orig_count);
+
+ /*
+ * __iter_get_bvecs() may populate only part of the array -- zero it
+ * out.
+ */
+ bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
+ if (!bv)
+ return -ENOMEM;
+
+ bytes = __iter_get_bvecs(iter, maxsize, bv);
+ if (bytes < 0) {
+ /*
+ * No pages were pinned -- just free the array.
+ */
+ kvfree(bv);
+ return bytes;
+ }
+
+ *bvecs = bv;
+ *num_bvecs = npages;
+ return bytes;
+}
+
+static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
+{
+ int i;
+
+ for (i = 0; i < num_bvecs; i++) {
+ if (bvecs[i].bv_page) {
+ if (should_dirty)
+ set_page_dirty_lock(bvecs[i].bv_page);
+ put_page(bvecs[i].bv_page);
+ }
+ }
+ kvfree(bvecs);
+}
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -41,8 +179,7 @@
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -54,51 +191,101 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
}
+static int ceph_init_file_info(struct inode *inode, struct file *file,
+ int fmode, bool isdir)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mount_options *opt =
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_file_info *fi;
+ int ret;
+
+ doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode),
+ file, inode->i_mode, isdir ? "dir" : "regular");
+ BUG_ON(inode->i_fop->release != ceph_release);
+
+ if (isdir) {
+ struct ceph_dir_file_info *dfi =
+ kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
+ if (!dfi)
+ return -ENOMEM;
+
+ file->private_data = dfi;
+ fi = &dfi->file_info;
+ dfi->next_offset = 2;
+ dfi->readdir_cache_idx = -1;
+ } else {
+ fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
+ if (!fi)
+ return -ENOMEM;
+
+ if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ fi->flags |= CEPH_F_SYNC;
+
+ file->private_data = fi;
+ }
+
+ ceph_get_fmode(ci, fmode, 1);
+ fi->fmode = fmode;
+
+ spin_lock_init(&fi->rw_contexts_lock);
+ INIT_LIST_HEAD(&fi->rw_contexts);
+ fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen);
+
+ if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
+}
+
/*
* initialize private struct file data.
* if we fail, clean up by dropping fmode reference on the ceph_inode
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
- struct ceph_file_info *cf;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
+ ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
+ fallthrough;
case S_IFDIR:
- dout("init_file %p %p 0%o (regular)\n", inode, file,
- inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
- if (cf == NULL) {
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
- return -ENOMEM;
- }
- cf->fmode = fmode;
- cf->next_offset = 2;
- file->private_data = cf;
- BUG_ON(inode->i_fop->release != ceph_release);
+ ret = ceph_init_file_info(inode, file, fmode,
+ S_ISDIR(inode->i_mode));
break;
case S_IFLNK:
- dout("init_file %p %p 0%o (symlink)\n", inode, file,
- inode->i_mode);
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
break;
default:
- dout("init_file %p %p 0%o (special)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
/*
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
- ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
@@ -108,6 +295,62 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
}
/*
+ * try renew caps after session gets killed.
+ */
+int ceph_renew_caps(struct inode *inode, int fmode)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_request *req;
+ int err, flags, wanted;
+
+ spin_lock(&ci->i_ceph_lock);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ wanted = __ceph_caps_file_wanted(ci);
+ if (__ceph_is_any_real_caps(ci) &&
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
+ int issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+ doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n",
+ inode, ceph_vinop(inode), ceph_cap_string(wanted),
+ ceph_cap_string(issued));
+ ceph_check_caps(ci, 0);
+ return 0;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ flags = 0;
+ if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
+ flags = O_RDWR;
+ else if (wanted & CEPH_CAP_FILE_RD)
+ flags = O_RDONLY;
+ else if (wanted & CEPH_CAP_FILE_WR)
+ flags = O_WRONLY;
+#ifdef O_LAZY
+ if (wanted & CEPH_CAP_FILE_LAZYIO)
+ flags |= O_LAZY;
+#endif
+
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+out:
+ doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode),
+ err);
+ return err < 0 ? err : 0;
+}
+
+/*
* If we already have the requisite capabilities, we can satisfy
* the open request locally (no need to request new caps from the
* MDS). We do, however, need to inform the MDS (asynchronously)
@@ -116,38 +359,70 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
- struct ceph_file_info *cf = file->private_data;
- struct inode *parent_inode = NULL;
+ struct ceph_file_info *fi = file->private_data;
int err;
int flags, fmode, wanted;
+ struct dentry *dentry;
+ char *path;
+ bool do_sync = false;
+ int mask = MAY_READ;
- if (cf) {
- dout("open file %p is already opened\n", file);
+ if (fi) {
+ doutc(cl, "file %p is already opened\n", file);
return 0;
}
/* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
flags = file->f_flags & ~(O_CREAT|O_EXCL);
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
flags = O_DIRECTORY; /* mds likes to know */
+ } else if (S_ISREG(inode->i_mode)) {
+ err = fscrypt_file_open(inode, file);
+ if (err)
+ return err;
+ }
- dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
- ceph_vinop(inode), file, flags, file->f_flags);
+ doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
fmode = ceph_flags_to_mode(flags);
wanted = ceph_caps_for_mode(fmode);
+ if (fmode & CEPH_FILE_MODE_WR)
+ mask |= MAY_WRITE;
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ do_sync = true;
+ } else {
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+ ceph_mdsc_free_path_info(&path_info);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ do_sync = true;
+ err = 0;
+ }
+ }
+
/* snapped files are read-only */
if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
return -EROFS;
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- spin_lock(&ci->i_ceph_lock);
- __ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
@@ -157,33 +432,34 @@ int ceph_open(struct inode *inode, struct file *file)
* asynchronously.
*/
spin_lock(&ci->i_ceph_lock);
- if (__ceph_is_any_real_caps(ci) &&
+ if (!do_sync && __ceph_is_any_real_caps(ci) &&
(((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
- int mds_wanted = __ceph_caps_mds_wanted(ci);
+ int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
- dout("open %p fmode %d want %s issued %s using existing\n",
- inode, fmode, ceph_cap_string(wanted),
- ceph_cap_string(issued));
- __ceph_get_fmode(ci, fmode);
+ doutc(cl, "open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
if ((issued & wanted) != wanted &&
(mds_wanted & wanted) != wanted &&
ceph_snap(inode) != CEPH_SNAPDIR)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return ceph_init_file(inode, file, fmode);
- } else if (ceph_snap(inode) != CEPH_NOSNAP &&
+ } else if (!do_sync && ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
- __ceph_get_fmode(ci, fmode);
+ __ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
+
spin_unlock(&ci->i_ceph_lock);
- dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -191,67 +467,486 @@ int ceph_open(struct inode *inode, struct file *file)
}
req->r_inode = inode;
ihold(inode);
+
req->r_num_caps = 1;
- if (flags & (O_CREAT|O_TRUNC))
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
- dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+ doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
return err;
}
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+ struct ceph_inode_info *cdst = ceph_inode(dst);
+ struct ceph_inode_info *csrc = ceph_inode(src);
+
+ spin_lock(&cdst->i_ceph_lock);
+ if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+ !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+ memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+ sizeof(cdst->i_cached_layout));
+ rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+ ceph_try_get_string(csrc->i_layout.pool_ns));
+ }
+ spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, and inode number,
+ * and either a lease on the dentry or complete dir info. If any of those
+ * criteria are not satisfied, then return false and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+ struct ceph_file_layout *lo, u64 *pino)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+ u64 ino;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* No auth cap means no chance for Dc caps */
+ if (!ci->i_auth_cap)
+ goto no_async;
+
+ /* Any delegated inos? */
+ if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+ goto no_async;
+
+ if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+ goto no_async;
+
+ if ((__ceph_caps_issued(ci, NULL) & want) != want)
+ goto no_async;
+
+ if (d_in_lookup(dentry)) {
+ if (!__ceph_dir_is_complete(ci))
+ goto no_async;
+ spin_lock(&dentry->d_lock);
+ di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+ spin_unlock(&dentry->d_lock);
+ } else if (atomic_read(&ci->i_shared_gen) !=
+ READ_ONCE(di->lease_shared_gen)) {
+ goto no_async;
+ }
+
+ ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+ if (!ino)
+ goto no_async;
+
+ *pino = ino;
+ ceph_take_cap_refs(ci, want, false);
+ memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+ rcu_assign_pointer(lo->pool_ns,
+ ceph_try_get_string(ci->i_cached_layout.pool_ns));
+ got = want;
+no_async:
+ spin_unlock(&ci->i_ceph_lock);
+ return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_session *s = NULL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ s = ceph_get_mds_session(ci->i_auth_cap->session);
+ spin_unlock(&ci->i_ceph_lock);
+ if (s) {
+ int err = ceph_restore_deleg_ino(s, ino);
+ if (err)
+ pr_warn_client(cl,
+ "unable to restore delegated ino 0x%llx to session: %d\n",
+ ino, err);
+ ceph_put_mds_session(s);
+ }
+}
+
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool check_cap = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ clear_and_wake_up_bit(CEPH_ASYNC_CREATE_BIT, &ci->i_ceph_flags);
+
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
+ check_cap = true;
+ }
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (check_cap)
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
+ int result = req->r_err ? req->r_err :
+ le32_to_cpu(req->r_reply_info.head->result);
+
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
+ if (result == -EJUKEBOX)
+ goto out;
+
+ mapping_set_error(req->r_parent->i_mapping, result);
+
+ if (result) {
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
+
+ pr_warn_client(cl,
+ "async create failure path=(%llx)%s result=%d!\n",
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
+
+ ceph_dir_clear_complete(req->r_parent);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
+ }
+
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
+
+ if (req->r_deleg_ino != ino)
+ pr_warn_client(cl,
+ "inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ req->r_err, req->r_deleg_ino, ino);
+
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
+ } else if (!result) {
+ pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n",
+ req->r_deleg_ino);
+ }
+out:
+ ceph_mdsc_release_dir_caps(req);
+}
+
+static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
+ struct dentry *dentry,
+ struct file *file, umode_t mode,
+ struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx,
+ struct ceph_file_layout *lo)
+{
+ int ret;
+ char xattr_buf[4];
+ struct ceph_mds_reply_inode in = { };
+ struct ceph_mds_reply_info_in iinfo = { .in = &in };
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct timespec64 now;
+ struct ceph_string *pool_ns;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+
+ ktime_get_real_ts64(&now);
+
+ iinfo.inline_version = CEPH_INLINE_NONE;
+ iinfo.change_attr = 1;
+ ceph_encode_timespec64(&iinfo.btime, &now);
+
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
+
+ in.ino = cpu_to_le64(vino.ino);
+ in.snapid = cpu_to_le64(CEPH_NOSNAP);
+ in.version = cpu_to_le64(1); // ???
+ in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+ in.cap.cap_id = cpu_to_le64(1);
+ in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+ in.cap.flags = CEPH_CAP_FLAG_AUTH;
+ in.ctime = in.mtime = in.atime = iinfo.btime;
+ in.truncate_seq = cpu_to_le32(1);
+ in.truncate_size = cpu_to_le64(-1ULL);
+ in.xattr_version = cpu_to_le64(1);
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns,
+ mapped_fsuid(req->r_mnt_idmap,
+ &init_user_ns)));
+ if (dir->i_mode & S_ISGID) {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
+
+ /* Directories always inherit the setgid bit. */
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns,
+ mapped_fsgid(req->r_mnt_idmap,
+ &init_user_ns)));
+ }
+ in.mode = cpu_to_le32((u32)mode);
+
+ in.nlink = cpu_to_le32(1);
+ in.max_size = cpu_to_le64(lo->stripe_unit);
+
+ ceph_file_layout_to_legacy(lo, &in.layout);
+ /* lo is private, so pool_ns can't change */
+ pool_ns = rcu_dereference_raw(lo->pool_ns);
+ if (pool_ns) {
+ iinfo.pool_ns_len = pool_ns->len;
+ iinfo.pool_ns_data = pool_ns->str;
+ }
+
+ down_read(&mdsc->snap_rwsem);
+ ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+ req->r_fmode, NULL);
+ up_read(&mdsc->snap_rwsem);
+ if (ret) {
+ doutc(cl, "failed to fill inode: %d\n", ret);
+ ceph_dir_clear_complete(dir);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ discard_new_inode(inode);
+ } else {
+ struct dentry *dn;
+
+ doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n",
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
+ ceph_dir_clear_ordered(dir);
+ ceph_init_inode_acls(inode, as_ctx);
+ if (inode_state_read_once(inode) & I_NEW) {
+ /*
+ * If it's not I_NEW, then someone created this before
+ * we got here. Assume the server is aware of it at
+ * that point and don't worry about setting
+ * CEPH_I_ASYNC_CREATE.
+ */
+ ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+ unlock_new_inode(inode);
+ }
+ if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
+ dn = d_splice_alias(inode, dentry);
+ WARN_ON_ONCE(dn && dn != dentry);
+ }
+ file->f_mode |= FMODE_CREATED;
+ ret = finish_open(file, dentry, ceph_open);
+ }
+
+ spin_lock(&dentry->d_lock);
+ clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_CREATE_BIT, &di->flags);
+ spin_unlock(&dentry->d_lock);
+
+ return ret;
+}
/*
* Do a lookup + open with a single request. If we get a non-existent
* file or symlink, return 1 so the VFS can retry.
*/
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t mode,
- int *opened)
+ struct file *file, unsigned flags, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ struct inode *new_inode = NULL;
struct dentry *dn;
+ struct ceph_acl_sec_ctx as_ctx = {};
+ bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
+ int mask;
int err;
+ char *path;
- dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
- dir, dentry, dentry->d_name.len, dentry->d_name.name,
- d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+ doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, ceph_vinop(dir), dentry, dentry,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
- err = ceph_init_dentry(dentry);
- if (err < 0)
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
return err;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
+ dn = d_find_alias(dir);
+ if (!dn) {
+ try_async = false;
+ } else {
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
+ if (IS_ERR(path)) {
+ try_async = false;
+ err = 0;
+ } else {
+ int fmode = ceph_flags_to_mode(flags);
+
+ mask = MAY_READ;
+ if (fmode & CEPH_FILE_MODE_WR)
+ mask |= MAY_WRITE;
+ err = ceph_mds_check_access(mdsc, path, mask);
+ }
+ ceph_mdsc_free_path_info(&path_info);
+ dput(dn);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ try_async = false;
+ err = 0;
+ }
+ }
+
+retry:
+ if (flags & O_CREAT) {
+ if (ceph_quota_is_max_files_exceeded(dir))
+ return -EDQUOT;
+
+ new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(new_inode)) {
+ err = PTR_ERR(new_inode);
+ goto out_ctx;
+ }
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
+ } else if (!d_in_lookup(dentry)) {
+ /* If it's not being looked up, it's negative */
+ return -ENOENT;
+ }
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out_ctx;
+ }
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+ req->r_parent = dir;
+ if (req->r_op == CEPH_MDS_OP_CREATE)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
+ ihold(dir);
+ if (IS_ENCRYPTED(dir)) {
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ err = fscrypt_prepare_lookup_partial(dir, dentry);
+ if (err < 0)
+ goto out_req;
+ }
+
if (flags & O_CREAT) {
- req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ struct ceph_file_layout lo;
+
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
+ if (try_async && (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+ req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+ req->r_callback = ceph_async_create_cb;
+
+ /* Hash inode before RPC */
+ new_inode = ceph_get_inode(dir->i_sb, vino, new_inode);
+ if (IS_ERR(new_inode)) {
+ err = PTR_ERR(new_inode);
+ new_inode = NULL;
+ goto out_req;
+ }
+ WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW));
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_CREATE;
+ spin_unlock(&dentry->d_lock);
+
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err) {
+ err = ceph_finish_async_create(dir, new_inode,
+ dentry, file,
+ mode, req,
+ &as_ctx, &lo);
+ new_inode = NULL;
+ } else if (err == -EJUKEBOX) {
+ restore_deleg_ino(dir, req->r_deleg_ino);
+ ceph_mdsc_put_request(req);
+ discard_new_inode(new_inode);
+ ceph_release_acl_sec_ctx(&as_ctx);
+ memset(&as_ctx, 0, sizeof(as_ctx));
+ new_inode = NULL;
+ try_async = false;
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
+ goto retry;
+ }
+ ceph_put_string(rcu_dereference_raw(lo.pool_ns));
+ goto out_req;
+ }
}
- req->r_locked_dir = dir; /* caller holds dir->i_mutex */
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
- if (err)
- goto out_err;
- err = ceph_handle_snapdir(req, dentry, err);
- if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_new_inode = new_inode;
+ new_inode = NULL;
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
+ if (err == -ENOENT) {
+ dentry = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_req;
+ }
+ err = 0;
+ }
+
+ if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
- if (d_unhashed(dentry)) {
+ if (d_in_lookup(dentry)) {
dn = ceph_finish_lookup(req, dentry, err);
if (IS_ERR(dn))
err = PTR_ERR(dn);
@@ -260,365 +955,1140 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
dn = NULL;
}
if (err)
- goto out_err;
- if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
+ goto out_req;
+ if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
- dout("atomic_open finish_no_open on dn %p\n", dn);
+ doutc(cl, "finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
- dout("atomic_open finish_open on dn %p\n", dn);
+ if (IS_ENCRYPTED(dir) &&
+ !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
+ pr_warn_client(cl,
+ "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
+ ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
+ goto out_req;
+ }
+
+ doutc(cl, "finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- *opened |= FILE_CREATED;
+ struct inode *newino = d_inode(dentry);
+
+ cache_file_layout(dir, newino);
+ ceph_init_inode_acls(newino, &as_ctx);
+ file->f_mode |= FMODE_CREATED;
}
- err = finish_open(file, dentry, ceph_open, opened);
+ err = finish_open(file, dentry, ceph_open);
}
-
-out_err:
+out_req:
ceph_mdsc_put_request(req);
- dout("atomic_open result=%d\n", err);
+ iput(new_inode);
+out_ctx:
+ ceph_release_acl_sec_ctx(&as_ctx);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_release(struct inode *inode, struct file *file)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_file_info *cf = file->private_data;
- dout("release inode %p file %p\n", inode, file);
- ceph_put_fmode(ci, cf->fmode);
- if (cf->last_readdir)
- ceph_mdsc_put_request(cf->last_readdir);
- kfree(cf->last_name);
- kfree(cf->dir_info);
- dput(cf->dentry);
- kmem_cache_free(ceph_file_cachep, cf);
+ if (S_ISDIR(inode->i_mode)) {
+ struct ceph_dir_file_info *dfi = file->private_data;
+ doutc(cl, "%p %llx.%llx dir file %p\n", inode,
+ ceph_vinop(inode), file);
+ WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
+
+ ceph_put_fmode(ci, dfi->file_info.fmode, 1);
+
+ if (dfi->last_readdir)
+ ceph_mdsc_put_request(dfi->last_readdir);
+ kfree(dfi->last_name);
+ kfree(dfi->dir_info);
+ kmem_cache_free(ceph_dir_file_cachep, dfi);
+ } else {
+ struct ceph_file_info *fi = file->private_data;
+ doutc(cl, "%p %llx.%llx regular file %p\n", inode,
+ ceph_vinop(inode), file);
+ WARN_ON(!list_empty(&fi->rw_contexts));
+
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+
+ kmem_cache_free(ceph_file_cachep, fi);
+ }
/* wake up anyone waiting for caps on this inode */
wake_up_all(&ci->i_cap_wq);
return 0;
}
+enum {
+ HAVE_RETRIED = 1,
+ CHECK_EOF = 2,
+ READ_INLINE = 3,
+};
+
/*
- * Read a range of bytes striped over one or more objects. Iterate over
- * objects we stripe over. (That's not atomic, but good enough for now.)
+ * Completely synchronous read and write methods. Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads. (That's not
+ * atomic, but good enough for now.)
*
* If we get a short result from the OSD, check against i_size; we need to
* only return a short read to the caller if we hit EOF.
*/
-static int striped_read(struct inode *inode,
- u64 off, u64 len,
- struct page **pages, int num_pages,
- int *checkeof, bool o_direct,
- unsigned long buf_align)
+ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
+ struct iov_iter *to, int *retry_op,
+ u64 *last_objver)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- u64 pos, this_len;
- int io_align, page_align;
- int left, pages_left;
- int read;
- struct page **page_pos;
- int ret;
- bool hit_stripe, was_short;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ ssize_t ret;
+ u64 off = *ki_pos;
+ u64 len = iov_iter_count(to);
+ u64 i_size = i_size_read(inode);
+ bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+ u64 objver = 0;
+
+ doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode,
+ ceph_vinop(inode), *ki_pos, len);
+ if (ceph_inode_is_shutdown(inode))
+ return -EIO;
+
+ if (!len || !i_size)
+ return 0;
/*
- * we may need to do multiple reads. not atomic, unfortunately.
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and sync io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
*/
- pos = off;
- left = len;
- page_pos = pages;
- pages_left = num_pages;
- read = 0;
- io_align = off & ~PAGE_MASK;
-
-more:
- if (o_direct)
- page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
- else
- page_align = pos & ~PAGE_MASK;
- this_len = left;
- ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, pos, &this_len,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- page_pos, pages_left, page_align);
- if (ret == -ENOENT)
- ret = 0;
- hit_stripe = this_len < left;
- was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
- ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ off, off + len - 1);
+ if (ret < 0)
+ return ret;
- if (ret > 0) {
- int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
+ ret = 0;
+ while ((len = iov_iter_count(to)) > 0) {
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
+ size_t page_off;
+ bool more;
+ int idx = 0;
+ size_t left;
+ struct ceph_osd_req_op *op;
+ u64 read_off = off;
+ u64 read_len = len;
+ int extent_cnt;
+
+ /* determine new offset/length if encrypted */
+ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
+
+ doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len,
+ read_off, read_len);
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, read_off, &read_len, 0, 1,
+ sparse ? CEPH_OSD_OP_SPARSE_READ :
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
- if (read < pos - off) {
- dout(" zero gap %llu to %llu\n", off + read, pos);
- ceph_zero_page_vector_range(page_align + read,
- pos - off - read, pages);
+ /* adjust len downward if the request truncated the len */
+ if (off + len > read_off + read_len)
+ len = read_off + read_len - off;
+ more = len < iov_iter_count(to);
+
+ op = &req->r_ops[0];
+ if (sparse) {
+ extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ break;
+ }
}
- pos += ret;
- read = pos - off;
- left -= ret;
- page_pos += didpages;
- pages_left -= didpages;
- /* hit stripe? */
- if (left && hit_stripe)
- goto more;
- }
+ num_pages = calc_pages_for(read_off, read_len);
+ page_off = offset_in_page(off);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
+ }
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
+ offset_in_page(read_off),
+ false, true);
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_read_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ read_len, ret);
+
+ if (ret > 0)
+ objver = req->r_version;
+
+ i_size = i_size_read(inode);
+ doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len,
+ ret, i_size, (more ? " MORE" : ""));
+
+ /* Fix it to go to end of extent map */
+ if (sparse && ret >= 0)
+ ret = ceph_sparse_ext_map_end(op);
+ else if (ret == -ENOENT)
+ ret = 0;
+
+ if (ret < 0) {
+ ceph_osdc_put_request(req);
+ if (ret == -EBLOCKLISTED)
+ fsc->blocklisted = true;
+ break;
+ }
+
+ if (IS_ENCRYPTED(inode)) {
+ int fret;
+
+ fret = ceph_fscrypt_decrypt_extents(inode, pages,
+ read_off, op->extent.sparse_ext,
+ op->extent.sparse_ext_cnt);
+ if (fret < 0) {
+ ret = fret;
+ ceph_osdc_put_request(req);
+ break;
+ }
+
+ /* account for any partial block at the beginning */
+ fret -= (off - read_off);
+
+ /*
+ * Short read after big offset adjustment?
+ * Nothing is usable, just call it a zero
+ * len read.
+ */
+ fret = max(fret, 0);
+
+ /* account for partial block at the end */
+ ret = min_t(ssize_t, fret, len);
+ }
- if (was_short) {
- /* did we bounce off eof? */
- if (pos + left > inode->i_size)
- *checkeof = 1;
+ /* Short read but not EOF? Zero out the remainder. */
+ if (ret < len && (off + ret < i_size)) {
+ int zlen = min(len - ret, i_size - off - ret);
+ int zoff = page_off + ret;
- /* zero trailing bytes (inside i_size) */
- if (left > 0 && pos < inode->i_size) {
- if (pos + left > inode->i_size)
- left = inode->i_size - pos;
+ doutc(cl, "zero gap %llu~%llu\n", off + ret,
+ off + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
+ }
- dout("zero tail %d\n", left);
- ceph_zero_page_vector_range(page_align + read, left,
- pages);
- read += left;
+ if (off + ret > i_size)
+ left = (i_size > off) ? i_size - off : 0;
+ else
+ left = ret;
+
+ while (left > 0) {
+ size_t plen, copied;
+
+ plen = min_t(size_t, left, PAGE_SIZE - page_off);
+ SetPageUptodate(pages[idx]);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, plen, to);
+ off += copied;
+ left -= copied;
+ page_off = 0;
+ if (copied < plen) {
+ ret = -EFAULT;
+ break;
+ }
}
+
+ ceph_osdc_put_request(req);
+
+ if (off >= i_size || !more)
+ break;
}
- if (ret >= 0)
- ret = read;
- dout("striped_read returns %d\n", ret);
+ if (ret > 0) {
+ if (off >= i_size) {
+ *retry_op = CHECK_EOF;
+ ret = i_size - *ki_pos;
+ *ki_pos = i_size;
+ } else {
+ ret = off - *ki_pos;
+ *ki_pos = off;
+ }
+
+ if (last_objver)
+ *last_objver = objver;
+ }
+ doutc(cl, "result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
-/*
- * Completely synchronous read and write methods. Direct from __user
- * buffer to osd, or directly to user pages (if O_DIRECT).
- *
- * If the read spans object boundary, just do multiple reads.
- */
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
- unsigned len, loff_t *poff, int *checkeof)
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
+ int *retry_op)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct page **pages;
- u64 off = *poff;
- int num_pages, ret;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos,
+ iov_iter_count(to),
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
+}
+
+struct ceph_aio_request {
+ struct kiocb *iocb;
+ size_t total_len;
+ bool write;
+ bool should_dirty;
+ int error;
+ struct list_head osd_reqs;
+ unsigned num_reqs;
+ atomic_t pending_reqs;
+ struct timespec64 mtime;
+ struct ceph_cap_flush *prealloc_cf;
+};
- dout("sync_read on file %p %llu~%u %s\n", file, off, len,
- (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+struct ceph_aio_work {
+ struct work_struct work;
+ struct ceph_osd_request *req;
+};
+
+static void ceph_aio_retry_work(struct work_struct *work);
+
+static void ceph_aio_complete(struct inode *inode,
+ struct ceph_aio_request *aio_req)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ if (!atomic_dec_and_test(&aio_req->pending_reqs))
+ return;
+
+ if (aio_req->iocb->ki_flags & IOCB_DIRECT)
+ inode_dio_end(inode);
+
+ ret = aio_req->error;
+ if (!ret)
+ ret = aio_req->total_len;
+
+ doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret);
+
+ if (ret >= 0 && aio_req->write) {
+ int dirty;
+
+ loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+ if (endoff > i_size_read(inode)) {
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
+ }
+
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &aio_req->prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+
+ }
+
+ ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD));
- if (file->f_flags & O_DIRECT) {
- num_pages = calc_pages_for((unsigned long)data, len);
- pages = ceph_get_direct_page_vector(data, num_pages, true);
+ aio_req->iocb->ki_complete(aio_req->iocb, ret);
+
+ ceph_free_cap_flush(aio_req->prealloc_cf);
+ kfree(aio_req);
+}
+
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
+{
+ int rc = req->r_result;
+ struct inode *inode = req->r_inode;
+ struct ceph_aio_request *aio_req = req->r_priv;
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct ceph_osd_req_op *op = &req->r_ops[0];
+ struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
+ unsigned int len = osd_data->bvec_pos.iter.bi_size;
+ bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
+ BUG_ON(!osd_data->num_bvecs);
+
+ doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req,
+ inode, ceph_vinop(inode), rc, len);
+
+ if (rc == -EOLDSNAPC) {
+ struct ceph_aio_work *aio_work;
+ BUG_ON(!aio_req->write);
+
+ aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+ if (aio_work) {
+ INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+ aio_work->req = req;
+ queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
+ &aio_work->work);
+ return;
+ }
+ rc = -ENOMEM;
+ } else if (!aio_req->write) {
+ if (sparse && rc >= 0)
+ rc = ceph_sparse_ext_map_end(op);
+ if (rc == -ENOENT)
+ rc = 0;
+ if (rc >= 0 && len > rc) {
+ struct iov_iter i;
+ int zlen = len - rc;
+
+ /*
+ * If read is satisfied by single OSD request,
+ * it can pass EOF. Otherwise read is within
+ * i_size.
+ */
+ if (aio_req->num_reqs == 1) {
+ loff_t i_size = i_size_read(inode);
+ loff_t endoff = aio_req->iocb->ki_pos + rc;
+ if (endoff < i_size)
+ zlen = min_t(size_t, zlen,
+ i_size - endoff);
+ aio_req->total_len = rc + zlen;
+ }
+
+ iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs,
+ osd_data->num_bvecs, len);
+ iov_iter_advance(&i, rc);
+ iov_iter_zero(zlen, &i);
+ }
+ }
+
+ /* r_start_latency == 0 means the request was not submitted */
+ if (req->r_start_latency) {
+ if (aio_req->write)
+ ceph_update_write_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+ else
+ ceph_update_read_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, rc);
+ }
+
+ put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
+ aio_req->should_dirty);
+ ceph_osdc_put_request(req);
+
+ if (rc < 0)
+ cmpxchg(&aio_req->error, 0, rc);
+
+ ceph_aio_complete(inode, aio_req);
+ return;
+}
+
+static void ceph_aio_retry_work(struct work_struct *work)
+{
+ struct ceph_aio_work *aio_work =
+ container_of(work, struct ceph_aio_work, work);
+ struct ceph_osd_request *orig_req = aio_work->req;
+ struct ceph_aio_request *aio_req = orig_req->r_priv;
+ struct inode *inode = orig_req->r_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_osd_request *req;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
} else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
}
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ spin_unlock(&ci->i_ceph_lock);
- /*
- * flush any page cache pages in this range. this
- * will make concurrent normal and sync io slow,
- * but it will at least behave sensibly when they are
- * in sequence.
- */
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret < 0)
- goto done;
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
+ false, GFP_NOFS);
+ if (!req) {
+ ret = -ENOMEM;
+ req = orig_req;
+ goto out;
+ }
- ret = striped_read(inode, off, len, pages, num_pages, checkeof,
- file->f_flags & O_DIRECT,
- (unsigned long)data & ~PAGE_MASK);
+ req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+ ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
+ ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
- if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
- if (ret >= 0)
- *poff = off + ret;
+ req->r_ops[0] = orig_req->r_ops[0];
-done:
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, true);
- else
- ceph_release_page_vector(pages, num_pages);
- dout("sync_read result %d\n", ret);
- return ret;
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ req = orig_req;
+ goto out;
+ }
+
+ ceph_osdc_put_request(orig_req);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+
+ ceph_osdc_start_request(req->r_osdc, req);
+out:
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req);
+ }
+
+ ceph_put_snap_context(snapc);
+ kfree(aio_work);
}
-/*
- * Write commit request unsafe callback, called to tell us when a
- * request is unsafe (that is, in flight--has been handed to the
- * messenger to send to its target osd). It is called again when
- * we've received a response message indicating the request is
- * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
- * is completed early (and unsuccessfully) due to a timeout or
- * interrupt.
- *
- * This is used if we requested both an ACK and ONDISK commit reply
- * from the OSD.
- */
-static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
-{
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
-
- dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
- unsafe ? "un" : "");
- if (unsafe) {
- ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
- spin_lock(&ci->i_unsafe_lock);
- list_add_tail(&req->r_unsafe_item,
- &ci->i_unsafe_writes);
- spin_unlock(&ci->i_unsafe_lock);
+static ssize_t
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct ceph_snap_context *snapc,
+ struct ceph_cap_flush **pcf)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_client_metric *metric = &fsc->mdsc->metric;
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct bio_vec *bvecs;
+ struct ceph_aio_request *aio_req = NULL;
+ int num_pages = 0;
+ int flags;
+ int ret = 0;
+ struct timespec64 mtime = current_time(inode);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos;
+ bool write = iov_iter_rw(iter) == WRITE;
+ bool should_dirty = !write && user_backed_iter(iter);
+ bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
+
+ if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc ? snapc->seq : 0);
+
+ if (write) {
+ int ret2;
+
+ ceph_fscache_invalidate(inode, true);
+
+ ret2 = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + count - 1) >> PAGE_SHIFT);
+ if (ret2 < 0)
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret2);
+
+ flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
} else {
- spin_lock(&ci->i_unsafe_lock);
- list_del_init(&req->r_unsafe_item);
- spin_unlock(&ci->i_unsafe_lock);
- ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+ flags = CEPH_OSD_FLAG_READ;
}
+
+ while (iov_iter_count(iter) > 0) {
+ u64 size = iov_iter_count(iter);
+ ssize_t len;
+ struct ceph_osd_req_op *op;
+ int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+ int extent_cnt;
+
+ if (write)
+ size = min_t(u64, size, fsc->mount_options->wsize);
+ else
+ size = min_t(u64, size, fsc->mount_options->rsize);
+
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &size, 0,
+ 1,
+ write ? CEPH_OSD_OP_WRITE : readop,
+ flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ op = &req->r_ops[0];
+ if (!write && sparse) {
+ extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ break;
+ }
+ }
+
+ len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
+ if (len < 0) {
+ ceph_osdc_put_request(req);
+ ret = len;
+ break;
+ }
+ if (len != size)
+ osd_req_op_extent_update(req, 0, len);
+
+ osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
+
+ /*
+ * To simplify error handling, allow AIO when IO within i_size
+ * or IO can be satisfied by single OSD request.
+ */
+ if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+ (len == count || pos + count <= i_size_read(inode))) {
+ aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+ if (aio_req) {
+ aio_req->iocb = iocb;
+ aio_req->write = write;
+ aio_req->should_dirty = should_dirty;
+ INIT_LIST_HEAD(&aio_req->osd_reqs);
+ if (write) {
+ aio_req->mtime = mtime;
+ swap(aio_req->prealloc_cf, *pcf);
+ }
+ }
+ /* ignore error */
+ }
+
+ if (write) {
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos,
+ PAGE_ALIGN(pos + len) - 1);
+
+ req->r_mtime = mtime;
+ }
+
+ if (aio_req) {
+ aio_req->total_len += len;
+ aio_req->num_reqs++;
+ atomic_inc(&aio_req->pending_reqs);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+ list_add_tail(&req->r_private_item, &aio_req->osd_reqs);
+
+ pos += len;
+ continue;
+ }
+
+ ceph_osdc_start_request(req->r_osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ if (write)
+ ceph_update_write_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
+ else
+ ceph_update_read_metrics(metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
+
+ size = i_size_read(inode);
+ if (!write) {
+ if (sparse && ret >= 0)
+ ret = ceph_sparse_ext_map_end(op);
+ else if (ret == -ENOENT)
+ ret = 0;
+
+ if (ret >= 0 && ret < len && pos + ret < size) {
+ struct iov_iter i;
+ int zlen = min_t(size_t, len - ret,
+ size - pos - ret);
+
+ iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len);
+ iov_iter_advance(&i, ret);
+ iov_iter_zero(zlen, &i);
+ ret += zlen;
+ }
+ if (ret >= 0)
+ len = ret;
+ }
+
+ put_bvecs(bvecs, num_pages, should_dirty);
+ ceph_osdc_put_request(req);
+ if (ret < 0)
+ break;
+
+ pos += len;
+ if (!write && pos >= size)
+ break;
+
+ if (write && pos > size) {
+ if (ceph_inode_set_size(inode, pos))
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY);
+ }
+ }
+
+ if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
+ if (aio_req->num_reqs == 0) {
+ kfree(aio_req);
+ return ret;
+ }
+
+ ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD);
+
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ inode_dio_begin(inode);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
+ struct ceph_osd_request,
+ r_private_item);
+ list_del_init(&req->r_private_item);
+ if (ret >= 0)
+ ceph_osdc_start_request(req->r_osdc, req);
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req);
+ }
+ }
+ return -EIOCBQUEUED;
+ }
+
+ if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+ ret = pos - iocb->ki_pos;
+ iocb->ki_pos = pos;
+ }
+ return ret;
}
/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
+ * Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t left, loff_t pos, loff_t *ppos)
+static ssize_t
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
- struct ceph_vino vino;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
- int num_ops = 1;
struct page **pages;
- int num_pages;
u64 len;
+ int num_pages;
int written = 0;
- int flags;
- int check_caps = 0;
- int page_align, io_align;
- unsigned long buf_align;
int ret;
- struct timespec mtime = CURRENT_TIME;
- bool own_pages = false;
+ bool check_caps = false;
+ struct timespec64 mtime = current_time(inode);
+ size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u %s\n", file, pos,
- (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos,
+ (unsigned)count, snapc, snapc->seq);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ pos, pos + count - 1);
if (ret < 0)
return ret;
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + left) >> PAGE_CACHE_SHIFT);
- if (ret < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret);
-
- flags = CEPH_OSD_FLAG_ORDERSNAP |
- CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
- if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
- flags |= CEPH_OSD_FLAG_ACK;
- else
- num_ops++; /* Also include a 'startsync' command. */
+ ceph_fscache_invalidate(inode, false);
+
+ while ((len = iov_iter_count(from)) > 0) {
+ size_t left;
+ int n;
+ u64 write_pos = pos;
+ u64 write_len = len;
+ u64 objnum, objoff;
+ u32 xlen;
+ u64 assert_ver = 0;
+ bool rmw;
+ bool first, last;
+ struct iov_iter saved_iter = *from;
+ size_t off;
+
+ ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);
+
+ /* clamp the length to the end of first object */
+ ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
+ write_len, &objnum, &objoff,
+ &xlen);
+ write_len = xlen;
+
+ /* adjust len downward if it goes beyond current object */
+ if (pos + len > write_pos + write_len)
+ len = write_pos + write_len - pos;
- /*
- * we may need to do multiple writes here if we span an object
- * boundary. this isn't atomic, unfortunately. :(
- */
-more:
- io_align = pos & ~PAGE_MASK;
- buf_align = (unsigned long)data & ~PAGE_MASK;
- len = left;
+ /*
+ * If we had to adjust the length or position to align with a
+ * crypto block, then we must do a read/modify/write cycle. We
+ * use a version assertion to redrive the thing if something
+ * changes in between.
+ */
+ first = pos != write_pos;
+ last = (pos + len) != (write_pos + write_len);
+ rmw = first || last;
- snapc = ci->i_snap_realm->cached_context;
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, num_ops,
- CEPH_OSD_OP_WRITE, flags, snapc,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* write from beginning of first page, regardless of io alignment */
- page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
- num_pages = calc_pages_for(page_align, len);
- if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, false);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- goto out;
- }
+ doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+ ci->i_vino.ino, pos, len, write_pos, write_len,
+ rmw ? "" : "no ");
/*
- * throw out any page cache pages in this range. this
- * may block.
+ * The data is emplaced into the page as it would be if it were
+ * in an array of pagecache pages.
*/
- truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE-1));
- } else {
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ num_pages = calc_pages_for(write_pos, write_len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
- goto out;
+ break;
+ }
+
+ /* Do we need to preload the pages? */
+ if (rmw) {
+ u64 first_pos = write_pos;
+ u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
+ u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
+ struct ceph_osd_req_op *op;
+
+ /* We should only need to do this for encrypted inodes */
+ WARN_ON_ONCE(!IS_ENCRYPTED(inode));
+
+ /* No need to do two reads if first and last blocks are same */
+ if (first && last_pos == first_pos)
+ last = false;
+
+ /*
+ * Allocate a read request for one or two extents,
+ * depending on how the request was aligned.
+ */
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, first ? first_pos : last_pos,
+ &read_len, 0, (first && last) ? 2 : 1,
+ CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ceph_release_page_vector(pages, num_pages);
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /* Something is misaligned! */
+ if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ ret = -EIO;
+ break;
+ }
+
+ /* Add extent for first block? */
+ op = &req->r_ops[0];
+
+ if (first) {
+ osd_req_op_extent_osd_data_pages(req, 0, pages,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(first_pos),
+ false, false);
+ /* We only expect a single extent here */
+ ret = __ceph_alloc_sparse_ext_map(op, 1);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+
+ /* Add extent for last block */
+ if (last) {
+ /* Init the other extent if first extent has been used */
+ if (first) {
+ op = &req->r_ops[1];
+ osd_req_op_extent_init(req, 1,
+ CEPH_OSD_OP_SPARSE_READ,
+ last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
+ ci->i_truncate_size,
+ ci->i_truncate_seq);
+ }
+
+ ret = __ceph_alloc_sparse_ext_map(op, 1);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
+ &pages[num_pages - 1],
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(last_pos),
+ false, false);
+ }
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ /* FIXME: length field is wrong if there are 2 extents */
+ ceph_update_read_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ read_len, ret);
+
+ /* Ok if object is not already present */
+ if (ret == -ENOENT) {
+ /*
+ * If there is no object, then we can't assert
+ * on its version. Set it to 0, and we'll use an
+ * exclusive create instead.
+ */
+ ceph_osdc_put_request(req);
+ ret = 0;
+
+ /*
+ * zero out the soon-to-be uncopied parts of the
+ * first and last pages.
+ */
+ if (first)
+ zero_user_segment(pages[0], 0,
+ offset_in_page(first_pos));
+ if (last)
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else {
+ if (ret < 0) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ op = &req->r_ops[0];
+ if (op->extent.sparse_ext_cnt == 0) {
+ if (first)
+ zero_user_segment(pages[0], 0,
+ offset_in_page(first_pos));
+ else
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else if (op->extent.sparse_ext_cnt != 1 ||
+ ceph_sparse_ext_map_end(op) !=
+ CEPH_FSCRYPT_BLOCK_SIZE) {
+ ret = -EIO;
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ if (first && last) {
+ op = &req->r_ops[1];
+ if (op->extent.sparse_ext_cnt == 0) {
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else if (op->extent.sparse_ext_cnt != 1 ||
+ ceph_sparse_ext_map_end(op) !=
+ CEPH_FSCRYPT_BLOCK_SIZE) {
+ ret = -EIO;
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+
+ /* Grab assert version. It must be non-zero. */
+ assert_ver = req->r_version;
+ WARN_ON_ONCE(ret > 0 && assert_ver == 0);
+
+ ceph_osdc_put_request(req);
+ if (first) {
+ ret = ceph_fscrypt_decrypt_block_inplace(inode,
+ pages[0], CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(first_pos),
+ first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+ if (last) {
+ ret = ceph_fscrypt_decrypt_block_inplace(inode,
+ pages[num_pages - 1],
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(last_pos),
+ last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+ }
+ }
+
+ left = len;
+ off = offset_in_page(pos);
+ for (n = 0; n < num_pages; n++) {
+ size_t plen = min_t(size_t, left, PAGE_SIZE - off);
+
+ /* copy the data */
+ ret = copy_page_from_iter(pages[n], off, plen, from);
+ if (ret != plen) {
+ ret = -EFAULT;
+ break;
+ }
+ off = 0;
+ left -= ret;
}
- ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
if (ret < 0) {
+ doutc(cl, "write failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
- goto out;
+ break;
}
- if ((file->f_flags & O_SYNC) == 0) {
- /* get a second commit callback */
- req->r_unsafe_callback = ceph_sync_write_unsafe;
- req->r_inode = inode;
- own_pages = true;
+ if (IS_ENCRYPTED(inode)) {
+ ret = ceph_fscrypt_encrypt_pages(inode, pages,
+ write_pos, write_len);
+ if (ret < 0) {
+ doutc(cl, "encryption failed with %d\n", ret);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
}
- }
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, own_pages);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, write_pos, &write_len,
+ rmw ? 1 : 0, rmw ? 2 : 1,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ snapc, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ doutc(cl, "write op %lld~%llu\n", write_pos, write_len);
+ osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
+ offset_in_page(write_pos), false,
+ true);
+ req->r_inode = inode;
+ req->r_mtime = mtime;
+
+ /* Set up the assertion */
+ if (rmw) {
+ /*
+ * Set up the assertion. If we don't have a version
+ * number, then the object doesn't exist yet. Use an
+ * exclusive create instead of a version assertion in
+ * that case.
+ */
+ if (assert_ver) {
+ osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0);
+ req->r_ops[0].assert_ver.ver = assert_ver;
+ } else {
+ osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE,
+ CEPH_OSD_OP_FLAG_EXCL);
+ }
+ }
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, false);
- else if (file->f_flags & O_SYNC)
- ceph_release_page_vector(pages, num_pages);
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+ req->r_end_latency, len, ret);
+ ceph_osdc_put_request(req);
+ if (ret != 0) {
+ doutc(cl, "osd write returned %d\n", ret);
+ /* Version changed! Must re-do the rmw cycle */
+ if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
+ (!assert_ver && ret == -EEXIST)) {
+ /* We should only ever see this on a rmw */
+ WARN_ON_ONCE(!rmw);
+
+ /* The version should never go backward */
+ WARN_ON_ONCE(ret == -EOVERFLOW);
+
+ *from = saved_iter;
+
+ /* FIXME: limit number of times we loop? */
+ continue;
+ }
+ ceph_set_error_write(ci);
+ break;
+ }
-out:
- ceph_osdc_put_request(req);
- if (ret == 0) {
+ ceph_clear_error_write(ci);
+
+ /*
+ * We successfully wrote to a range of the file. Declare
+ * that region of the pagecache invalid.
+ */
+ ret = invalidate_inode_pages2_range(
+ inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + len - 1) >> PAGE_SHIFT);
+ if (ret < 0) {
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret);
+ ret = 0;
+ }
pos += len;
written += len;
- left -= len;
- data += len;
- if (left)
- goto more;
+ doutc(cl, "written %d\n", written);
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY);
+ }
+
+ }
+ if (ret != -EOLDSNAPC && written > 0) {
ret = written;
- *ppos = pos;
- if (pos > i_size_read(inode))
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
- NULL);
+ iocb->ki_pos = pos;
}
+ doutc(cl, "returning %d\n", ret);
return ret;
}
@@ -629,61 +2099,150 @@ out:
*
* Hmm, the sync read case isn't actually async... should it be?
*/
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- loff_t *ppos = &iocb->ki_pos;
- size_t len = iov->iov_len;
+ size_t len = iov_iter_count(to);
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- void __user *base = iov->iov_base;
+ bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
ssize_t ret;
- int want, got = 0;
- int checkeof = 0, read = 0;
+ int want = 0, got = 0;
+ int retry_op = 0, read = 0;
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
+ doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n",
+ iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode));
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ ret = direct_lock ? ceph_start_io_direct(inode) :
+ ceph_start_io_read(inode);
+ if (ret)
+ return ret;
+
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_CACHE;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_CACHE;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
- if (ret < 0)
- goto out;
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
- ceph_cap_string(got));
+ want |= CEPH_CAP_FILE_LAZYIO;
+
+ ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
+ if (ret < 0) {
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
+ return ret;
+ }
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
- (fi->flags & CEPH_F_SYNC))
- /* hmm, this isn't really async... */
- ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
- else
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ (iocb->ki_flags & IOCB_DIRECT) ||
+ (fi->flags & CEPH_F_SYNC)) {
-out:
- dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
- inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+
+ if (!ceph_has_inline_data(ci)) {
+ if (!retry_op &&
+ (iocb->ki_flags & IOCB_DIRECT) &&
+ !IS_ENCRYPTED(inode)) {
+ ret = ceph_direct_read_write(iocb, to,
+ NULL, NULL);
+ if (ret >= 0 && ret < len)
+ retry_op = CHECK_EOF;
+ } else {
+ ret = ceph_sync_read(iocb, to, &retry_op);
+ }
+ } else {
+ retry_op = READ_INLINE;
+ }
+ } else {
+ CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+ doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+ ceph_add_rw_context(fi, &rw_ctx);
+ ret = generic_file_read_iter(iocb, to);
+ ceph_del_rw_context(fi, &rw_ctx);
+ }
+
+ doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
- if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
+
+ if (retry_op > HAVE_RETRIED && ret >= 0) {
+ int statret;
+ struct page *page = NULL;
+ loff_t i_size;
+ int mask = CEPH_STAT_CAP_SIZE;
+ if (retry_op == READ_INLINE) {
+ page = __page_cache_alloc(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ mask = CEPH_STAT_CAP_INLINE_DATA;
+ }
+
+ statret = __ceph_do_getattr(inode, page, mask, !!page);
+ if (statret < 0) {
+ if (page)
+ __free_page(page);
+ if (statret == -ENODATA) {
+ BUG_ON(retry_op != READ_INLINE);
+ goto again;
+ }
+ return statret;
+ }
+
+ i_size = i_size_read(inode);
+ if (retry_op == READ_INLINE) {
+ BUG_ON(ret > 0 || read > 0);
+ if (iocb->ki_pos < i_size &&
+ iocb->ki_pos < PAGE_SIZE) {
+ loff_t end = min_t(loff_t, i_size,
+ iocb->ki_pos + len);
+ end = min_t(loff_t, end, PAGE_SIZE);
+ if (statret < end)
+ zero_user_segment(page, statret, end);
+ ret = copy_page_to_iter(page,
+ iocb->ki_pos & ~PAGE_MASK,
+ end - iocb->ki_pos, to);
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+ if (iocb->ki_pos < i_size && read < len) {
+ size_t zlen = min_t(size_t, len - read,
+ i_size - iocb->ki_pos);
+ ret = iov_iter_zero(zlen, to);
+ iocb->ki_pos += ret;
+ read += ret;
+ }
+ __free_pages(page, 0);
+ return read;
+ }
/* hit EOF or hole? */
- if (statret == 0 && *ppos < inode->i_size) {
- dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
+ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
+ ret < len) {
+ doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n",
+ iocb->ki_pos, i_size);
+
read += ret;
- base += ret;
len -= ret;
- checkeof = 0;
+ retry_op = HAVE_RETRIED;
goto again;
}
}
+
if (ret >= 0)
ret += read;
@@ -691,6 +2250,71 @@ out:
}
/*
+ * Wrap filemap_splice_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ */
+static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct ceph_file_info *fi = in->private_data;
+ struct inode *inode = file_inode(in);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ ssize_t ret;
+ int want = 0, got = 0;
+ CEPH_DEFINE_RW_CONTEXT(rw_ctx, 0);
+
+ dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n",
+ inode, ceph_vinop(inode), *ppos, len, inode);
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ if (ceph_has_inline_data(ci) ||
+ (fi->flags & CEPH_F_SYNC))
+ return copy_splice_read(in, ppos, pipe, len, flags);
+
+ ret = ceph_start_io_read(inode);
+ if (ret)
+ return ret;
+
+ want = CEPH_CAP_FILE_CACHE;
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want |= CEPH_CAP_FILE_LAZYIO;
+
+ ret = ceph_get_caps(in, CEPH_CAP_FILE_RD, want, -1, &got);
+ if (ret < 0)
+ goto out_end;
+
+ if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) == 0) {
+ dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n",
+ inode, ceph_vinop(inode), *ppos, len,
+ ceph_cap_string(got));
+
+ ceph_put_cap_refs(ci, got);
+ ceph_end_io_read(inode);
+ return copy_splice_read(in, ppos, pipe, len, flags);
+ }
+
+ dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n",
+ inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got));
+
+ rw_ctx.caps = got;
+ ceph_add_rw_context(fi, &rw_ctx);
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ ceph_del_rw_context(fi, &rw_ctx);
+
+ dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), ret);
+
+ ceph_put_cap_refs(ci, got);
+out_end:
+ ceph_end_io_read(inode);
+ return ret;
+}
+
+/*
* Take cap references to avoid releasing caps to MDS mid-write.
*
* If we are synchronous, and write with an old snap context, the OSD
@@ -700,116 +2324,188 @@ out:
*
* If we are near ENOSPC, write synchronously.
*/
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
- int err, want, got;
- bool hold_mutex;
+ int err, want = 0, got;
+ bool direct_lock = false;
+ u32 map_flags;
+ u64 pool_flags;
+ loff_t pos;
+ loff_t limit = max(i_size_read(inode), fsc->max_file_size);
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- mutex_lock(&inode->i_mutex);
- hold_mutex = true;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- goto out;
+ if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
+ direct_lock = true;
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = file->f_mapping->backing_dev_info;
-
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+retry_snap:
+ err = direct_lock ? ceph_start_io_direct(inode) :
+ ceph_start_io_write(inode);
if (err)
- goto out;
+ goto out_unlocked;
- if (count == 0)
+ if (iocb->ki_flags & IOCB_APPEND) {
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (err < 0)
+ goto out;
+ }
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
goto out;
- err = file_remove_suid(file);
- if (err)
+ pos = iocb->ki_pos;
+ if (unlikely(pos >= limit)) {
+ err = -EFBIG;
goto out;
+ } else {
+ iov_iter_truncate(from, limit - pos);
+ }
- err = file_update_time(file);
- if (err)
+ count = iov_iter_count(from);
+ if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
+ err = -EDQUOT;
goto out;
+ }
-retry_snap:
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
err = -ENOSPC;
goto out;
}
- dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, count, inode->i_size);
+ err = file_remove_privs(file);
+ if (err)
+ goto out;
+
+ doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count,
+ i_size_read(inode));
+ if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
+ want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
+ want |= CEPH_CAP_FILE_LAZYIO;
got = 0;
- err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
+ err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
if (err < 0)
goto out;
- dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+ err = file_update_time(file);
+ if (err)
+ goto out_caps;
+
+ inode_inc_iversion_raw(inode);
+
+ doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
- (fi->flags & CEPH_F_SYNC)) {
- mutex_unlock(&inode->i_mutex);
- written = ceph_sync_write(file, iov->iov_base, count,
- pos, &iocb->ki_pos);
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
+ (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
+ struct ceph_snap_context *snapc;
+ struct iov_iter data;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ /* we might need to revert back to that point */
+ data = *from;
+ if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode))
+ written = ceph_direct_read_write(iocb, &data, snapc,
+ &prealloc_cf);
+ else
+ written = ceph_sync_write(iocb, &data, pos, snapc);
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_write(inode);
+ if (written > 0)
+ iov_iter_advance(from, written);
+ ceph_put_snap_context(snapc);
} else {
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, &iocb->ki_pos,
- count, 0);
- mutex_unlock(&inode->i_mutex);
+ /*
+ * No need to acquire the i_truncate_mutex. Because
+ * the MDS revokes Fwb caps before sending truncate
+ * message to us. We can't get Fwb cap while there
+ * are pending vmtruncate. So write and vmtruncate
+ * can not run at the same time
+ */
+ written = generic_perform_write(iocb, from);
+ ceph_end_io_write(inode);
}
- hold_mutex = false;
if (written >= 0) {
int dirty;
+
spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count,
+ ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
- if (written >= 0 &&
- ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
- ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + written - 1, 1);
- if (err < 0)
- written = err;
- }
-
if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
- mutex_lock(&inode->i_mutex);
- hold_mutex = true;
+ doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
goto retry_snap;
}
-out:
- if (hold_mutex)
- mutex_unlock(&inode->i_mutex);
- current->backing_dev_info = NULL;
+ if (written >= 0) {
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
+ iocb->ki_flags |= IOCB_DSYNC;
+ written = generic_write_sync(iocb, written);
+ }
+
+ goto out_unlocked;
+out_caps:
+ ceph_put_cap_refs(ci, got);
+out:
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_write(inode);
+out_unlocked:
+ ceph_free_cap_flush(prealloc_cf);
return written ? written : err;
}
@@ -818,74 +2514,667 @@ out:
*/
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_mapping->host;
+ if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *inode = file_inode(file);
+ int ret;
+
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (ret < 0)
+ return ret;
+ }
+ return generic_file_llseek(file, offset, whence);
+}
+
+static inline void ceph_zero_partial_page(struct inode *inode,
+ loff_t offset, size_t size)
+{
+ struct folio *folio;
+
+ folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ folio_wait_writeback(folio);
+ folio_zero_range(folio, offset_in_folio(folio, offset), size);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ loff_t nearly = round_up(offset, PAGE_SIZE);
+ if (offset < nearly) {
+ loff_t size = nearly - offset;
+ if (length < size)
+ size = length;
+ ceph_zero_partial_page(inode, offset, size);
+ offset += size;
+ length -= size;
+ }
+ if (length >= PAGE_SIZE) {
+ loff_t size = round_down(length, PAGE_SIZE);
+ truncate_pagecache_range(inode, offset, offset + size - 1);
+ offset += size;
+ length -= size;
+ }
+ if (length)
+ ceph_zero_partial_page(inode, offset, length);
+}
+
+static int ceph_zero_partial_object(struct inode *inode,
+ loff_t offset, loff_t *length)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_osd_request *req;
+ int ret = 0;
+ loff_t zero = 0;
+ int op;
+
+ if (ceph_inode_is_shutdown(inode))
+ return -EIO;
+
+ if (!length) {
+ op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
+ length = &zero;
+ } else {
+ op = CEPH_OSD_OP_ZERO;
+ }
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ ceph_vino(inode),
+ offset, length,
+ 0, 1, op,
+ CEPH_OSD_FLAG_WRITE,
+ NULL, 0, 0, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
+ ceph_osdc_put_request(req);
+
+out:
+ return ret;
+}
+
+static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
+{
+ int ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
+ u64 object_set_size = (u64) object_size * stripe_count;
+ u64 nearly, t;
+
+ /* round offset up to next period boundary */
+ nearly = offset + object_set_size - 1;
+ t = nearly;
+ nearly -= do_div(t, object_set_size);
+
+ while (length && offset < nearly) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ while (length >= object_set_size) {
+ int i;
+ loff_t pos = offset;
+ for (i = 0; i < stripe_count; ++i) {
+ ret = ceph_zero_partial_object(inode, pos, NULL);
+ if (ret < 0)
+ return ret;
+ pos += stripe_unit;
+ }
+ offset += object_set_size;
+ length -= object_set_size;
+ }
+ while (length) {
+ loff_t size = length;
+ ret = ceph_zero_partial_object(inode, offset, &size);
+ if (ret < 0)
+ return ret;
+ offset += size;
+ length -= size;
+ }
+ return ret;
+}
+
+static long ceph_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t length)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *prealloc_cf;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int want, got = 0;
+ int dirty;
+ int ret = 0;
+ loff_t endoff = 0;
+ loff_t size;
+
+ doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
+ inode, ceph_vinop(inode), mode, offset, length);
+
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (IS_ENCRYPTED(inode))
+ return -EOPNOTSUPP;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ inode_lock(inode);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto unlock;
+ }
+
+ size = i_size_read(inode);
+
+ /* Are we punching a hole beyond EOF? */
+ if (offset >= size)
+ goto unlock;
+ if ((offset + length) > size)
+ length = size - offset;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
+ if (ret < 0)
+ goto unlock;
+
+ ret = file_modified(file);
+ if (ret)
+ goto put_caps;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ ceph_fscache_invalidate(inode, false);
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
+
+ if (!ret) {
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+ filemap_invalidate_unlock(inode->i_mapping);
+
+put_caps:
+ ceph_put_cap_refs(ci, got);
+unlock:
+ inode_unlock(inode);
+ ceph_free_cap_flush(prealloc_cf);
+ return ret;
+}
+
+/*
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
+ * src_ci. Two attempts are made to obtain both caps, and an error is return if
+ * this fails; zero is returned on success.
+ */
+static int get_rd_wr_caps(struct file *src_filp, int *src_got,
+ struct file *dst_filp,
+ loff_t dst_endoff, int *dst_got)
+{
+ int ret = 0;
+ bool retrying = false;
+
+retry_caps:
+ ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ dst_endoff, dst_got);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Since we're already holding the FILE_WR capability for the dst file,
+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
+ * retry dance instead to try to get both capabilities.
+ */
+ ret = ceph_try_get_caps(file_inode(src_filp),
+ CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+ false, src_got);
+ if (ret <= 0) {
+ /* Start by dropping dst_ci caps and getting src_ci caps */
+ ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
+ if (retrying) {
+ if (!ret)
+ /* ceph_try_get_caps masks EAGAIN */
+ ret = -EAGAIN;
+ return ret;
+ }
+ ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
+ CEPH_CAP_FILE_SHARED, -1, src_got);
+ if (ret < 0)
+ return ret;
+ /*... drop src_ci caps too, and retry */
+ ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
+ retrying = true;
+ goto retry_caps;
+ }
+ return ret;
+}
+
+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
+ struct ceph_inode_info *dst_ci, int dst_got)
+{
+ ceph_put_cap_refs(src_ci, src_got);
+ ceph_put_cap_refs(dst_ci, dst_got);
+}
+
+/*
+ * This function does several size-related checks, returning an error if:
+ * - source file is smaller than off+len
+ * - destination file size is not OK (inode_newsize_ok())
+ * - max bytes quotas is exceeded
+ */
+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+ loff_t src_off, loff_t dst_off, size_t len)
+{
+ struct ceph_client *cl = ceph_inode_to_client(src_inode);
+ loff_t size, endoff;
+
+ size = i_size_read(src_inode);
+ /*
+ * Don't copy beyond source file EOF. Instead of simply setting length
+ * to (size - src_off), just drop to VFS default implementation, as the
+ * local i_size may be stale due to other clients writing to the source
+ * inode.
+ */
+ if (src_off + len > size) {
+ doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off,
+ len, size);
+ return -EOPNOTSUPP;
+ }
+ size = i_size_read(dst_inode);
+
+ endoff = dst_off + len;
+ if (inode_newsize_ok(dst_inode, endoff))
+ return -EOPNOTSUPP;
+
+ if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
+ return -EDQUOT;
+
+ return 0;
+}
+
+static struct ceph_osd_request *
+ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
+ u64 src_snapid,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ struct ceph_object_id *dst_oid,
+ struct ceph_object_locator *dst_oloc,
+ u32 truncate_seq, u64 truncate_size)
+{
+ struct ceph_osd_request *req;
int ret;
+ u32 src_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
+ u32 dst_fadvise_flags =
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+
+ ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
+ ceph_oid_copy(&req->r_t.base_oid, dst_oid);
+
+ ret = osd_req_op_copy_from_init(req, src_snapid, 0,
+ src_oid, src_oloc,
+ src_fadvise_flags,
+ dst_fadvise_flags,
+ truncate_seq,
+ truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+ if (ret)
+ goto out;
- mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
+ ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+ if (ret)
+ goto out;
- if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
- ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
- if (ret < 0) {
- offset = ret;
+ return req;
+
+out:
+ ceph_osdc_put_request(req);
+ return ERR_PTR(ret);
+}
+
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+ struct ceph_inode_info *dst_ci, u64 *dst_off,
+ struct ceph_fs_client *fsc,
+ size_t len, unsigned int flags)
+{
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *req;
+ ssize_t bytes = 0;
+ u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ u32 object_size = src_ci->i_layout.object_size;
+ struct ceph_client *cl = fsc->client;
+ int ret;
+
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+ osdc = &fsc->client->osdc;
+
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
+ &src_oid, &src_oloc,
+ &dst_oid, &dst_oloc,
+ dst_ci->i_truncate_seq,
+ dst_ci->i_truncate_size);
+ if (IS_ERR(req))
+ ret = PTR_ERR(req);
+ else {
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ object_size, ret);
+ ceph_osdc_put_request(req);
+ }
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ fsc->have_copy_from2 = false;
+ pr_notice_client(cl,
+ "OSDs don't support copy-from2; disabling copy offload\n");
+ }
+ doutc(cl, "returned %d\n", ret);
+ if (bytes <= 0)
+ bytes = ret;
goto out;
}
+ len -= object_size;
+ bytes += object_size;
+ *src_off += object_size;
+ *dst_off += object_size;
}
- switch (whence) {
- case SEEK_END:
- offset += inode->i_size;
- break;
- case SEEK_CUR:
+out:
+ ceph_oloc_destroy(&src_oloc);
+ ceph_oloc_destroy(&dst_oloc);
+ return bytes;
+}
+
+static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ struct ceph_inode_info *src_ci = ceph_inode(src_inode);
+ struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
+ struct ceph_cap_flush *prealloc_cf;
+ struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode);
+ struct ceph_client *cl = src_fsc->client;
+ loff_t size;
+ ssize_t ret = -EIO, bytes;
+ u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
+ u32 src_objlen, dst_objlen;
+ int src_got = 0, dst_got = 0, err, dirty;
+
+ if (src_inode->i_sb != dst_inode->i_sb) {
+ struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode);
+
+ if (ceph_fsid_compare(&src_fsc->client->fsid,
+ &dst_fsc->client->fsid)) {
+ dout("Copying files across clusters: src: %pU dst: %pU\n",
+ &src_fsc->client->fsid, &dst_fsc->client->fsid);
+ return -EXDEV;
+ }
+ }
+ if (ceph_snap(dst_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ /*
+ * Some of the checks below will return -EOPNOTSUPP, which will force a
+ * fallback to the default VFS copy_file_range implementation. This is
+ * desirable in several cases (for ex, the 'len' is smaller than the
+ * size of the objects, or in cases where that would be more
+ * efficient).
+ */
+
+ if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
+ return -EOPNOTSUPP;
+
+ if (!src_fsc->have_copy_from2)
+ return -EOPNOTSUPP;
+
+ /*
+ * Striped file layouts require that we copy partial objects, but the
+ * OSD copy-from operation only supports full-object copies. Limit
+ * this to non-striped file layouts for now.
+ */
+ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
+ (src_ci->i_layout.stripe_count != 1) ||
+ (dst_ci->i_layout.stripe_count != 1) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
+ doutc(cl, "Invalid src/dst files layout\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* Every encrypted inode gets its own key, so we can't offload them */
+ if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode))
+ return -EOPNOTSUPP;
+
+ if (len < src_ci->i_layout.object_size)
+ return -EOPNOTSUPP; /* no remote copy will be done */
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ /* Start by sync'ing the source and destination files */
+ ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+ if (ret < 0) {
+ doutc(cl, "failed to write src file (%zd)\n", ret);
+ goto out;
+ }
+ ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
+ if (ret < 0) {
+ doutc(cl, "failed to write dst file (%zd)\n", ret);
+ goto out;
+ }
+
+ /*
+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
+ * clients may have dirty data in their caches. And OSDs know nothing
+ * about caps, so they can't safely do the remote object copies.
+ */
+ err = get_rd_wr_caps(src_file, &src_got,
+ dst_file, (dst_off + len), &dst_got);
+ if (err < 0) {
+ doutc(cl, "get_rd_wr_caps returned %d\n", err);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+ if (ret < 0)
+ goto out_caps;
+
+ /* Drop dst file cached pages */
+ ceph_fscache_invalidate(dst_inode, false);
+ ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+ dst_off >> PAGE_SHIFT,
+ (dst_off + len) >> PAGE_SHIFT);
+ if (ret < 0) {
+ doutc(cl, "Failed to invalidate inode pages (%zd)\n",
+ ret);
+ ret = 0; /* XXX */
+ }
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ src_ci->i_layout.object_size,
+ &src_objnum, &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ dst_ci->i_layout.object_size,
+ &dst_objnum, &dst_objoff, &dst_objlen);
+ /* object-level offsets need to the same */
+ if (src_objoff != dst_objoff) {
+ ret = -EOPNOTSUPP;
+ goto out_caps;
+ }
+
+ /*
+ * Do a manual copy if the object offset isn't object aligned.
+ * 'src_objlen' contains the bytes left until the end of the object,
+ * starting at the src_off
+ */
+ if (src_objoff) {
+ doutc(cl, "Initial partial copy of %u bytes\n", src_objlen);
+
/*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
+ * we need to temporarily drop all caps as we'll be calling
+ * {read,write}_iter, which will get caps again.
*/
- if (offset == 0) {
- offset = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset >= inode->i_size) {
- ret = -ENXIO;
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+ ret = splice_file_range(src_file, &src_off, dst_file, &dst_off,
+ src_objlen);
+ /* Abort on short copies or on error */
+ if (ret < (long)src_objlen) {
+ doutc(cl, "Failed partial copy (%zd)\n", ret);
goto out;
}
- break;
- case SEEK_HOLE:
- if (offset >= inode->i_size) {
- ret = -ENXIO;
+ len -= ret;
+ err = get_rd_wr_caps(src_file, &src_got,
+ dst_file, (dst_off + len), &dst_got);
+ if (err < 0)
goto out;
- }
- offset = inode->i_size;
- break;
+ err = is_file_size_ok(src_inode, dst_inode,
+ src_off, dst_off, len);
+ if (err < 0)
+ goto out_caps;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+ size = i_size_read(dst_inode);
+ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+ src_fsc, len, flags);
+ if (bytes <= 0) {
+ if (!ret)
+ ret = bytes;
+ goto out_caps;
+ }
+ doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len);
+ len -= bytes;
+ ret += bytes;
+
+ file_update_time(dst_file);
+ inode_inc_iversion_raw(dst_inode);
+
+ if (dst_off > size) {
+ /* Let the MDS know about dst file size change */
+ if (ceph_inode_set_size(dst_inode, dst_off) ||
+ ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH);
+ }
+ /* Mark Fw dirty */
+ spin_lock(&dst_ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&dst_ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(dst_inode, dirty);
+
+out_caps:
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+
+ /*
+ * Do the final manual copy if we still have some bytes left, unless
+ * there were errors in remote object copies (len >= object_size).
+ */
+ if (len && (len < src_ci->i_layout.object_size)) {
+ doutc(cl, "Final partial copy of %zu bytes\n", len);
+ bytes = splice_file_range(src_file, &src_off, dst_file,
+ &dst_off, len);
+ if (bytes > 0)
+ ret += bytes;
+ else
+ doutc(cl, "Failed partial copy (%zd)\n", bytes);
+ }
out:
- mutex_unlock(&inode->i_mutex);
- return offset;
+ ceph_free_cap_flush(prealloc_cf);
+
+ return ret;
+}
+
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
+ len, flags);
+
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
+ return ret;
}
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
- .mmap = ceph_mmap,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
+ .mmap_prepare = ceph_mmap_prepare,
.fsync = ceph_fsync,
.lock = ceph_lock,
+ .setlease = simple_nosetlease,
.flock = ceph_flock,
- .splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_read = ceph_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
- .compat_ioctl = ceph_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .fallocate = ceph_fallocate,
+ .copy_file_range = ceph_copy_file_range,
};
-
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f3a2abf28a77..2966f88310e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/module.h>
@@ -6,12 +7,19 @@
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
-#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/sort.h>
+#include <linux/iversion.h>
+#include <linux/fscrypt.h>
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/decode.h>
/*
@@ -27,73 +35,255 @@
*/
static const struct inode_operations ceph_symlink_iops;
+static const struct inode_operations ceph_encrypted_symlink_iops;
-static void ceph_invalidate_work(struct work_struct *work);
-static void ceph_writeback_work(struct work_struct *work);
-static void ceph_vmtruncate_work(struct work_struct *work);
+static void ceph_inode_work(struct work_struct *work);
/*
* find or create an inode, given the ceph ino number
*/
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
- ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
- inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+
+ ci->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
+ inode_set_iversion_raw(inode, 0);
+ percpu_counter_inc(&mdsc->metric.total_inodes);
+
return 0;
}
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent,
+ struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
+/**
+ * ceph_new_inode - allocate a new inode in advance of an expected create
+ * @dir: parent directory for new inode
+ * @dentry: dentry that may eventually point to new inode
+ * @mode: mode of new inode
+ * @as_ctx: pointer to inherited security context
+ *
+ * Allocate a new inode in advance of an operation to create a new inode.
+ * This allocates the inode and sets up the acl_sec_ctx with appropriate
+ * info for the new inode.
+ *
+ * Returns a pointer to the new inode or an ERR_PTR.
+ */
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+ umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
{
+ int err;
struct inode *inode;
- ino_t t = ceph_vino_to_ino(vino);
- inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
- if (inode == NULL)
+ inode = new_inode(dir->i_sb);
+ if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- dout("get_inode created new inode %p %llx.%llx ino %llx\n",
- inode, ceph_vinop(inode), (u64)inode->i_ino);
- unlock_new_inode(inode);
+
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+
+ if (!S_ISLNK(*mode)) {
+ err = ceph_pre_init_acls(dir, mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+ }
+
+ inode_state_assign_raw(inode, 0);
+ inode->i_mode = *mode;
+
+ err = ceph_security_init_secctx(dentry, *mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+
+ /*
+ * We'll skip setting fscrypt context for snapshots, leaving that for
+ * the handle_reply().
+ */
+ if (ceph_snap(dir) != CEPH_SNAPDIR) {
+ err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
+ if (err)
+ goto out_err;
}
- dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
- vino.snap, inode);
+ return inode;
+out_err:
+ iput(inode);
+ return ERR_PTR(err);
+}
+
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ if (as_ctx->pagelist) {
+ req->r_pagelist = as_ctx->pagelist;
+ as_ctx->pagelist = NULL;
+ }
+ ceph_fscrypt_as_ctx_to_req(req, as_ctx);
+}
+
+/**
+ * ceph_get_inode - find or create/hash a new inode
+ * @sb: superblock to search and allocate in
+ * @vino: vino to search for
+ * @newino: optional new inode to insert if one isn't found (may be NULL)
+ *
+ * Search for or insert a new inode into the hash for the given vino, and
+ * return a reference to it. If new is non-NULL, its reference is consumed.
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
+ struct inode *newino)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct inode *inode;
+
+ if (ceph_vino_is_reserved(vino))
+ return ERR_PTR(-EREMOTEIO);
+
+ if (newino) {
+ inode = inode_insert5(newino, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (inode != newino)
+ iput(newino);
+ } else {
+ inode = iget5_locked(sb, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ }
+
+ if (!inode) {
+ doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
+ ceph_present_inode(inode), ceph_vinop(inode), inode,
+ !!(inode_state_read_once(inode) & I_NEW));
return inode;
}
/*
- * get/constuct snapdir inode for a given directory
+ * get/construct snapdir inode for a given directory
*/
struct inode *ceph_get_snapdir(struct inode *parent)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct ceph_vino vino = {
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
};
- struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = -ENOTDIR;
- BUG_ON(!S_ISDIR(parent->i_mode));
if (IS_ERR(inode))
return inode;
+
+ if (!S_ISDIR(parent->i_mode)) {
+ pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
+ goto err;
+ }
+
+ if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
+ goto err;
+ }
+
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
- ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
+ inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
+ inode_set_atime_to_ts(inode, inode_get_atime(parent));
ci->i_rbytes = 0;
+ ci->i_btime = ceph_inode(parent)->i_btime;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ /* if encrypted, just borrow fscrypt_auth from parent */
+ if (IS_ENCRYPTED(parent)) {
+ struct ceph_inode_info *pci = ceph_inode(parent);
+
+ ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
+ pci->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (ci->fscrypt_auth) {
+ inode->i_flags |= S_ENCRYPTED;
+ ci->fscrypt_auth_len = pci->fscrypt_auth_len;
+ } else {
+ doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+#endif
+ if (inode_state_read_once(inode) & I_NEW) {
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ unlock_new_inode(inode);
+ }
+
return inode;
+err:
+ if ((inode_state_read_once(inode) & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(ret);
}
const struct inode_operations ceph_file_iops = {
.permission = ceph_permission,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+ .get_inode_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
};
@@ -111,6 +301,8 @@ const struct inode_operations ceph_file_iops = {
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
u32 f)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_frag *frag;
@@ -130,12 +322,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
}
frag = kmalloc(sizeof(*frag), GFP_NOFS);
- if (!frag) {
- pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
- "frag %x\n", &ci->vfs_inode,
- ceph_vinop(&ci->vfs_inode), f);
+ if (!frag)
return ERR_PTR(-ENOMEM);
- }
+
frag->frag = f;
frag->split_by = 0;
frag->mds = -1;
@@ -144,8 +333,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_link_node(&frag->node, parent, p);
rb_insert_color(&frag->node, &ci->i_fragtree);
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
return frag;
}
@@ -175,10 +363,10 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
* specified, copy the frag delegation info to the caller if
* it is present.
*/
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
unsigned nway, i;
@@ -187,7 +375,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
if (found)
*found = 0;
- mutex_lock(&ci->i_fragtree_mutex);
while (1) {
WARN_ON(!ceph_frag_contains_value(t, v));
frag = __ceph_find_frag(ci, t);
@@ -203,8 +390,8 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
/* choose child */
nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
+ doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
for (i = 0; i < nway; i++) {
n = ceph_frag_make_child(t, frag->split_by, i);
if (ceph_frag_contains_value(n, v)) {
@@ -214,12 +401,21 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
BUG_ON(i == nway);
}
- dout("choose_frag(%x) = %x\n", v, t);
+ doutc(cl, "frag(%x) = %x\n", v, t);
- mutex_unlock(&ci->i_fragtree_mutex);
return t;
}
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
/*
* Process dirfrag (delegation) info from the mds. Include leaf
* fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -229,29 +425,39 @@ static int ceph_fill_dirfrag(struct inode *inode,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
int i;
int err = 0;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (mds == -1) /* CDIR_AUTH_PARENT */
+ mds = diri_auth;
+
mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
+ if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
frag = __ceph_find_frag(ci, id);
if (!frag)
goto out;
if (frag->split_by == 0) {
/* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
+ doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
+ inode, ceph_vinop(inode), id);
rb_erase(&frag->node, &ci->i_fragtree);
kfree(frag);
} else {
/* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
+ doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
+ inode, ceph_vinop(inode), id);
frag->mds = -1;
frag->ndist = 0;
}
@@ -264,8 +470,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
if (IS_ERR(frag)) {
/* this is not the end of the world; we can continue
with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
+ inode, ceph_vinop(inode),
+ le32_to_cpu(dirinfo->frag));
err = -ENOMEM;
goto out;
}
@@ -274,39 +481,167 @@ static int ceph_fill_dirfrag(struct inode *inode,
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
for (i = 0; i < frag->ndist; i++)
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
+ doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
+ ceph_vinop(inode), frag->frag, frag->ndist);
out:
mutex_unlock(&ci->i_fragtree_mutex);
return err;
}
+static int frag_tree_split_cmp(const void *l, const void *r)
+{
+ struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
+ struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
+ return ceph_frag_compare(le32_to_cpu(ls->frag),
+ le32_to_cpu(rs->frag));
+}
+
+static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
+{
+ if (!frag)
+ return f == ceph_frag_make(0, 0);
+ if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
+ return false;
+ return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
+}
+
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag, *prev_frag = NULL;
+ struct rb_node *rb_node;
+ unsigned i, split_by, nsplits;
+ u32 id;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits != ci->i_fragtree_nsplits) {
+ update = true;
+ } else if (nsplits) {
+ i = get_random_u32_below(nsplits);
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ if (nsplits > 1) {
+ sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
+ frag_tree_split_cmp, NULL);
+ }
+
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ split_by = le32_to_cpu(fragtree->splits[i].by);
+ if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
+ pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", inode,
+ ceph_vinop(inode), i, nsplits, id, split_by);
+ continue;
+ }
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ if (frag->split_by == 0)
+ ci->i_fragtree_nsplits++;
+ frag->split_by = split_by;
+ doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
+ prev_frag = frag;
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ /* delete stale split/leaf node */
+ if (frag->split_by > 0 ||
+ !is_frag_child(frag->frag, prev_frag)) {
+ rb_erase(&frag->node, &ci->i_fragtree);
+ if (frag->split_by > 0)
+ ci->i_fragtree_nsplits--;
+ kfree(frag);
+ }
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
/*
* initialize a newly allocated inode.
*/
struct inode *ceph_alloc_inode(struct super_block *sb)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ doutc(fsc->client, "%p\n", &ci->netfs.inode);
+
+ /* Set parameters for the netfs library */
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
spin_lock_init(&ci->i_ceph_lock);
ci->i_version = 0;
+ ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
- atomic_set(&ci->i_release_count, 1);
- atomic_set(&ci->i_complete_count, 0);
+ atomic64_set(&ci->i_ordered_count, 1);
+ atomic64_set(&ci->i_release_count, 1);
+ atomic64_set(&ci->i_complete_seq[0], 0);
+ atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
+ ci->i_max_bytes = 0;
+ ci->i_max_files = 0;
+
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -327,26 +662,24 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
- ci->i_cap_flush_last_tid = 0;
- memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+ ci->i_prealloc_cap_flush = NULL;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
- ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
+ mutex_init(&ci->i_truncate_mutex);
ci->i_truncate_seq = 0;
ci->i_truncate_size = 0;
ci->i_truncate_pending = 0;
+ ci->i_truncate_pagecache_size = 0;
ci->i_max_size = 0;
ci->i_reported_size = 0;
@@ -358,68 +691,92 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
+ ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
- ci->i_shared_gen = 0;
+ atomic_set(&ci->i_filelock_ref, 0);
+ atomic_set(&ci->i_shared_gen, 1);
ci->i_rdcache_gen = 0;
ci->i_rdcache_revoking = 0;
- INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
INIT_LIST_HEAD(&ci->i_snap_realm_item);
INIT_LIST_HEAD(&ci->i_snap_flush_item);
- INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
- INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
-
- INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
-
- return &ci->vfs_inode;
+ INIT_WORK(&ci->i_work, ceph_inode_work);
+ ci->i_work_mask = 0;
+ memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+#ifdef CONFIG_FS_ENCRYPTION
+ ci->i_crypt_info = NULL;
+ ci->fscrypt_auth = NULL;
+ ci->fscrypt_auth_len = 0;
+#endif
+ return &ci->netfs.inode;
}
-static void ceph_i_callback(struct rcu_head *head)
+void ceph_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
struct ceph_inode_info *ci = ceph_inode(inode);
+ kfree(ci->i_symlink);
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(ci->fscrypt_auth);
+#endif
+ fscrypt_free_inode(inode);
kmem_cache_free(ceph_inode_cachep, ci);
}
-void ceph_destroy_inode(struct inode *inode)
+void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ percpu_counter_dec(&mdsc->metric.total_inodes);
+
+ netfs_wait_for_outstanding_io(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
+ ceph_fscache_unuse_cookie(inode, true);
+ clear_inode(inode);
- ceph_queue_caps_release(inode);
+ ceph_fscache_unregister_inode_cookie(ci);
+ fscrypt_put_encryption_info(inode);
+
+ __ceph_remove_caps(ci);
+
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
+ ceph_adjust_quota_realms_count(inode, false);
/*
* we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
+ * caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct ceph_snap_realm *realm = ci->i_snap_realm;
-
- dout(" dropping residual ref to snap realm %p\n", realm);
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ doutc(cl, " dropping residual ref to snap realm %p\n",
+ ci->i_snap_realm);
+ ceph_change_snap_realm(inode, NULL);
+ } else {
+ ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+ ci->i_snap_realm = NULL;
+ }
}
- kfree(ci->i_symlink);
while ((n = rb_first(&ci->i_fragtree)) != NULL) {
frag = rb_entry(n, struct ceph_inode_frag, node);
rb_erase(n, &ci->i_fragtree);
kfree(frag);
}
+ ci->i_fragtree_nsplits = 0;
__ceph_destroy_xattrs(ci);
if (ci->i_xattrs.blob)
@@ -427,9 +784,14 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- call_rcu(&inode->i_rcu, ceph_i_callback);
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
+static inline blkcnt_t calc_inode_blocks(u64 size)
+{
+ return (size + (1<<9) - 1) >> 9;
+}
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
@@ -442,50 +804,84 @@ void ceph_destroy_inode(struct inode *inode)
int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
+ loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
- (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
- dout("size %lld -> %llu\n", inode->i_size, size);
- inode->i_size = size;
- inode->i_blocks = (size + (1<<9) - 1) >> 9;
+ (truncate_seq == ci->i_truncate_seq && size > isize)) {
+ doutc(cl, "size %lld -> %llu\n", isize, size);
+ if (size > 0 && S_ISDIR(inode->i_mode)) {
+ pr_err_client(cl, "non-zero size for directory\n");
+ size = 0;
+ }
+ i_size_write(inode, size);
+ inode->i_blocks = calc_inode_blocks(size);
+ /*
+ * If we're expanding, then we should be able to just update
+ * the existing cookie.
+ */
+ if (size > isize)
+ ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("truncate_seq %u -> %u\n",
- ci->i_truncate_seq, truncate_seq);
+ doutc(cl, "truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
+
+ /* the MDS should have revoked these caps */
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
* not the only client referencing this file and we
* don't hold those caps, then we need to check whether
* the file is either opened or mmaped
*/
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
- CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
- CEPH_CAP_FILE_EXCL|
- CEPH_CAP_FILE_LAZYIO)) ||
+ if ((issued & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
- __ceph_caps_file_wanted(ci)) {
+ __ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
}
}
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
- ci->i_truncate_size != truncate_size) {
- dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
- truncate_size);
+
+ /*
+ * It's possible that the new sizes of the two consecutive
+ * size truncations will be in the same fscrypt last block,
+ * and we need to truncate the corresponding page caches
+ * anyway.
+ */
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
+ doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
+ ci->i_truncate_size, truncate_size,
+ !!IS_ENCRYPTED(inode));
+
ci->i_truncate_size = truncate_size;
+
+ if (IS_ENCRYPTED(inode)) {
+ doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
+ ci->i_truncate_pagecache_size, size);
+ ci->i_truncate_pagecache_size = size;
+ } else {
+ ci->i_truncate_pagecache_size = truncate_size;
+ }
}
return queue_trunc;
}
void ceph_fill_file_time(struct inode *inode, int issued,
- u64 time_warp_seq, struct timespec *ctime,
- struct timespec *mtime, struct timespec *atime)
+ u64 time_warp_seq, struct timespec64 *ctime,
+ struct timespec64 *mtime, struct timespec64 *atime)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct timespec64 iatime = inode_get_atime(inode);
+ struct timespec64 ictime = inode_get_ctime(inode);
+ struct timespec64 imtime = inode_get_mtime(inode);
int warn = 0;
if (issued & (CEPH_CAP_FILE_EXCL|
@@ -493,38 +889,29 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_FILE_BUFFER|
CEPH_CAP_AUTH_EXCL|
CEPH_CAP_XATTR_EXCL)) {
- if (timespec_compare(ctime, &inode->i_ctime) > 0) {
- dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
- ctime->tv_sec, ctime->tv_nsec);
- inode->i_ctime = *ctime;
+ if (ci->i_version == 0 ||
+ timespec64_compare(ctime, &ictime) > 0) {
+ doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
+ inode_set_ctime_to_ts(inode, *ctime);
}
- if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+ if (ci->i_version == 0 ||
+ ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- dout("mtime %ld.%09ld -> %ld.%09ld "
- "tw %d -> %d\n",
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec,
- ci->i_time_warp_seq, (int)time_warp_seq);
-
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
+ ci->i_time_warp_seq, (int)time_warp_seq);
+
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
/* nobody did utimes(); take the max */
- if (timespec_compare(mtime, &inode->i_mtime) > 0) {
- dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
- inode->i_mtime.tv_sec,
- inode->i_mtime.tv_nsec,
- mtime->tv_sec, mtime->tv_nsec);
- inode->i_mtime = *mtime;
+ if (timespec64_compare(mtime, &imtime) > 0) {
+ doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
+ inode_set_mtime_to_ts(inode, *mtime);
}
- if (timespec_compare(atime, &inode->i_atime) > 0) {
- dout("atime %ld.%09ld -> %ld.%09ld inc\n",
- inode->i_atime.tv_sec,
- inode->i_atime.tv_nsec,
- atime->tv_sec, atime->tv_nsec);
- inode->i_atime = *atime;
+ if (timespec64_compare(atime, &iatime) > 0) {
+ doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
+ inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
/* we did a utimes(); ignore mds values */
@@ -534,43 +921,115 @@ void ceph_fill_file_time(struct inode *inode, int issued,
} else {
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
- inode->i_ctime = *ctime;
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_ctime_to_ts(inode, *ctime);
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else {
warn = 1;
}
}
if (warn) /* time_warp_seq shouldn't go backwards */
- dout("%p mds time_warp_seq %llu < %u\n",
- inode, time_warp_seq, ci->i_time_warp_seq);
+ doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
+ time_warp_seq, ci->i_time_warp_seq);
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int enclen, u8 **decsym)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int declen;
+ u8 *sym;
+
+ sym = kmalloc(enclen + 1, GFP_NOFS);
+ if (!sym)
+ return -ENOMEM;
+
+ declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP);
+ if (declen < 0) {
+ pr_err_client(cl,
+ "can't decode symlink (%d). Content: %.*s\n",
+ declen, enclen, encsym);
+ kfree(sym);
+ return -EIO;
+ }
+ sym[declen + 1] = '\0';
+ *decsym = sym;
+ return declen;
+}
+#else
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int symlen, u8 **decsym)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
/*
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
-static int fill_inode(struct inode *inode,
- struct ceph_mds_reply_info_in *iinfo,
- struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session,
- unsigned long ttl_from, int cap_fmode,
- struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued = 0, implemented;
- struct timespec mtime, atime, ctime;
- u32 nsplits;
+ int issued, new_issued, info_caps;
+ struct timespec64 mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_buffer *old_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
+ struct ceph_cap *new_cap = NULL;
int err = 0;
- int queue_trunc = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
+ bool fill_inline = false;
+ umode_t mode = le32_to_cpu(info->mode);
+ dev_t rdev = le32_to_cpu(info->rdev);
+
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
+ doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
+ le64_to_cpu(info->version), ci->i_version);
- dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
- inode, ceph_vinop(inode), le64_to_cpu(info->version),
- ci->i_version);
+ /* Once I_NEW is cleared, we can't change type or dev numbers */
+ if (inode_state_read_once(inode) & I_NEW) {
+ inode->i_mode = mode;
+ } else {
+ if (inode_wrong_type(inode, mode)) {
+ pr_warn_once_client(cl,
+ "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ return -ESTALE;
+ }
+
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
+ pr_warn_once_client(cl,
+ "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
+ return -ESTALE;
+ }
+ }
+
+ info_caps = le32_to_cpu(info->cap.caps);
+
+ /* prealloc new cap struct */
+ if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+ if (!new_cap)
+ return -ENOMEM;
+ }
/*
* prealloc xattr data, if it looks like we'll need it. only
@@ -580,10 +1039,17 @@ static int fill_inode(struct inode *inode,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
- iinfo->xattr_len);
+ pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
}
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
+ if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+ ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -597,77 +1063,158 @@ static int fill_inode(struct inode *inode,
* 3 2 skip
* 3 3 update
*/
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
- /* update inode */
- ci->i_version = le64_to_cpu(info->version);
- inode->i_version++;
- inode->i_rdev = le32_to_cpu(info->rdev);
-
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(info->mode);
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
+
+ /* Update change_attribute */
+ inode_set_max_iversion_raw(inode, iinfo->change_attr);
+
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ new_issued = ~issued & info_caps;
+
+ __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+
+#ifdef CONFIG_FS_ENCRYPTION
+ if (iinfo->fscrypt_auth_len &&
+ ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+ ci->fscrypt_auth = iinfo->fscrypt_auth;
+ iinfo->fscrypt_auth = NULL;
+ iinfo->fscrypt_auth_len = 0;
+ inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+ }
+#endif
+
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
+ ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
+ ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ /* directories have fl_stripe_unit set to zero */
+ if (IS_ENCRYPTED(inode))
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+ else if (le32_to_cpu(info->layout.fl_stripe_unit))
+ inode->i_blkbits =
+ fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ else
+ inode->i_blkbits = CEPH_BLOCK_SHIFT;
+
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
- }
-
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec64(&atime, &info->atime);
+ ceph_decode_timespec64(&mtime, &info->mtime);
+ ceph_decode_timespec64(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
+ ci->i_files = le64_to_cpu(info->files);
+ ci->i_subdirs = le64_to_cpu(info->subdirs);
+ }
+
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ u64 size = le64_to_cpu(info->size);
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ pool_ns = old_ns;
+
+ if (IS_ENCRYPTED(inode) && size &&
+ iinfo->fscrypt_file_len == sizeof(__le64)) {
+ u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
+
+ if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
+ size = fsize;
+ } else {
+ pr_warn_client(cl,
+ "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ info->size, size);
+ }
+ }
+
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ size);
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ doutc(cl, "max_size %lld -> %llu\n",
+ ci->i_max_size, le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
+
+ /* layout and rstat are not tracked by capability, update them if
+ * the inode info is from auth mds */
+ if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
+ if (S_ISDIR(inode->i_mode)) {
+ ci->i_dir_layout = iinfo->dir_layout;
+ ci->i_rbytes = le64_to_cpu(info->rbytes);
+ ci->i_rfiles = le64_to_cpu(info->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ci->i_dir_pin = iinfo->dir_pin;
+ ci->i_rsnaps = iinfo->rsnaps;
+ ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
+ }
+ }
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
- if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = xattr_blob;
if (xattr_blob)
memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ ceph_forget_all_cached_acls(inode);
+ ceph_security_invalidate_secctx(inode);
xattr_blob = NULL;
}
+ /* finally update i_version */
+ if (le64_to_cpu(info->version) > ci->i_version)
+ ci->i_version = le64_to_cpu(info->version);
+
inode->i_mapping->a_ops = &ceph_aops;
- inode->i_mapping->backing_dev_info =
- &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFBLK:
case S_IFCHR:
case S_IFSOCK:
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_blkbits = PAGE_SHIFT;
+ init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ceph_file_iops;
break;
case S_IFREG:
@@ -675,21 +1222,45 @@ static int fill_inode(struct inode *inode,
inode->i_fop = &ceph_file_fops;
break;
case S_IFLNK:
- inode->i_op = &ceph_symlink_iops;
if (!ci->i_symlink) {
u32 symlen = iinfo->symlink_len;
char *sym;
spin_unlock(&ci->i_ceph_lock);
- err = -EINVAL;
- if (WARN_ON(symlen != inode->i_size))
- goto out;
+ if (IS_ENCRYPTED(inode)) {
+ if (symlen != i_size_read(inode))
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
+ i_size_read(inode));
+
+ err = decode_encrypted_symlink(mdsc, iinfo->symlink,
+ symlen, (u8 **)&sym);
+ if (err < 0) {
+ pr_err_client(cl,
+ "decoding encrypted symlink failed: %d\n",
+ err);
+ goto out;
+ }
+ symlen = err;
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ } else {
+ if (symlen != i_size_read(inode)) {
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
- err = -ENOMEM;
- sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
- if (!sym)
- goto out;
+ err = -ENOMEM;
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+ if (!sym)
+ goto out;
+ }
spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
@@ -697,228 +1268,276 @@ static int fill_inode(struct inode *inode,
else
kfree(sym); /* lost a race */
}
+
+ if (IS_ENCRYPTED(inode)) {
+ /*
+ * Encrypted symlinks need to be decrypted before we can
+ * cache their targets in i_link. Don't touch it here.
+ */
+ inode->i_op = &ceph_encrypted_symlink_iops;
+ } else {
+ inode->i_link = ci->i_symlink;
+ inode->i_op = &ceph_symlink_iops;
+ }
break;
case S_IFDIR:
inode->i_op = &ceph_dir_iops;
inode->i_fop = &ceph_dir_fops;
-
- ci->i_dir_layout = iinfo->dir_layout;
-
- ci->i_files = le64_to_cpu(info->files);
- ci->i_subdirs = le64_to_cpu(info->subdirs);
- ci->i_rbytes = le64_to_cpu(info->rbytes);
- ci->i_rfiles = le64_to_cpu(info->rfiles);
- ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
- ceph_decode_timespec(&ci->i_rctime, &info->rctime);
break;
default:
- pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
- ceph_vinop(inode), inode->i_mode);
- }
-
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
- __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
- ci->i_max_offset = 2;
- }
-no_change:
- spin_unlock(&ci->i_ceph_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
- if (IS_ERR(frag))
- continue;
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode);
}
- mutex_unlock(&ci->i_fragtree_mutex);
/* were we issued a capability? */
- if (info->cap.caps) {
+ if (info_caps) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode,
- le32_to_cpu(info->cap.caps),
+ info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
+ info->cap.flags, &new_cap);
+
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ (info_caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ doutc(cl, " marking %p complete (empty)\n",
+ inode);
+ i_size_write(inode, 0);
+ __ceph_dir_set_complete(ci,
+ atomic64_read(&ci->i_release_count),
+ atomic64_read(&ci->i_ordered_count));
+ }
+
+ wake = true;
} else {
- spin_lock(&ci->i_ceph_lock);
- dout(" %p got snap_caps %s\n", inode,
- ceph_cap_string(le32_to_cpu(info->cap.caps)));
- ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
- if (cap_fmode >= 0)
- __ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&ci->i_ceph_lock);
+ doutc(cl, " %p got snap_caps %s\n", inode,
+ ceph_cap_string(info_caps));
+ ci->i_snap_caps |= info_caps;
}
- } else if (cap_fmode >= 0) {
- pr_warning("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
- __ceph_get_fmode(ci, cap_fmode);
}
+ if (iinfo->inline_version > 0 &&
+ iinfo->inline_version >= ci->i_inline_version) {
+ int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ ci->i_inline_version = iinfo->inline_version;
+ if (ceph_has_inline_data(ci) &&
+ (locked_page || (info_caps & cache_caps)))
+ fill_inline = true;
+ }
+
+ if (cap_fmode >= 0) {
+ if (!info_caps)
+ pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
+ __ceph_touch_fmode(ci, mdsc, cap_fmode);
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+
+ ceph_fscache_register_inode_cookie(inode);
+
+ if (fill_inline)
+ ceph_fill_inline_data(inode, locked_page,
+ iinfo->inline_data, iinfo->inline_len);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
+
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
err = 0;
-
out:
- if (xattr_blob)
- ceph_buffer_put(xattr_blob);
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
+ ceph_buffer_put(old_blob);
+ ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
return err;
}
/*
- * caller should hold session s_mutex.
+ * caller should hold session s_mutex and dentry->d_lock.
*/
-static void update_dentry_lease(struct dentry *dentry,
- struct ceph_mds_reply_lease *lease,
- struct ceph_mds_session *session,
- unsigned long from_time)
+static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time,
+ struct ceph_mds_session **old_lease_session)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- struct inode *dir;
+
+ doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
/* only track leases on regular dentries */
- if (dentry->d_op != &ceph_dentry_ops)
+ if (ceph_snap(dir) != CEPH_NOSNAP)
return;
- spin_lock(&dentry->d_lock);
- dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
- dentry, duration, ttl);
-
- /* make lease_rdcache_gen match directory */
- dir = dentry->d_parent->d_inode;
- di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
-
- if (duration == 0)
- goto out_unlock;
+ if (mask & CEPH_LEASE_PRIMARY_LINK)
+ di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+ else
+ di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
- if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
- goto out_unlock; /* we already have a newer lease. */
+ di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
+ if (!(mask & CEPH_LEASE_VALID)) {
+ __ceph_dentry_dir_lease_touch(di);
+ return;
+ }
- if (di->lease_session && di->lease_session != session)
- goto out_unlock;
+ if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
+ time_before(ttl, di->time))
+ return; /* we already have a newer lease. */
- ceph_dentry_lru_touch(dentry);
+ if (di->lease_session && di->lease_session != session) {
+ *old_lease_session = di->lease_session;
+ di->lease_session = NULL;
+ }
if (!di->lease_session)
di->lease_session = ceph_get_mds_session(session);
- di->lease_gen = session->s_cap_gen;
+ di->lease_gen = atomic_read(&session->s_cap_gen);
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
- dentry->d_time = ttl;
-out_unlock:
+ di->time = ttl;
+
+ __ceph_dentry_lease_touch(di);
+}
+
+static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time)
+{
+ struct ceph_mds_session *old_lease_session = NULL;
+ spin_lock(&dentry->d_lock);
+ __update_dentry_lease(dir, dentry, lease, session, from_time,
+ &old_lease_session);
spin_unlock(&dentry->d_lock);
- return;
+ ceph_put_mds_session(old_lease_session);
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
+ * update dentry lease without having parent inode locked
*/
-static void ceph_set_dentry_offset(struct dentry *dn)
+static void update_dentry_lease_careful(struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time,
+ char *dname, u32 dname_len,
+ struct ceph_vino *pdvino,
+ struct ceph_vino *ptvino)
+
{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci;
- struct ceph_dentry_info *di;
+ struct inode *dir;
+ struct ceph_mds_session *old_lease_session = NULL;
- BUG_ON(!inode);
+ spin_lock(&dentry->d_lock);
+ /* make sure dentry's name matches target */
+ if (dentry->d_name.len != dname_len ||
+ memcmp(dentry->d_name.name, dname, dname_len))
+ goto out_unlock;
- ci = ceph_inode(inode);
- di = ceph_dentry(dn);
+ dir = d_inode(dentry->d_parent);
+ /* make sure parent matches dvino */
+ if (!ceph_ino_compare(dir, pdvino))
+ goto out_unlock;
- spin_lock(&ci->i_ceph_lock);
- if (!__ceph_dir_is_complete(ci)) {
- spin_unlock(&ci->i_ceph_lock);
- return;
+ /* make sure dentry's inode matches target. NULL ptvino means that
+ * we expect a negative dentry */
+ if (ptvino) {
+ if (d_really_is_negative(dentry))
+ goto out_unlock;
+ if (!ceph_ino_compare(d_inode(dentry), ptvino))
+ goto out_unlock;
+ } else {
+ if (d_really_is_positive(dentry))
+ goto out_unlock;
}
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&ci->i_ceph_lock);
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
+ __update_dentry_lease(dir, dentry, lease, session,
+ from_time, &old_lease_session);
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ ceph_put_mds_session(old_lease_session);
}
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
+ * caller must hold directory i_rwsem for this to be safe.
*/
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
+static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct ceph_client *cl = ceph_inode_to_client(in);
+ struct dentry *dn = *pdn;
struct dentry *realdn;
- BUG_ON(dn->d_inode);
+ BUG_ON(d_inode(dn));
+
+ if (S_ISDIR(in->i_mode)) {
+ /* If inode is directory, d_splice_alias() below will remove
+ * 'realdn' from its origin parent. We need to ensure that
+ * origin parent's readdir cache will not reference 'realdn'
+ */
+ realdn = d_find_any_alias(in);
+ if (realdn) {
+ struct ceph_dentry_info *di = ceph_dentry(realdn);
+ spin_lock(&realdn->d_lock);
+
+ realdn->d_op->d_prune(realdn);
+
+ di->time = jiffies;
+ di->lease_shared_gen = 0;
+ di->offset = 0;
+
+ spin_unlock(&realdn->d_lock);
+ dput(realdn);
+ }
+ }
/* dn must be unhashed */
if (!d_unhashed(dn))
d_drop(dn);
- realdn = d_materialise_unique(dn, in);
+ realdn = d_splice_alias(in, dn);
if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
- PTR_ERR(realdn), dn, in, ceph_vinop(in));
- if (prehash)
- *prehash = false; /* don't rehash on error */
- dn = realdn; /* note realdn contains the error */
- goto out;
- } else if (realdn) {
- dout("dn %p (%d) spliced with %p (%d) "
- "inode %p ino %llx.%llx\n",
- dn, d_count(dn),
- realdn, d_count(realdn),
- realdn->d_inode, ceph_vinop(realdn->d_inode));
+ pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
+ return PTR_ERR(realdn);
+ }
+
+ if (realdn) {
+ doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
+ dn, d_count(dn), realdn, d_count(realdn),
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
- dn = realdn;
+ *pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
- dout("dn %p attached to %p ino %llx.%llx\n",
- dn, dn->d_inode, ceph_vinop(dn->d_inode));
+ doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
+ d_inode(dn), ceph_vinop(d_inode(dn)));
}
- if ((!prehash || *prehash) && d_unhashed(dn))
- d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
-out:
- return dn;
+ return 0;
}
/*
@@ -932,81 +1551,153 @@ out:
*
* Called with snap_rwsem (read).
*/
-int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
- struct ceph_mds_session *session)
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
{
+ struct ceph_mds_session *session = req->r_session;
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
- struct ceph_vino vino;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- int i = 0;
+ struct ceph_vino tvino, dvino;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
+ struct inode *parent_dir = NULL;
int err = 0;
- dout("fill_trace %p is_dentry %d is_target %d\n", req,
- rinfo->head->is_dentry, rinfo->head->is_target);
-
-#if 0
- /*
- * Debugging hook:
- *
- * If we resend completed ops to a recovering mds, we get no
- * trace. Since that is very rare, pretend this is the case
- * to ensure the 'no trace' handlers in the callers behave.
- *
- * Fill in inodes unconditionally to avoid breaking cap
- * invariants.
- */
- if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
- pr_info("fill_trace faking empty trace on %lld %s\n",
- req->r_tid, ceph_mds_op_name(rinfo->head->op));
- if (rinfo->head->is_dentry) {
- rinfo->head->is_dentry = 0;
- err = fill_inode(req->r_locked_dir,
- &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1);
- }
- if (rinfo->head->is_target) {
- rinfo->head->is_target = 0;
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- err = fill_inode(in, &rinfo->targeti, NULL,
- session, req->r_request_started,
- req->r_fmode);
- iput(in);
- }
- }
-#endif
+ doutc(cl, "%p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
- dout("fill_trace reply is empty!\n");
- if (rinfo->head->result == 0 && req->r_locked_dir)
+ doutc(cl, "reply is empty!\n");
+ if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req);
return 0;
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_locked_dir;
-
- if (dir) {
- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1,
- &req->r_caps_reservation);
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
+ if (unlikely(IS_ERR(parent_dir))) {
+ err = PTR_ERR(parent_dir);
+ goto done;
+ }
+ if (parent_dir) {
+ err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
+ rinfo->dirfrag, session, -1,
+ &req->r_caps_reservation);
if (err < 0)
- return err;
+ goto done;
} else {
WARN_ON_ONCE(1);
}
+
+ if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ bool is_nokey = false;
+ struct qstr dname;
+ struct dentry *dn, *parent;
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname = { .dir = parent_dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+ .ctext_len = rinfo->altname_len };
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(parent_dir);
+ BUG_ON(!parent);
+
+ err = ceph_fname_alloc_buffer(parent_dir, &oname);
+ if (err < 0) {
+ dput(parent);
+ goto done;
+ }
+
+ err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+ if (err < 0) {
+ dput(parent);
+ ceph_fname_free_buffer(parent_dir, &oname);
+ goto done;
+ }
+ dname.name = oname.name;
+ dname.len = oname.len;
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (!dn) {
+ dput(parent);
+ ceph_fname_free_buffer(parent_dir, &oname);
+ err = -ENOMEM;
+ goto done;
+ }
+ if (is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
+ err = 0;
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
+ ceph_dir_clear_ordered(parent_dir);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+ ceph_fname_free_buffer(parent_dir, &oname);
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
+ }
+
+ if (rinfo->head->is_target) {
+ /* Should be filled in by handle_reply */
+ BUG_ON(!req->r_target_inode);
+
+ in = req->r_target_inode;
+ err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+ NULL, session,
+ (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
+ rinfo->head->result == 0) ? req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err_client(cl, "badness %p %llx.%llx\n", in,
+ ceph_vinop(in));
+ req->r_target_inode = NULL;
+ if (inode_state_read_once(in) & I_NEW)
+ discard_new_inode(in);
+ else
+ iput(in);
+ goto done;
+ }
+ if (inode_state_read_once(in) & I_NEW)
+ unlock_new_inode(in);
}
/*
* ignore null lease/binding on snapdir ENOENT, or else we
* will have trouble splicing in the virtual snapdir later
*/
- if (rinfo->head->is_dentry && !req->r_aborted &&
- req->r_locked_dir &&
+ if (rinfo->head->is_dentry &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) {
@@ -1015,17 +1706,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* mknod symlink mkdir : null -> new inode
* unlink : linked -> null
*/
- struct inode *dir = req->r_locked_dir;
+ struct inode *dir = req->r_parent;
struct dentry *dn = req->r_dentry;
bool have_dir_cap, have_lease;
BUG_ON(!dn);
BUG_ON(!dir);
- BUG_ON(dn->d_parent->d_inode != dir);
- BUG_ON(ceph_ino(dir) !=
- le64_to_cpu(rinfo->diri.in->ino));
- BUG_ON(ceph_snap(dir) !=
- le64_to_cpu(rinfo->diri.in->snapid));
+ BUG_ON(d_inode(dn->d_parent) != dir);
+
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ BUG_ON(ceph_ino(dir) != dvino.ino);
+ BUG_ON(ceph_snap(dir) != dvino.snap);
/* do we have a lease on the whole dir? */
have_dir_cap =
@@ -1036,157 +1729,130 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
have_lease = have_dir_cap ||
le32_to_cpu(rinfo->dlease->duration_ms);
if (!have_lease)
- dout("fill_trace no dentry lease or dir cap\n");
+ doutc(cl, "no dentry lease or dir cap\n");
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
- dout(" src %p '%.*s' dst %p '%.*s'\n",
- req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
- dout("fill_trace doing d_move %p -> %p\n",
- req->r_old_dentry, dn);
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
+ doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
- dout(" src %p '%.*s' dst %p '%.*s'\n",
- req->r_old_dentry,
- req->r_old_dentry->d_name.len,
- req->r_old_dentry->d_name.name,
- dn, dn->d_name.len, dn->d_name.name);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when dir is
- * complete.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
- ceph_dentry(req->r_old_dentry)->offset);
+ doutc(cl, "dn %p gets new offset %lld\n",
+ req->r_old_dentry,
+ ceph_dentry(req->r_old_dentry)->offset);
- dn = req->r_old_dentry; /* use old_dentry */
- in = dn->d_inode;
+ /* swap r_dentry and r_old_dentry in case that
+ * splice_dentry() gets called later. This is safe
+ * because no other place will use them */
+ req->r_dentry = req->r_old_dentry;
+ req->r_old_dentry = dn;
+ dn = req->r_dentry;
}
/* null dentry? */
if (!rinfo->head->is_target) {
- dout("fill_trace null dentry\n");
- if (dn->d_inode) {
- dout("d_delete %p\n", dn);
+ doutc(cl, "null dentry\n");
+ if (d_really_is_positive(dn)) {
+ doutc(cl, "d_delete %p\n", dn);
+ ceph_dir_clear_ordered(dir);
d_delete(dn);
- } else {
- dout("d_instantiate %p NULL\n", dn);
- d_instantiate(dn, NULL);
- if (have_lease && d_unhashed(dn))
- d_rehash(dn);
- update_dentry_lease(dn, rinfo->dlease,
- session,
- req->r_request_started);
+ } else if (have_lease) {
+ if (d_unhashed(dn))
+ d_add(dn, NULL);
}
+
+ if (!d_unhashed(dn) && have_lease)
+ update_dentry_lease(dir, dn,
+ rinfo->dlease, session,
+ req->r_request_started);
+ goto done;
+ }
+
+ if (unlikely(!in)) {
+ err = -EINVAL;
goto done;
}
/* attach proper inode */
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = dn->d_inode;
- if (!in) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_trace bad get_inode "
- "%llx.%llx\n", vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_drop(dn);
- goto done;
- }
- dn = splice_dentry(dn, in, &have_lease, true);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
- goto done;
- }
- req->r_dentry = dn; /* may have spliced */
- ihold(in);
- } else if (ceph_ino(in) == vino.ino &&
- ceph_snap(in) == vino.snap) {
+ if (d_really_is_negative(dn)) {
+ ceph_dir_clear_ordered(dir);
ihold(in);
- } else {
- dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, in, ceph_ino(in), ceph_snap(in),
- vino.ino, vino.snap);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
+ goto done;
+ dn = req->r_dentry; /* may have spliced */
+ } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
+ doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+ ceph_vinop(in));
+ d_invalidate(dn);
have_lease = false;
- in = NULL;
}
- if (have_lease)
- update_dentry_lease(dn, rinfo->dlease, session,
+ if (have_lease) {
+ update_dentry_lease(dir, dn,
+ rinfo->dlease, session,
req->r_request_started);
- dout(" final dn %p\n", dn);
- i++;
+ }
+ doutc(cl, " final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
- struct dentry *dn = req->r_dentry;
+ req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ struct inode *dir = req->r_parent;
/* fill out a snapdir LOOKUPSNAP dentry */
- BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_inode get_inode badness %llx.%llx\n",
- vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
- dout(" linking snapped dir %p to dn %p\n", in, dn);
- dn = splice_dentry(dn, in, NULL, true);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
+ BUG_ON(!req->r_dentry);
+ doutc(cl, " linking snapped dir %p to dn %p\n", in,
+ req->r_dentry);
+ ceph_dir_clear_ordered(dir);
+
+ if (unlikely(!in)) {
+ err = -EINVAL;
goto done;
}
- req->r_dentry = dn; /* may have spliced */
- ihold(in);
- rinfo->head->is_dentry = 1; /* fool notrace handlers */
- }
-
- if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
- if (in == NULL || ceph_ino(in) != vino.ino ||
- ceph_snap(in) != vino.snap) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
- }
- req->r_target_inode = in;
-
- err = fill_inode(in,
- &rinfo->targeti, NULL,
- session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
- req->r_fmode : -1,
- &req->r_caps_reservation);
- if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
+ ihold(in);
+ err = splice_dentry(&req->r_dentry, in);
+ if (err < 0)
goto done;
+ } else if (rinfo->head->is_dentry && req->r_dentry) {
+ /* parent inode is not locked, be careful */
+ struct ceph_vino *ptvino = NULL;
+ dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+ if (rinfo->head->is_target) {
+ tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+ ptvino = &tvino;
}
+ update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
+ session, req->r_request_started,
+ rinfo->dname, rinfo->dname_len,
+ &dvino, ptvino);
}
-
done:
- dout("fill_trace done err=%d\n", err);
+ /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
+ if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
+ iput(parent_dir);
+ doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1197,297 +1863,393 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
struct inode *in;
int rc;
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ vino.ino = le64_to_cpu(rde->inode.in->ino);
+ vino.snap = le64_to_cpu(rde->inode.in->snapid);
- in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
- dout("new_inode badness got %d\n", err);
+ doutc(cl, "badness got %d\n", err);
continue;
}
- rc = fill_inode(in, &rinfo->dir_in[i], NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation);
+ rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("fill_inode badness on %p got %d\n", in, rc);
+ pr_err_client(cl, "inode badness on %p got %d\n", in,
+ rc);
err = rc;
- continue;
+ if (inode_state_read_once(in) & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
+ } else if (inode_state_read_once(in) & I_NEW) {
+ unlock_new_inode(in);
}
+
+ iput(in);
}
return err;
}
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+ if (ctl->folio) {
+ folio_release_kmap(ctl->folio, ctl->dentries);
+ ctl->folio = NULL;
+ }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+ struct ceph_readdir_cache_control *ctl,
+ struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
+ unsigned idx = ctl->index % nsize;
+ pgoff_t pgoff = ctl->index / nsize;
+
+ if (!ctl->folio || pgoff != ctl->folio->index) {
+ ceph_readdir_cache_release(ctl);
+ fgf_t fgf = FGP_LOCK;
+
+ if (idx == 0)
+ fgf |= FGP_ACCESSED | FGP_CREAT;
+
+ ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
+ fgf, mapping_gfp_mask(&dir->i_data));
+ if (IS_ERR(ctl->folio)) {
+ int err = PTR_ERR(ctl->folio);
+
+ ctl->folio = NULL;
+ ctl->index = -1;
+ return idx == 0 ? err : 0;
+ }
+ /* reading/filling the cache are serialized by
+ * i_rwsem, no need to use folio lock */
+ folio_unlock(ctl->folio);
+ ctl->dentries = kmap_local_folio(ctl->folio, 0);
+ if (idx == 0)
+ memset(ctl->dentries, 0, PAGE_SIZE);
+ }
+
+ if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+ req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+ doutc(cl, "dn %p idx %d\n", dn, ctl->index);
+ ctl->dentries[idx] = dn;
+ ctl->index++;
+ } else {
+ doutc(cl, "disable readdir cache\n");
+ ctl->index = -1;
+ }
+ return 0;
+}
+
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct dentry *parent = req->r_dentry;
+ struct inode *inode = d_inode(parent);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, i;
- struct inode *snapdir = NULL;
- struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u64 frag = le32_to_cpu(rhead->args.readdir.frag);
- struct ceph_dentry_info *di;
+ int err = 0, skipped = 0, ret, i;
+ u32 frag = le32_to_cpu(req->r_args.readdir.frag);
+ u32 last_hash = 0;
+ u32 fpos_offset;
+ struct ceph_readdir_cache_control cache_ctl = {};
- if (req->r_aborted)
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session);
+ if (rinfo->hash_order) {
+ if (req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2,
+ strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ } else if (rinfo->offset_hash) {
+ /* mds understands offset_hash */
+ WARN_ON_ONCE(req->r_readdir_offset != 2);
+ last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
+ }
+ }
+
+ if (rinfo->dir_dir &&
+ le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ doutc(cl, "got new frag %x -> %x\n", frag,
+ le32_to_cpu(rinfo->dir_dir->frag));
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (!rinfo->hash_order)
+ req->r_readdir_offset = 2;
+ }
+
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- snapdir = ceph_get_snapdir(parent->d_inode);
- parent = d_find_alias(snapdir);
- dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
} else {
- dout("readdir_prepopulate %d items under dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
if (rinfo->dir_dir)
- ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+ ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
+
+ if (ceph_frag_is_leftmost(frag) &&
+ req->r_readdir_offset == 2 &&
+ !(rinfo->hash_order && last_hash)) {
+ /* note dir version at start of readdir so we can
+ * tell if any dentries get dropped */
+ req->r_dir_release_cnt =
+ atomic64_read(&ci->i_release_count);
+ req->r_dir_ordered_cnt =
+ atomic64_read(&ci->i_ordered_count);
+ req->r_readdir_cache_idx = 0;
+ }
}
- for (i = 0; i < rinfo->dir_nr; i++) {
- struct ceph_vino vino;
-
- dname.name = rinfo->dir_dname[i];
- dname.len = rinfo->dir_dname_len[i];
- dname.hash = full_name_hash(dname.name, dname.len);
+ cache_ctl.index = req->r_readdir_cache_idx;
+ fpos_offset = req->r_readdir_offset;
- vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
- vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+ /* FIXME: release caps/leases if error occurs */
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
+ struct ceph_vino tvino;
+
+ dname.name = rde->name;
+ dname.len = rde->name_len;
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
+
+ tvino.ino = le64_to_cpu(rde->inode.in->ino);
+ tvino.snap = le64_to_cpu(rde->inode.in->snapid);
+
+ if (rinfo->hash_order) {
+ u32 hash = ceph_frag_value(rde->raw_hash);
+ if (hash != last_hash)
+ fpos_offset = 2;
+ last_hash = hash;
+ rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
+ } else {
+ rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
+ }
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
- if (dn == NULL) {
- dout("d_alloc badness\n");
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (!dn) {
+ doutc(cl, "d_alloc badness\n");
err = -ENOMEM;
goto out;
}
- err = ceph_init_dentry(dn);
- if (err < 0) {
- dput(dn);
- goto out;
+ if (rde->is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != tvino.ino ||
+ ceph_snap(d_inode(dn)) != tvino.snap)) {
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
+
+ spin_lock(&dn->d_lock);
+ if (di->offset > 0 &&
+ di->lease_shared_gen ==
+ atomic_read(&ci->i_shared_gen)) {
+ __ceph_dir_clear_ordered(ci);
+ di->offset = 0;
}
- } else if (dn->d_inode &&
- (ceph_ino(dn->d_inode) != vino.ino ||
- ceph_snap(dn->d_inode) != vino.snap)) {
- dout(" dn %p points to wrong inode %p\n",
- dn, dn->d_inode);
+ spin_unlock(&dn->d_lock);
+
d_delete(dn);
dput(dn);
goto retry_lookup;
- } else {
- /* reorder parent's d_subdirs */
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &parent->d_subdirs);
- spin_unlock(&dn->d_lock);
- spin_unlock(&parent->d_lock);
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
/* inode */
- if (dn->d_inode) {
- in = dn->d_inode;
+ if (d_really_is_positive(dn)) {
+ in = d_inode(dn);
} else {
- in = ceph_get_inode(parent->d_sb, vino);
+ in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
- dout("new_inode badness\n");
+ doutc(cl, "new_inode badness\n");
d_drop(dn);
dput(dn);
err = PTR_ERR(in);
goto out;
}
- dn = splice_dentry(dn, in, NULL, false);
- if (IS_ERR(dn))
- dn = NULL;
}
- if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation) < 0) {
- pr_err("fill_inode badness on %p\n", in);
+ ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+ -1, &req->r_caps_reservation);
+ if (ret < 0) {
+ pr_err_client(cl, "badness on %p %llx.%llx\n", in,
+ ceph_vinop(in));
+ if (d_really_is_negative(dn)) {
+ if (inode_state_read_once(in) & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
+ iput(in);
+ }
+ d_drop(dn);
+ err = ret;
goto next_item;
}
- if (dn)
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
- req->r_request_started);
+ if (inode_state_read_once(in) & I_NEW)
+ unlock_new_inode(in);
+
+ if (d_really_is_negative(dn)) {
+ if (ceph_security_xattr_deadlock(in)) {
+ doutc(cl, " skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
+ iput(in);
+ skipped++;
+ goto next_item;
+ }
+
+ err = splice_dentry(&dn, in);
+ if (err < 0)
+ goto next_item;
+ }
+
+ ceph_dentry(dn)->offset = rde->offset;
+
+ update_dentry_lease(d_inode(parent), dn,
+ rde->lease, req->r_session,
+ req->r_request_started);
+
+ if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
+ ret = fill_readdir_cache(d_inode(parent), dn,
+ &cache_ctl, req);
+ if (ret < 0)
+ err = ret;
+ }
next_item:
- if (dn)
- dput(dn);
+ dput(dn);
}
- req->r_did_prepopulate = true;
-
out:
- if (snapdir) {
- iput(snapdir);
- dput(parent);
+ if (err == 0 && skipped == 0) {
+ set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
+ req->r_readdir_cache_idx = cache_ctl.index;
}
- dout("readdir_prepopulate done\n");
+ ceph_readdir_cache_release(&cache_ctl);
+ doutc(cl, "done\n");
return err;
}
-int ceph_inode_set_size(struct inode *inode, loff_t size)
+bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- int ret = 0;
+ bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
- inode->i_size = size;
- inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+ doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
+ i_size_write(inode, size);
+ ceph_fscache_update(inode);
+ inode->i_blocks = calc_inode_blocks(size);
- /* tell the MDS if we are approaching max_size */
- if ((size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size)
- ret = 1;
+ ret = __ceph_should_report_size(ci);
spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->wb_wq,
- &ceph_inode(inode)->i_wb_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p failed\n", inode);
- iput(inode);
- }
+ return ret;
}
-static void ceph_writeback_work(struct work_struct *work)
+void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_wb_work);
- struct inode *inode = &ci->vfs_inode;
-
- dout("writeback %p\n", inode);
- filemap_fdatawrite(&inode->i_data);
- iput(inode);
-}
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ set_bit(work_bit, &ci->i_work_mask);
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
- &ceph_inode(inode)->i_pg_inv_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
+ if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
+ ceph_vinop(inode), ci->i_work_mask);
} else {
- dout("ceph_queue_invalidate %p failed\n", inode);
+ doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
+ inode, ceph_vinop(inode), ci->i_work_mask);
iput(inode);
}
}
-/*
- * Invalidate inode pages in a worker thread. (This can't be done
- * in the message handler context.)
- */
-static void ceph_invalidate_work(struct work_struct *work)
+static void ceph_do_invalidate_pages(struct inode *inode)
{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_pg_inv_work);
- struct inode *inode = &ci->vfs_inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
u32 orig_gen;
int check = 0;
+ ceph_fscache_invalidate(inode, false);
+
+ mutex_lock(&ci->i_truncate_mutex);
+
+ if (ceph_inode_is_shutdown(inode)) {
+ pr_warn_ratelimited_client(cl,
+ "%p %llx.%llx is shut down\n", inode,
+ ceph_vinop(inode));
+ mapping_set_error(inode->i_mapping, -EIO);
+ truncate_pagecache(inode, 0);
+ mutex_unlock(&ci->i_truncate_mutex);
+ goto out;
+ }
+
spin_lock(&ci->i_ceph_lock);
- dout("invalidate_pages %p gen %d revoking %d\n", inode,
- ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- /* nevermind! */
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
goto out;
}
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(&inode->i_data, 0);
+ if (invalidate_inode_pages2(inode->i_mapping) < 0) {
+ pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
+ }
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
- dout("invalidate_pages %p gen %d successful\n", inode,
- ci->i_rdcache_gen);
+ doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen);
ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
- inode, orig_gen, ci->i_rdcache_gen,
- ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
+ inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
}
spin_unlock(&ci->i_ceph_lock);
-
- if (check)
- ceph_check_caps(ci, 0, NULL);
+ mutex_unlock(&ci->i_truncate_mutex);
out:
- iput(inode);
-}
-
-
-/*
- * called by trunc_wq;
- *
- * We also truncate in a separate thread as well.
- */
-static void ceph_vmtruncate_work(struct work_struct *work)
-{
- struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
- i_vmtruncate_work);
- struct inode *inode = &ci->vfs_inode;
-
- dout("vmtruncate_work %p\n", inode);
- mutex_lock(&inode->i_mutex);
- __ceph_do_pending_vmtruncate(inode);
- mutex_unlock(&inode->i_mutex);
- iput(inode);
-}
-
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- ihold(inode);
- if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
- &ci->i_vmtruncate_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
- } else {
- dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
- inode, ci->i_truncate_pending);
- iput(inode);
- }
+ if (check)
+ ceph_check_caps(ci, 0);
}
/*
@@ -1496,15 +2258,19 @@ void ceph_queue_vmtruncate(struct inode *inode)
*/
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
int wrbuffer_refs, finish = 0;
+ mutex_lock(&ci->i_truncate_mutex);
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("__do_pending_vmtruncate %p none pending\n", inode);
+ doutc(cl, "%p %llx.%llx none pending\n", inode,
+ ceph_vinop(inode));
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&ci->i_truncate_mutex);
return;
}
@@ -1513,24 +2279,28 @@ retry:
* possibly truncate them.. so write AND block!
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
- dout("__do_pending_vmtruncate %p flushing snaps first\n",
- inode);
spin_unlock(&ci->i_ceph_lock);
+ doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
+ ceph_vinop(inode));
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
}
- to = ci->i_truncate_size;
+ /* there should be no reader or writer */
+ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
+
+ to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
- ci->i_truncate_pending, to);
+ doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
+ ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, to);
+ ceph_fscache_resize(inode, to);
+ truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
- if (to == ci->i_truncate_size) {
+ if (to == ci->i_truncate_pagecache_size) {
ci->i_truncate_pending = 0;
finish = 1;
}
@@ -1538,107 +2308,381 @@ retry:
if (!finish)
goto retry;
+ mutex_unlock(&ci->i_truncate_mutex);
+
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, 0);
wake_up_all(&ci->i_cap_wq);
}
+static void ceph_inode_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_work);
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
-/*
- * symlinks
- */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+ if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
+ doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
+ filemap_fdatawrite(&inode->i_data);
+ }
+ if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
+ ceph_do_invalidate_pages(inode);
+
+ if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
+ __ceph_do_pending_vmtruncate(inode);
+
+ if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ ceph_check_caps(ci, 0);
+
+ if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ ceph_flush_snaps(ci, NULL);
+
+ iput(inode);
+}
+
+static const char *ceph_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
- nd_set_link(nd, ci->i_symlink);
- return NULL;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
+ done);
}
+static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int ret;
+
+ ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
+ if (ret)
+ return ret;
+ return fscrypt_symlink_getattr(path, stat);
+}
+
+/*
+ * symlinks
+ */
static const struct inode_operations ceph_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = ceph_sym_follow_link,
+ .get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
- .setxattr = ceph_setxattr,
- .getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
- .removexattr = ceph_removexattr,
+};
+
+static const struct inode_operations ceph_encrypted_symlink_iops = {
+ .get_link = ceph_encrypted_get_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_encrypted_symlink_getattr,
+ .listxattr = ceph_listxattr,
};
/*
- * setattr
+ * Transfer the encrypted last block to the MDS and the MDS
+ * will help update it when truncating a smaller size.
+ *
+ * We don't support a PAGE_SIZE that is smaller than the
+ * CEPH_FSCRYPT_BLOCK_SIZE.
*/
-int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+static int fill_fscrypt_truncate(struct inode *inode,
+ struct ceph_mds_request *req,
+ struct iattr *attr)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t pos, orig_pos = round_down(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ struct ceph_pagelist *pagelist = NULL;
+ struct kvec iov = {0};
+ struct iov_iter iter;
+ struct page *page = NULL;
+ struct ceph_fscrypt_truncate_size_header header;
+ int retry_op = 0;
+ int len = CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t i_size = i_size_read(inode);
+ int got, ret, issued;
+ u64 objver;
+
+ ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+ if (ret < 0)
+ return ret;
+
+ issued = __ceph_caps_issued(ci, NULL);
+
+ doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
+
+ /* Try to writeback the dirty pagecaches */
+ if (issued & (CEPH_CAP_FILE_BUFFER)) {
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ orig_pos, lend);
+ if (ret < 0)
+ goto out;
+ }
+
+ page = __page_cache_alloc(GFP_KERNEL);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ iov.iov_base = kmap_local_page(page);
+ iov.iov_len = len;
+ iov_iter_kvec(&iter, READ, &iov, 1, len);
+
+ pos = orig_pos;
+ ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+ if (ret < 0)
+ goto out;
+
+ /* Insert the header first */
+ header.ver = 1;
+ header.compat = 1;
+ header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+
+ /*
+ * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+ * because in MDS it may need this to do the truncate.
+ */
+ header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /*
+ * If we hit a hole here, we should just skip filling
+ * the fscrypt for the request, because once the fscrypt
+ * is enabled, the file will be split into many blocks
+ * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
+ * has a hole, the hole size should be multiple of block
+ * size.
+ *
+ * If the Rados object doesn't exist, it will be set to 0.
+ */
+ if (!objver) {
+ doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
+
+ header.data_len = cpu_to_le32(8 + 8 + 4);
+ header.file_offset = 0;
+ ret = 0;
+ } else {
+ header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+ header.file_offset = cpu_to_le64(orig_pos);
+
+ doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /* truncate and zero out the extra contents for the last block */
+ memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+
+ /* encrypt the last block */
+ ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ 0, block);
+ if (ret)
+ goto out;
+ }
+
+ /* Insert the header */
+ ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+ if (ret)
+ goto out;
+
+ if (header.block_size) {
+ /* Append the last block contents to pagelist */
+ ret = ceph_pagelist_append(pagelist, iov.iov_base,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ if (ret)
+ goto out;
+ }
+ req->r_pagelist = pagelist;
+out:
+ doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+ if (iov.iov_base)
+ kunmap_local(iov.iov_base);
+ if (page)
+ __free_pages(page, 0);
+ if (ret && pagelist)
+ ceph_pagelist_release(pagelist);
+ return ret;
+}
+
+int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia)
{
- struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
- const unsigned int ia_valid = attr->ia_valid;
+ unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_cap_flush *prealloc_cf;
+ loff_t isize = i_size_read(inode);
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
+ bool lock_snap_rwsem = false;
+ bool fill_fscrypt;
+ int truncate_retry = 20; /* The RMW will take around 50ms */
+ struct dentry *dentry;
+ char *path;
+ bool do_sync = false;
+
+ dentry = d_find_alias(inode);
+ if (!dentry) {
+ do_sync = true;
+ } else {
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
+ if (IS_ERR(path)) {
+ do_sync = true;
+ err = 0;
+ } else {
+ err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
+ }
+ ceph_mdsc_free_path_info(&path_info);
+ dput(dentry);
+
+ /* For none EACCES cases will let the MDS do the mds auth check */
+ if (err == -EACCES) {
+ return err;
+ } else if (err < 0) {
+ do_sync = true;
+ err = 0;
+ }
+ }
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
- __ceph_do_pending_vmtruncate(inode);
-
- err = inode_change_ok(inode, attr);
- if (err != 0)
- return err;
+retry:
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS);
- if (IS_ERR(req))
+ if (IS_ERR(req)) {
+ ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req);
+ }
+ fill_fscrypt = false;
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
- dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (!ci->i_head_snapc &&
+ (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ }
+ }
+
+ doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (cia && cia->fscrypt_auth) {
+ u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+ if (len > sizeof(*cia->fscrypt_auth)) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+
+ doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+ /* It should never be re-set once set */
+ WARN_ON_ONCE(ci->fscrypt_auth);
+
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+ ci->fscrypt_auth_len = len;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ ci->fscrypt_auth_len != len ||
+ memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+ req->r_fscrypt_auth = cia->fscrypt_auth;
+ mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ cia->fscrypt_auth = NULL;
+ }
+#else
+ if (cia && cia->fscrypt_auth) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
- dout("setattr %p uid %d -> %d\n", inode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kuid(&init_user_ns, attr->ia_uid));
- if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_uid = attr->ia_uid;
+ kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
+
+ doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+ inode->i_uid = fsuid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !uid_eq(attr->ia_uid, inode->i_uid)) {
+ !uid_eq(fsuid, inode->i_uid)) {
req->r_args.setattr.uid = cpu_to_le32(
- from_kuid(&init_user_ns, attr->ia_uid));
+ from_kuid(&init_user_ns, fsuid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
- dout("setattr %p gid %d -> %d\n", inode,
- from_kgid(&init_user_ns, inode->i_gid),
- from_kgid(&init_user_ns, attr->ia_gid));
- if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_gid = attr->ia_gid;
+ kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
+
+ doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+ inode->i_gid = fsgid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !gid_eq(attr->ia_gid, inode->i_gid)) {
+ !gid_eq(fsgid, inode->i_gid)) {
req->r_args.setattr.gid = cpu_to_le32(
- from_kgid(&init_user_ns, attr->ia_gid));
+ from_kgid(&init_user_ns, fsgid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_MODE) {
- dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
- attr->ia_mode);
- if (issued & CEPH_CAP_AUTH_EXCL) {
+ doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode, attr->ia_mode);
+ if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
inode->i_mode = attr->ia_mode;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
attr->ia_mode != inode->i_mode) {
+ inode->i_mode = attr->ia_mode;
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
mask |= CEPH_SETATTR_MODE;
release |= CEPH_CAP_AUTH_SHARED;
@@ -1646,84 +2690,111 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
}
if (ia_valid & ATTR_ATIME) {
- dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
- inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
+ struct timespec64 atime = inode_get_atime(inode);
+
+ doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &atime, &attr->ia_atime);
+ if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec_compare(&inode->i_atime,
- &attr->ia_atime) < 0) {
- inode->i_atime = attr->ia_atime;
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
+ timespec64_compare(&atime,
+ &attr->ia_atime) < 0) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
- ceph_encode_timespec(&req->r_args.setattr.atime,
- &attr->ia_atime);
+ !timespec64_equal(&atime, &attr->ia_atime)) {
+ ceph_encode_timespec64(&req->r_args.setattr.atime,
+ &attr->ia_atime);
mask |= CEPH_SETATTR_ATIME;
- release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
+ release |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_SIZE) {
+ doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
+ ceph_vinop(inode), isize, attr->ia_size);
+ /*
+ * Only when the new size is smaller and not aligned to
+ * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
+ */
+ if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+ (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ fill_fscrypt = true;
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ if (attr->ia_size > isize) {
+ i_size_write(inode, attr->ia_size);
+ inode->i_blocks = calc_inode_blocks(attr->ia_size);
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ ia_valid |= ATTR_MTIME;
+ }
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ attr->ia_size != isize) {
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ if (IS_ENCRYPTED(inode) && attr->ia_size) {
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ } else {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
+ req->r_fscrypt_file = 0;
+ }
}
}
if (ia_valid & ATTR_MTIME) {
- dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
- if (issued & CEPH_CAP_FILE_EXCL) {
+ struct timespec64 mtime = inode_get_mtime(inode);
+
+ doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
+ inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
+ if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
ci->i_time_warp_seq++;
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec_compare(&inode->i_mtime,
- &attr->ia_mtime) < 0) {
- inode->i_mtime = attr->ia_mtime;
+ } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
+ timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
- ceph_encode_timespec(&req->r_args.setattr.mtime,
- &attr->ia_mtime);
+ !timespec64_equal(&mtime, &attr->ia_mtime)) {
+ ceph_encode_timespec64(&req->r_args.setattr.mtime,
+ &attr->ia_mtime);
mask |= CEPH_SETATTR_MTIME;
- release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
- }
- }
- if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode,
- inode->i_size, attr->ia_size);
- if (attr->ia_size > inode->i_sb->s_maxbytes) {
- err = -EINVAL;
- goto out;
- }
- if ((issued & CEPH_CAP_FILE_EXCL) &&
- attr->ia_size > inode->i_size) {
- inode->i_size = attr->ia_size;
- inode->i_blocks =
- (attr->ia_size + (1 << 9) - 1) >> 9;
- inode->i_ctime = attr->ia_ctime;
- ci->i_reported_size = attr->ia_size;
- dirtied |= CEPH_CAP_FILE_EXCL;
- } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- attr->ia_size != inode->i_size) {
- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size =
- cpu_to_le64(inode->i_size);
- mask |= CEPH_SETATTR_SIZE;
- release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR;
+ release |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
}
}
/* these do nothing */
if (ia_valid & ATTR_CTIME) {
+ struct timespec64 ictime = inode_get_ctime(inode);
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
- only ? "ctime only" : "ignored");
- inode->i_ctime = attr->ia_ctime;
+ doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
+ inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
+ only ? "ctime only" : "ignored");
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -1741,15 +2812,22 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
}
}
if (ia_valid & ATTR_FILE)
- dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+ doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
+ ceph_vinop(inode));
if (dirtied) {
- inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
- inode->i_ctime = CURRENT_TIME;
+ inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+ &prealloc_cf);
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
+ inode_inc_iversion_raw(inode);
}
release &= issued;
spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem) {
+ up_read(&mdsc->snap_rwsem);
+ lock_snap_rwsem = false;
+ }
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -1760,52 +2838,215 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ req->r_stamp = attr->ia_ctime;
+ if (fill_fscrypt) {
+ err = fill_fscrypt_truncate(inode, req, attr);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * The truncate request will return -EAGAIN when the
+ * last block has been updated just before the MDS
+ * successfully gets the xlock for the FILE lock. To
+ * avoid corrupting the file contents we need to retry
+ * it.
+ */
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == -EAGAIN && truncate_retry--) {
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
+ inode, ceph_vinop(inode), err,
+ ceph_cap_string(dirtied), mask);
+ ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
+ goto retry;
+ }
}
- dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
- ceph_cap_string(dirtied), mask);
+out:
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
+ ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- __ceph_do_pending_vmtruncate(inode);
+ ceph_free_cap_flush(prealloc_cf);
+
+ if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+ __ceph_do_pending_vmtruncate(inode);
+
return err;
-out:
- spin_unlock(&ci->i_ceph_lock);
- ceph_mdsc_put_request(req);
+}
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
+ err = setattr_prepare(idmap, dentry, attr);
+ if (err != 0)
+ return err;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
+ return -EFBIG;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
+ return -EDQUOT;
+
+ err = __ceph_setattr(idmap, inode, attr, NULL);
+
+ if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+ err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
+
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps is issued we can just choose the auth MDS
+ * instead of the random replica MDSes. Because only when the
+ * Locker is in LOCK_EXEC state will the loner client could
+ * get the 'x' caps. And if we send the getattr requests to
+ * any replica MDS it must auth pin and tries to rdlock from
+ * the auth MDS, and then the auth MDS need to do the Locker
+ * state transition to LOCK_SYNC. And after that the lock state
+ * will change back.
+ *
+ * This cost much when doing the Locker state transition and
+ * usually will need to revoke caps from clients.
+ *
+ * And for the 'Xs' caps for getxattr we will also choose the
+ * auth MDS, because the MDS side code is buggy due to setxattr
+ * won't notify the replica MDSes when the values changed and
+ * the replica MDS will return the old values. Though we will
+ * fix it in MDS code, but this still makes sense for old ceph.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
*/
-int ceph_do_getattr(struct inode *inode, int mask)
+int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ int mode;
int err;
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("do_getattr inode %p SNAPDIR\n", inode);
+ doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
+ ceph_vinop(inode));
return 0;
}
- dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
- if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
- return 0;
+ doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
+ if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
+ return 0;
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
req->r_args.getattr.mask = cpu_to_le32(mask);
+ req->r_locked_page = locked_page;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (locked_page && err == 0) {
+ u64 inline_version = req->r_reply_info.targeti.inline_version;
+ if (inline_version == 0) {
+ /* the reply is supposed to contain inline data */
+ err = -EINVAL;
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
+ err = -ENODATA;
+ } else {
+ err = req->r_reply_info.targeti.inline_len;
+ }
+ }
ceph_mdsc_put_request(req);
- dout("do_getattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
+ return err;
+}
+
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ doutc(cl, "result=%d\n", err);
return err;
}
@@ -1814,48 +3055,162 @@ int ceph_do_getattr(struct inode *inode, int mask)
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
+ int mask)
{
int err;
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+ err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(inode, mask);
+ err = generic_permission(idmap, inode, mask);
return err;
}
+/* Craft a mask of needed caps given a set of requested statx attrs. */
+static int statx_to_caps(u32 want, umode_t mode)
+{
+ int mask = 0;
+
+ if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
+ mask |= CEPH_CAP_AUTH_SHARED;
+
+ if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
+ /*
+ * The link count for directories depends on inode->i_subdirs,
+ * and that is only updated when Fs caps are held.
+ */
+ if (S_ISDIR(mode))
+ mask |= CEPH_CAP_FILE_SHARED;
+ else
+ mask |= CEPH_CAP_LINK_SHARED;
+ }
+
+ if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
+ mask |= CEPH_CAP_FILE_SHARED;
+
+ if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
+ mask |= CEPH_CAP_XATTR_SHARED;
+
+ return mask;
+}
+
/*
- * Get all attributes. Hopefully somedata we'll have a statlite()
- * and can limit the fields we require to be accurate.
+ * Get all the attributes. If we have sufficient caps for the requested attrs,
+ * then we can avoid talking to the MDS at all.
*/
-int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
+int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(path->dentry);
+ struct super_block *sb = inode->i_sb;
struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
+ u32 valid_mask = STATX_BASIC_STATS;
+ int err = 0;
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
- if (!err) {
- generic_fillattr(inode, stat);
- stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
- if (ceph_snap(inode) != CEPH_NOSNAP)
- stat->dev = ceph_snap(inode);
- else
- stat->dev = 0;
- if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
- RBYTES))
- stat->size = ci->i_rbytes;
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ /* Skip the getattr altogether if we're asked not to sync */
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
+ err = ceph_do_getattr(inode,
+ statx_to_caps(request_mask, inode->i_mode),
+ flags & AT_STATX_FORCE_SYNC);
+ if (err)
+ return err;
+ }
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ stat->ino = ceph_present_inode(inode);
+
+ /*
+ * btime on newly-allocated inodes is 0, so if this is still set to
+ * that, then assume that it's not valid.
+ */
+ if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
+ stat->btime = ci->i_btime;
+ valid_mask |= STATX_BTIME;
+ }
+
+ if (request_mask & STATX_CHANGE_COOKIE) {
+ stat->change_cookie = inode_peek_iversion_raw(inode);
+ valid_mask |= STATX_CHANGE_COOKIE;
+ }
+
+ if (ceph_snap(inode) == CEPH_NOSNAP)
+ stat->dev = sb->s_dev;
+ else
+ stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
+ if (S_ISDIR(inode->i_mode)) {
+ if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
+ stat->size = ci->i_rbytes;
+ } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ struct ceph_inode_info *pci;
+ struct ceph_snap_realm *realm;
+ struct inode *parent;
+
+ parent = ceph_lookup_inode(sb, ceph_ino(inode));
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+
+ pci = ceph_inode(parent);
+ spin_lock(&pci->i_ceph_lock);
+ realm = pci->i_snap_realm;
+ if (realm)
+ stat->size = realm->num_snaps;
else
- stat->size = ci->i_files + ci->i_subdirs;
- stat->blocks = 0;
- stat->blksize = 65536;
+ stat->size = 0;
+ spin_unlock(&pci->i_ceph_lock);
+ iput(parent);
+ } else {
+ stat->size = ci->i_files + ci->i_subdirs;
}
+ stat->blocks = 0;
+ stat->blksize = 65536;
+ /*
+ * Some applications rely on the number of st_nlink
+ * value on directories to be either 0 (if unlinked)
+ * or 2 + number of subdirectories.
+ */
+ if (stat->nlink == 1)
+ /* '.' + '..' + subdirs */
+ stat->nlink = 1 + 1 + ci->i_subdirs;
}
+
+ stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+ if (IS_ENCRYPTED(inode))
+ stat->attributes |= STATX_ATTR_ENCRYPTED;
+ stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
+ STATX_ATTR_ENCRYPTED);
+
+ stat->result_mask = request_mask & valid_mask;
return err;
}
+
+void ceph_inode_shutdown(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+ int iputs = 0;
+ bool invalidate = false;
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+
+ p = rb_next(p);
+ iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ while (iputs--)
+ iput(inode);
+}
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 000000000000..2d10f49c93a9
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ * Copyright (c) 2019 Jeff Layton
+ *
+ * I/O and data path helper functionality.
+ *
+ * Heavily borrowed from equivalent code in fs/nfs/io.c
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+
+#include "super.h"
+#include "io.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
+{
+ bool is_odirect;
+
+ lockdep_assert_held_write(&inode->i_rwsem);
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ if (is_odirect) {
+ clear_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags);
+ /* ensure modified bit is visible */
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (is_odirect)
+ inode_dio_wait(inode);
+}
+
+/**
+ * ceph_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int ceph_start_io_read(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_odirect;
+ int err;
+
+ /* Be an optimist! */
+ err = down_read_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ spin_unlock(&ci->i_ceph_lock);
+ if (!is_odirect)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ err = down_write_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ ceph_block_o_direct(ci, inode);
+ downgrade_write(&inode->i_rwsem);
+
+ return 0;
+}
+
+/**
+ * ceph_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * ceph_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int ceph_start_io_write(struct inode *inode)
+{
+ int err = down_write_killable(&inode->i_rwsem);
+ if (!err)
+ ceph_block_o_direct(ceph_inode(inode), inode);
+ return err;
+}
+
+/**
+ * ceph_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
+{
+ bool is_odirect;
+
+ lockdep_assert_held_write(&inode->i_rwsem);
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ if (!is_odirect) {
+ set_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags);
+ /* ensure modified bit is visible */
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (!is_odirect) {
+ /* FIXME: unmap_mapping_range? */
+ filemap_write_and_wait(inode->i_mapping);
+ }
+}
+
+/**
+ * ceph_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int ceph_start_io_direct(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool is_odirect;
+ int err;
+
+ /* Be an optimist! */
+ err = down_read_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ spin_lock(&ci->i_ceph_lock);
+ /* ensure that bit state is consistent */
+ smp_mb__before_atomic();
+ is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
+ spin_unlock(&ci->i_ceph_lock);
+ if (is_odirect)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ err = down_write_killable(&inode->i_rwsem);
+ if (err)
+ return err;
+
+ ceph_block_buffered(ci, inode);
+ downgrade_write(&inode->i_rwsem);
+
+ return 0;
+}
+
+/**
+ * ceph_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 000000000000..79029825e8b8
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_IO_H
+#define _FS_CEPH_IO_H
+
+#include <linux/compiler_attributes.h>
+
+int __must_check ceph_start_io_read(struct inode *inode);
+void ceph_end_io_read(struct inode *inode);
+int __must_check ceph_start_io_write(struct inode *inode);
+void ceph_end_io_write(struct inode *inode);
+int __must_check ceph_start_io_direct(struct inode *inode);
+void ceph_end_io_direct(struct inode *inode);
+
+#endif /* FS_CEPH_IO_H */
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e0b4ef31d3c8..15cde055f3da 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,11 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
#include "ioctl.h"
-
+#include <linux/ceph/striper.h>
+#include <linux/fscrypt.h>
/*
* ioctls
@@ -20,13 +21,13 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
struct ceph_ioctl_layout l;
int err;
- err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
- l.preferred_osd = (s32)-1;
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
+ l.preferred_osd = -1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
}
@@ -42,7 +43,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
/* validate striping parameters */
if ((l->object_size & ~PAGE_MASK) ||
(l->stripe_unit & ~PAGE_MASK) ||
- (l->stripe_unit != 0 &&
+ ((unsigned)l->stripe_unit != 0 &&
((unsigned)l->object_size % (unsigned)l->stripe_unit)))
return -EINVAL;
@@ -64,8 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct inode *parent_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
@@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return -EFAULT;
/* validate changed params against current layout */
- err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
+ err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (err)
return err;
@@ -84,22 +84,22 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.stripe_count = ci->i_layout.stripe_count;
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_unit = ci->i_layout.stripe_unit;
if (l.object_size)
nl.object_size = l.object_size;
else
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.object_size = ci->i_layout.object_size;
if (l.data_pool)
nl.data_pool = l.data_pool;
else
- nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+ nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
- nl.preferred_osd = le64_to_cpu(-1);
+ nl.preferred_osd = -1;
err = __validate_layout(mdsc, &nl);
if (err)
@@ -111,6 +111,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
+
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
@@ -121,9 +123,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
@@ -140,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
@@ -157,6 +157,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
@@ -182,8 +183,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
- u64 len = 1, olen;
+ &ceph_sb_to_fs_client(inode->i_sb)->client->osdc;
+ struct ceph_object_locator oloc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ u32 xlen;
u64 tmp;
struct ceph_pg pgid;
int r;
@@ -192,15 +195,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
if (copy_from_user(&dl, arg, sizeof(dl)))
return -EFAULT;
- down_read(&osdc->map_sem);
- r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
- &dl.object_no, &dl.object_offset,
- &olen);
- if (r < 0)
- return -EIO;
+ down_read(&osdc->lock);
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1,
+ &dl.object_no, &dl.object_offset, &xlen);
dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
@@ -209,10 +209,19 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
- ceph_file_layout_pg_pool(ci->i_layout));
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ceph_oid_printf(&oid, "%s", dl.object_name);
+
+ r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
- dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ ceph_oloc_destroy(&oloc);
+ if (r < 0) {
+ up_read(&osdc->lock);
+ return r;
+ }
+
+ dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
if (dl.osd >= 0) {
struct ceph_entity_addr *a =
ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -221,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
} else {
memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
}
- up_read(&osdc->map_sem);
+ up_read(&osdc->lock);
/* send result back to user */
if (copy_to_user(arg, &dl, sizeof(dl)))
@@ -235,19 +244,30 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ bool is_file_already_lazy = false;
+ spin_lock(&ci->i_ceph_lock);
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
- spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
- spin_unlock(&ci->i_ceph_lock);
- dout("ioctl_layzio: file %p marked lazy\n", file);
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+ __ceph_touch_fmode(ci, mdsc, fi->fmode);
+ } else {
+ is_file_already_lazy = true;
+ }
+ spin_unlock(&ci->i_ceph_lock);
- ceph_check_caps(ci, 0, NULL);
+ if (is_file_already_lazy) {
+ doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
+ ceph_vinop(inode));
} else {
- dout("ioctl_layzio: file %p already lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode,
+ ceph_vinop(inode));
+
+ ceph_check_caps(ci, 0);
}
+
return 0;
}
@@ -259,9 +279,98 @@ static long ceph_ioctl_syncio(struct file *file)
return 0;
}
+static int vet_mds_for_fscrypt(struct file *file)
+{
+ int i, ret = -EOPNOTSUPP;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = mdsc->sessions[i];
+
+ if (!s)
+ continue;
+ if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ return ret;
+}
+
+static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
+{
+ int ret, got = 0;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ /* encrypted directories can't have striped layout */
+ if (ci->i_layout.stripe_count > 1)
+ return -EINVAL;
+
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+
+ /*
+ * Ensure we hold these caps so that we _know_ that the rstats check
+ * in the empty_dir check is reliable.
+ */
+ ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
+ if (got)
+ ceph_put_cap_refs(ci, got);
+
+ return ret;
+}
+
+static const char *ceph_ioctl_cmd_name(const unsigned int cmd)
+{
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return "get_layout";
+ case CEPH_IOC_SET_LAYOUT:
+ return "set_layout";
+ case CEPH_IOC_SET_LAYOUT_POLICY:
+ return "set_layout_policy";
+ case CEPH_IOC_GET_DATALOC:
+ return "get_dataloc";
+ case CEPH_IOC_LAZYIO:
+ return "lazyio";
+ case CEPH_IOC_SYNCIO:
+ return "syncio";
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ return "set_encryption_policy";
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ return "get_encryption_policy";
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ return "get_encryption_policy_ex";
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ return "add_encryption_key";
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return "remove_encryption_key";
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return "remove_encryption_key_all_users";
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return "get_encryption_key_status";
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return "get_encryption_nonce";
+ default:
+ return "unknown";
+ }
+}
+
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ int ret;
+
+ doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file,
+ inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
@@ -280,6 +389,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case CEPH_IOC_SYNCIO:
return ceph_ioctl_syncio(file);
+
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ return ceph_set_encryption_policy(file, arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_add_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return fscrypt_ioctl_remove_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return fscrypt_ioctl_remove_key_all_users(file,
+ (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
}
return -ENOTTY;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index c77028afb1e1..51f7f1d39a94 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef FS_CEPH_IOCTL_H
#define FS_CEPH_IOCTL_H
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0f..dd764f9c64b9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,30 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/file.h>
#include <linux/namei.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
+#include <linux/filelock.h>
#include <linux/ceph/pagelist.h>
-/**
+static u64 lock_secret;
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
+static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
+{
+ struct inode *inode = file_inode(dst->c.flc_file);
+ atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+ dst->fl_u.ceph.inode = igrab(inode);
+}
+
+/*
+ * Do not use the 'fl->fl_file' in release function, which
+ * is possibly already released by another thread.
+ */
+static void ceph_fl_release_lock(struct file_lock *fl)
+{
+ struct inode *inode = fl->fl_u.ceph.inode;
+ struct ceph_inode_info *ci;
+
+ /*
+ * If inode is NULL it should be a request file_lock,
+ * nothing we can do.
+ */
+ if (!inode)
+ return;
+
+ ci = ceph_inode(inode);
+ if (atomic_dec_and_test(&ci->i_filelock_ref)) {
+ /* clear error when all locks are released */
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ fl->fl_u.ceph.inode = NULL;
+ iput(inode);
+}
+
+static const struct file_lock_operations ceph_fl_lock_ops = {
+ .fl_copy_lock = ceph_fl_copy_lock,
+ .fl_release_private = ceph_fl_release_lock,
+};
+
+/*
* Implement fcntl and flock locking functions.
*/
-static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
+static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
- struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
u64 length = 0;
+ u64 owner;
+
+ if (operation == CEPH_MDS_OP_SETFILELOCK) {
+ /*
+ * increasing i_filelock_ref closes race window between
+ * handling request reply and adding file_lock struct to
+ * inode. Otherwise, auth caps may get trimmed in the
+ * window. Caller function will decrease the counter.
+ */
+ fl->fl_ops = &ceph_fl_lock_ops;
+ fl->fl_ops->fl_copy_lock(fl, NULL);
+ }
+
+ if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
+ wait = 0;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
@@ -32,32 +110,34 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
else
length = fl->fl_end - fl->fl_start + 1;
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
+ owner = secure_addr(fl->c.flc_owner);
+
+ doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d\n",
+ (int)lock_type, (int)operation, owner,
+ (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
- req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- err = ceph_mdsc_do_request(mdsc, inode, req);
-
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
- fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
+ if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
+ fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_RDLCK;
+ fl->c.flc_type = F_RDLCK;
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
- fl->fl_type = F_WRLCK;
+ fl->c.flc_type = F_WRLCK;
else
- fl->fl_type = F_UNLCK;
+ fl->c.flc_type = F_UNLCK;
fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
@@ -69,128 +149,292 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
}
ceph_mdsc_put_request(req);
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type, err);
+ doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
+ "length: %llu, wait: %d, type: %d, err code %d\n",
+ (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
+ fl->fl_start, length, wait, fl->c.flc_type, err);
return err;
}
-/**
+static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_request *intr_req;
+ struct inode *inode = req->r_inode;
+ int err, lock_type;
+
+ BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
+ if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
+ lock_type = CEPH_LOCK_FCNTL_INTR;
+ else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
+ lock_type = CEPH_LOCK_FLOCK_INTR;
+ else
+ BUG_ON(1);
+ BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
+
+ err = wait_for_completion_interruptible(&req->r_completion);
+ if (!err)
+ return 0;
+
+ doutc(cl, "request %llu was interrupted\n", req->r_tid);
+
+ mutex_lock(&mdsc->mutex);
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ err = 0;
+ } else {
+ /*
+ * ensure we aren't running concurrently with
+ * ceph_fill_trace or ceph_readdir_prepopulate, which
+ * rely on locks (dir mutex) held by our caller.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = err;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+
+ if (!req->r_session) {
+ // haven't sent the request
+ err = 0;
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (!err)
+ return 0;
+
+ intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
+ USE_AUTH_MDS);
+ if (IS_ERR(intr_req))
+ return PTR_ERR(intr_req);
+
+ intr_req->r_inode = inode;
+ ihold(inode);
+ intr_req->r_num_caps = 1;
+
+ intr_req->r_args.filelock_change = req->r_args.filelock_change;
+ intr_req->r_args.filelock_change.rule = lock_type;
+ intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+ err = ceph_mdsc_do_request(mdsc, inode, intr_req);
+ ceph_mdsc_put_request(intr_req);
+
+ if (err && err != -ERESTARTSYS)
+ return err;
+
+ err = wait_for_completion_killable(&req->r_safe_completion);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+ int err;
+ unsigned int orig_flags = fl->c.flc_flags;
+ fl->c.flc_flags |= FL_EXISTS;
+ err = locks_lock_file_wait(file, fl);
+ fl->c.flc_flags = orig_flags;
+ if (err == -ENOENT) {
+ if (!(orig_flags & FL_EXISTS))
+ err = 0;
+ return err;
+ }
+ return 1;
+}
+
+/*
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
*/
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
- u8 lock_cmd;
- int err;
- u8 wait = 0;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int err = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
+ u8 wait = 0;
+ u8 lock_cmd;
+
+ if (!(fl->c.flc_flags & FL_POSIX))
+ return -ENOLCK;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
+ if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
- if (F_RDLCK == fl->fl_type)
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
+ posix_lock_file(file, fl, NULL);
+ return err;
+ }
+
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (F_WRLCK == fl->fl_type)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
+ if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
+ err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
- dout("mds locked, locking locally");
+ if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
+ doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
- if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+ if (err) {
/* undo! This should only happen if
* the kernel detects local
* deadlock. */
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock",
- err);
+ doutc(cl, "got %d on posix_lock_file, undid lock\n",
+ err);
}
}
-
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int err = 0;
+ u8 wait = 0;
u8 lock_cmd;
- int err;
- u8 wait = 1;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
+ if (!(fl->c.flc_flags & FL_FLOCK))
+ return -ENOLCK;
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
+ if (ceph_inode_is_shutdown(inode))
+ return -ESTALE;
+
+ doutc(cl, "fl_file: %p\n", fl->c.flc_file);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
+ err = -EIO;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (err < 0) {
+ if (lock_is_unlock(fl))
+ locks_lock_file_wait(file, fl);
+ return err;
+ }
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (lock_is_read(fl))
lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
+ else if (lock_is_write(fl))
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
+ if (lock_is_unlock(fl)) {
+ err = try_unlock_file(file, fl);
+ if (err <= 0)
+ return err;
+ }
+
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, lock_cmd, wait, fl);
- if (!err) {
- err = flock_lock_file_wait(file, fl);
+ inode, lock_cmd, wait, fl);
+ if (!err && F_UNLCK != fl->c.flc_type) {
+ err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on flock_lock_file_wait, undid lock", err);
+ inode, CEPH_LOCK_UNLOCK, 0, fl);
+ doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
+ err);
}
- } else if (err == -ERESTARTSYS) {
- dout("undoing lock\n");
- ceph_lock_message(CEPH_LOCK_FLOCK,
- CEPH_MDS_OP_SETFILELOCK,
- file, CEPH_LOCK_UNLOCK, 0, fl);
}
return err;
}
-/**
- * Must be called with lock_flocks() already held. Fills in the passed
- * counter variables, so you can prepare pagelist metadata before calling
- * ceph_encode_locks.
+/*
+ * Fills in the passed counter variables, so you can prepare pagelist metadata
+ * before calling ceph_encode_locks.
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct file_lock *lock;
+ struct file_lock_context *ctx;
*fcntl_count = 0;
*flock_count = 0;
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX)
+ ctx = locks_inode_context(inode);
+ if (ctx) {
+ spin_lock(&ctx->flc_lock);
+ for_each_file_lock(lock, &ctx->flc_posix)
++(*fcntl_count);
- else if (lock->fl_flags & FL_FLOCK)
+ for_each_file_lock(lock, &ctx->flc_flock)
++(*flock_count);
+ spin_unlock(&ctx->flc_lock);
}
- dout("counted %d flock locks and %d fcntl locks",
- *flock_count, *fcntl_count);
+ doutc(cl, "counted %d flock locks and %d fcntl locks\n",
+ *flock_count, *fcntl_count);
}
-/**
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+static int lock_to_ceph_filelock(struct inode *inode,
+ struct file_lock *lock,
+ struct ceph_filelock *cephlock)
+{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ int err = 0;
+
+ cephlock->start = cpu_to_le64(lock->fl_start);
+ cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+ cephlock->client = cpu_to_le64(0);
+ cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
+
+ switch (lock->c.flc_type) {
+ case F_RDLCK:
+ cephlock->type = CEPH_LOCK_SHARED;
+ break;
+ case F_WRLCK:
+ cephlock->type = CEPH_LOCK_EXCL;
+ break;
+ case F_UNLCK:
+ cephlock->type = CEPH_LOCK_UNLOCK;
+ break;
+ default:
+ doutc(cl, "Have unknown lock type %d\n",
+ lock->c.flc_type);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+/*
* Encode the flock and fcntl locks for the given inode into the ceph_filelock
* array. Must be called with inode->i_lock already held.
* If we encounter more of a specific lock type than expected, return -ENOSPC.
@@ -200,45 +444,48 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
+ struct file_lock_context *ctx = locks_inode_context(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
int l = 0;
- dout("encoding %d flock and %d fcntl locks", num_flock_locks,
- num_fcntl_locks);
+ doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
+ num_fcntl_locks);
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX) {
- ++seen_fcntl;
- if (seen_fcntl > num_fcntl_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &flocks[l]);
- if (err)
- goto fail;
- ++l;
+ if (!ctx)
+ return 0;
+
+ spin_lock(&ctx->flc_lock);
+ for_each_file_lock(lock, &ctx->flc_posix) {
+ ++seen_fcntl;
+ if (seen_fcntl > num_fcntl_locks) {
+ err = -ENOSPC;
+ goto fail;
}
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
}
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_FLOCK) {
- ++seen_flock;
- if (seen_flock > num_flock_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &flocks[l]);
- if (err)
- goto fail;
- ++l;
+ for_each_file_lock(lock, &ctx->flc_flock) {
+ ++seen_flock;
+ if (seen_flock > num_flock_locks) {
+ err = -ENOSPC;
+ goto fail;
}
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
}
fail:
+ spin_unlock(&ctx->flc_lock);
return err;
}
-/**
+/*
* Copy the encoded flock and fcntl locks into the pagelist.
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
* sequential flock locks.
@@ -256,52 +503,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
if (err)
goto out_fail;
- err = ceph_pagelist_append(pagelist, flocks,
- num_fcntl_locks * sizeof(*flocks));
- if (err)
- goto out_fail;
+ if (num_fcntl_locks > 0) {
+ err = ceph_pagelist_append(pagelist, flocks,
+ num_fcntl_locks * sizeof(*flocks));
+ if (err)
+ goto out_fail;
+ }
nlocks = cpu_to_le32(num_flock_locks);
err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
if (err)
goto out_fail;
- err = ceph_pagelist_append(pagelist,
- &flocks[num_fcntl_locks],
- num_flock_locks * sizeof(*flocks));
-out_fail:
- return err;
-}
-
-/*
- * Given a pointer to a lock, convert it to a ceph filelock
- */
-int lock_to_ceph_filelock(struct file_lock *lock,
- struct ceph_filelock *cephlock)
-{
- int err = 0;
-
- cephlock->start = cpu_to_le64(lock->fl_start);
- cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
- cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
-
- switch (lock->fl_type) {
- case F_RDLCK:
- cephlock->type = CEPH_LOCK_SHARED;
- break;
- case F_WRLCK:
- cephlock->type = CEPH_LOCK_EXCL;
- break;
- case F_UNLCK:
- cephlock->type = CEPH_LOCK_UNLOCK;
- break;
- default:
- dout("Have unknown lock type %d", lock->fl_type);
- err = -EINVAL;
+ if (num_flock_locks > 0) {
+ err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
+ num_flock_locks * sizeof(*flocks));
}
-
+out_fail:
return err;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 187bf214444d..1740047aef0f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,14 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/ratelimit.h>
+#include <linux/bits.h>
+#include <linux/ktime.h>
+#include <linux/bitmap.h>
+#include <linux/mnt_idmapping.h>
#include "super.h"
#include "mds_client.h"
+#include "crypto.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/messenger.h>
@@ -17,6 +25,8 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+
/*
* A cluster of MDS (metadata server) daemons is responsible for
* managing the file system namespace (the directory hierarchy and
@@ -43,12 +53,17 @@
*/
struct ceph_reconnect_state {
+ struct ceph_mds_session *session;
+ int nr_caps, nr_realms;
struct ceph_pagelist *pagelist;
- bool flock;
+ unsigned msg_version;
+ bool allow_multi;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
+static void ceph_cap_release_work(struct work_struct *work);
+static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
@@ -57,15 +72,54 @@ static const struct ceph_connection_operations mds_con_ops;
* mds reply parsing
*/
+static int parse_reply_info_quota(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only
+ * understand encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ ceph_decode_64_safe(p, end, info->max_bytes, bad);
+ ceph_decode_64_safe(p, end, info->max_files, bad);
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse individual inode info
*/
static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
- int features)
+ u64 features)
{
- int err = -EIO;
+ int err = 0;
+ u8 struct_v = 0;
+
+ if (features == (u64)-1) {
+ u32 struct_len;
+ u8 struct_compat;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding with struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+ ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
info->in = *p;
*p += sizeof(struct ceph_mds_reply_inode) +
sizeof(*info->in->fragtree.splits) *
@@ -76,28 +130,240 @@ static int parse_reply_info_in(void **p, void *end,
info->symlink = *p;
*p += info->symlink_len;
- if (features & CEPH_FEATURE_DIRLAYOUTHASH)
- ceph_decode_copy_safe(p, end, &info->dir_layout,
- sizeof(info->dir_layout), bad);
- else
- memset(&info->dir_layout, 0, sizeof(info->dir_layout));
-
+ ceph_decode_copy_safe(p, end, &info->dir_layout,
+ sizeof(info->dir_layout), bad);
ceph_decode_32_safe(p, end, info->xattr_len, bad);
ceph_decode_need(p, end, info->xattr_len, bad);
info->xattr_data = *p;
*p += info->xattr_len;
+
+ if (features == (u64)-1) {
+ /* inline data */
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ /* quota */
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ /* pool namespace */
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+
+ /* btime */
+ ceph_decode_need(p, end, sizeof(info->btime), bad);
+ ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+
+ /* change attribute */
+ ceph_decode_64_safe(p, end, info->change_attr, bad);
+
+ /* dir pin */
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, info->dir_pin, bad);
+ } else {
+ info->dir_pin = -ENODATA;
+ }
+
+ /* snapshot birth time, remains zero for v<=2 */
+ if (struct_v >= 3) {
+ ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
+ ceph_decode_copy(p, &info->snap_btime,
+ sizeof(info->snap_btime));
+ } else {
+ memset(&info->snap_btime, 0, sizeof(info->snap_btime));
+ }
+
+ /* snapshot count, remains zero for v<=3 */
+ if (struct_v >= 4) {
+ ceph_decode_64_safe(p, end, info->rsnaps, bad);
+ } else {
+ info->rsnaps = 0;
+ }
+
+ if (struct_v >= 5) {
+ u32 alen;
+
+ ceph_decode_32_safe(p, end, alen, bad);
+
+ while (alen--) {
+ u32 len;
+
+ /* key */
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_skip_n(p, end, len, bad);
+ /* value */
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_skip_n(p, end, len, bad);
+ }
+ }
+
+ /* fscrypt flag -- ignore */
+ if (struct_v >= 6)
+ ceph_decode_skip_8(p, end, bad);
+
+ info->fscrypt_auth = NULL;
+ info->fscrypt_auth_len = 0;
+ info->fscrypt_file = NULL;
+ info->fscrypt_file_len = 0;
+ if (struct_v >= 7) {
+ ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
+ if (info->fscrypt_auth_len) {
+ info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!info->fscrypt_auth)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, info->fscrypt_auth,
+ info->fscrypt_auth_len, bad);
+ }
+ ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
+ if (info->fscrypt_file_len) {
+ info->fscrypt_file = kmalloc(info->fscrypt_file_len,
+ GFP_KERNEL);
+ if (!info->fscrypt_file)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, info->fscrypt_file,
+ info->fscrypt_file_len, bad);
+ }
+ }
+ *p = end;
+ } else {
+ /* legacy (unversioned) struct */
+ if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
+ ceph_decode_64_safe(p, end, info->inline_version, bad);
+ ceph_decode_32_safe(p, end, info->inline_len, bad);
+ ceph_decode_need(p, end, info->inline_len, bad);
+ info->inline_data = *p;
+ *p += info->inline_len;
+ } else
+ info->inline_version = CEPH_INLINE_NONE;
+
+ if (features & CEPH_FEATURE_MDS_QUOTA) {
+ err = parse_reply_info_quota(p, end, info);
+ if (err < 0)
+ goto out_bad;
+ } else {
+ info->max_bytes = 0;
+ info->max_files = 0;
+ }
+
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
+ }
+
+ if (features & CEPH_FEATURE_FS_BTIME) {
+ ceph_decode_need(p, end, sizeof(info->btime), bad);
+ ceph_decode_copy(p, &info->btime, sizeof(info->btime));
+ ceph_decode_64_safe(p, end, info->change_attr, bad);
+ }
+
+ info->dir_pin = -ENODATA;
+ /* info->snap_btime and info->rsnaps remain zero */
+ }
return 0;
bad:
+ err = -EIO;
+out_bad:
return err;
}
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_dirfrag **dirfrag,
+ u64 features)
+{
+ if (features == (u64)-1) {
+ u8 struct_v, struct_compat;
+ u32 struct_len;
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ ceph_decode_need(p, end, struct_len, bad);
+ end = *p + struct_len;
+ }
+
+ ceph_decode_need(p, end, sizeof(**dirfrag), bad);
+ *dirfrag = *p;
+ *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+ if (features == (u64)-1)
+ *p = end;
+ return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_lease(void **p, void *end,
+ struct ceph_mds_reply_lease **lease,
+ u64 features, u32 *altname_len, u8 **altname)
+{
+ u8 struct_v;
+ u32 struct_len;
+ void *lend;
+
+ if (features == (u64)-1) {
+ u8 struct_compat;
+
+ ceph_decode_8_safe(p, end, struct_v, bad);
+ ceph_decode_8_safe(p, end, struct_compat, bad);
+
+ /* struct_v is expected to be >= 1. we only understand
+ * encoding whose struct_compat == 1. */
+ if (!struct_v || struct_compat != 1)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, struct_len, bad);
+ } else {
+ struct_len = sizeof(**lease);
+ *altname_len = 0;
+ *altname = NULL;
+ }
+
+ lend = *p + struct_len;
+ ceph_decode_need(p, end, struct_len, bad);
+ *lease = *p;
+ *p += sizeof(**lease);
+
+ if (features == (u64)-1) {
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, *altname_len, bad);
+ ceph_decode_need(p, end, *altname_len, bad);
+ *altname = *p;
+ *p += *altname_len;
+ } else {
+ *altname = NULL;
+ *altname_len = 0;
+ }
+ }
+ *p = lend;
+ return 0;
+bad:
+ return -EIO;
+}
+
/*
* parse a normal reply, which may contain a (dir+)dentry and/or a
* target inode.
*/
static int parse_reply_info_trace(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
int err;
@@ -106,20 +372,19 @@ static int parse_reply_info_trace(void **p, void *end,
if (err < 0)
goto out_bad;
- if (unlikely(*p + sizeof(*info->dirfrag) > end))
- goto bad;
- info->dirfrag = *p;
- *p += sizeof(*info->dirfrag) +
- sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
- if (unlikely(*p > end))
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dirfrag, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_32_safe(p, end, info->dname_len, bad);
ceph_decode_need(p, end, info->dname_len, bad);
info->dname = *p;
*p += info->dname_len;
- info->dlease = *p;
- *p += sizeof(*info->dlease);
+
+ err = parse_reply_info_lease(p, end, &info->dlease, features,
+ &info->altname_len, &info->altname);
+ if (err < 0)
+ goto out_bad;
}
if (info->head->is_target) {
@@ -142,72 +407,141 @@ out_bad:
/*
* parse readdir results
*/
-static int parse_reply_info_dir(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
+static int parse_reply_info_readdir(void **p, void *end,
+ struct ceph_mds_request *req,
+ u64 features)
{
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
u32 num, i = 0;
int err;
- info->dir_dir = *p;
- if (*p + sizeof(*info->dir_dir) > end)
- goto bad;
- *p += sizeof(*info->dir_dir) +
- sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
- if (*p > end)
- goto bad;
+ err = parse_reply_info_dir(p, end, &info->dir_dir, features);
+ if (err < 0)
+ goto out_bad;
ceph_decode_need(p, end, sizeof(num) + 2, bad);
num = ceph_decode_32(p);
- info->dir_end = ceph_decode_8(p);
- info->dir_complete = ceph_decode_8(p);
+ {
+ u16 flags = ceph_decode_16(p);
+ info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
+ info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
+ info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
+ }
if (num == 0)
goto done;
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
+ BUG_ON(!info->dir_entries);
+ if ((unsigned long)(info->dir_entries + num) >
+ (unsigned long)info->dir_entries + info->dir_buf_size) {
+ pr_err_client(cl, "dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
}
- info->dir_dname = (void *)(info->dir_in + num);
- info->dir_dname_len = (void *)(info->dir_dname + num);
- info->dir_dlease = (void *)(info->dir_dname_len + num);
+ info->dir_nr = num;
while (num) {
+ struct inode *inode = d_inode(req->r_dentry);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+ struct fscrypt_str tname = FSTR_INIT(NULL, 0);
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname;
+ u32 altname_len, _name_len;
+ u8 *altname, *_name;
+
/* dentry */
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- info->dir_dname_len[i] = ceph_decode_32(p);
- ceph_decode_need(p, end, info->dir_dname_len[i], bad);
- info->dir_dname[i] = *p;
- *p += info->dir_dname_len[i];
- dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
- info->dir_dname[i]);
- info->dir_dlease[i] = *p;
- *p += sizeof(struct ceph_mds_reply_lease);
+ ceph_decode_32_safe(p, end, _name_len, bad);
+ ceph_decode_need(p, end, _name_len, bad);
+ _name = *p;
+ *p += _name_len;
+ doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name);
+
+ if (info->hash_order)
+ rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ _name, _name_len);
+
+ /* dentry lease */
+ err = parse_reply_info_lease(p, end, &rde->lease, features,
+ &altname_len, &altname);
+ if (err)
+ goto out_bad;
+
+ /*
+ * Try to dencrypt the dentry names and update them
+ * in the ceph_mds_reply_dir_entry struct.
+ */
+ fname.dir = inode;
+ fname.name = _name;
+ fname.name_len = _name_len;
+ fname.ctext = altname;
+ fname.ctext_len = altname_len;
+ /*
+ * The _name_len maybe larger than altname_len, such as
+ * when the human readable name length is in range of
+ * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),
+ * then the copy in ceph_fname_to_usr will corrupt the
+ * data if there has no encryption key.
+ *
+ * Just set the no_copy flag and then if there has no
+ * encryption key the oname.name will be assigned to
+ * _name always.
+ */
+ fname.no_copy = true;
+ if (altname_len == 0) {
+ /*
+ * Set tname to _name, and this will be used
+ * to do the base64_decode in-place. It's
+ * safe because the decoded string should
+ * always be shorter, which is 3/4 of origin
+ * string.
+ */
+ tname.name = _name;
+
+ /*
+ * Set oname to _name too, and this will be
+ * used to do the dencryption in-place.
+ */
+ oname.name = _name;
+ oname.len = _name_len;
+ } else {
+ /*
+ * This will do the decryption only in-place
+ * from altname cryptext directly.
+ */
+ oname.name = altname;
+ oname.len = altname_len;
+ }
+ rde->is_nokey = false;
+ err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
+ if (err) {
+ pr_err_client(cl, "unable to decode %.*s, got %d\n",
+ _name_len, _name, err);
+ goto out_bad;
+ }
+ rde->name = oname.name;
+ rde->name_len = oname.len;
/* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i], features);
+ err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
goto out_bad;
+ /* ceph_readdir_prepopulate() will update it */
+ rde->offset = 0;
i++;
num--;
}
done:
- if (*p != end)
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
bad:
err = -EIO;
out_bad:
- pr_err("problem parsing dir contents %d\n", err);
+ pr_err_client(cl, "problem parsing dir contents %d\n", err);
return err;
}
@@ -216,42 +550,171 @@ out_bad:
*/
static int parse_reply_info_filelock(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (*p + sizeof(*info->filelock_reply) > end)
goto bad;
info->filelock_reply = *p;
- *p += sizeof(*info->filelock_reply);
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
+bad:
+ return -EIO;
+}
+
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+ u32 sets;
+
+ ceph_decode_32_safe(p, end, sets, bad);
+ doutc(cl, "got %u sets of delegated inodes\n", sets);
+ while (sets--) {
+ u64 start, len;
+
+ ceph_decode_64_safe(p, end, start, bad);
+ ceph_decode_64_safe(p, end, len, bad);
+
+ /* Don't accept a delegation of system inodes */
+ if (start < CEPH_INO_SYSTEM_BASE) {
+ pr_warn_ratelimited_client(cl,
+ "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
+ continue;
+ }
+ while (len--) {
+ int err = xa_insert(&s->s_delegated_inos, start++,
+ DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+ if (!err) {
+ doutc(cl, "added delegated inode 0x%llx\n", start - 1);
+ } else if (err == -EBUSY) {
+ pr_warn_client(cl,
+ "MDS delegated inode 0x%llx more than once.\n",
+ start - 1);
+ } else {
+ return err;
+ }
+ }
+ }
+ return 0;
+bad:
+ return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ unsigned long ino;
+ void *val;
+
+ xa_for_each(&s->s_delegated_inos, ino, val) {
+ val = xa_erase(&s->s_delegated_inos, ino);
+ if (val == DELEGATED_INO_AVAILABLE)
+ return ino;
+ }
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+ GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+ struct ceph_mds_session *s)
+{
+ u32 sets;
+ ceph_decode_32_safe(p, end, sets, bad);
+ if (sets)
+ ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+ return 0;
bad:
return -EIO;
}
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+ return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+ return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features, struct ceph_mds_session *s)
{
- if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+ int ret;
+
+ if (features == (u64)-1 ||
+ (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
if (*p == end) {
+ /* Malformed reply? */
info->has_create_ino = false;
+ } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+ info->has_create_ino = true;
+ /* struct_v, struct_compat, and len */
+ ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
+ ceph_decode_64_safe(p, end, info->ino, bad);
+ ret = ceph_parse_deleg_inos(p, end, s);
+ if (ret)
+ return ret;
} else {
+ /* legacy */
+ ceph_decode_64_safe(p, end, info->ino, bad);
info->has_create_ino = true;
- info->ino = ceph_decode_64(p);
}
+ } else {
+ if (*p != end)
+ goto bad;
}
- if (unlikely(*p != end))
- goto bad;
+ /* Skip over any unrecognized fields */
+ *p = end;
return 0;
+bad:
+ return -EIO;
+}
+
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
bad:
return -EIO;
}
@@ -260,16 +723,20 @@ bad:
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- int features)
+ struct ceph_mds_request *req,
+ u64 features, struct ceph_mds_session *s)
{
- if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ u32 op = le32_to_cpu(info->head->op);
+
+ if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
- else if (info->head->op == CEPH_MDS_OP_READDIR ||
- info->head->op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_dir(p, end, info, features);
- else if (info->head->op == CEPH_MDS_OP_CREATE)
- return parse_reply_info_create(p, end, info, features);
+ else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
+ return parse_reply_info_readdir(p, end, req, features);
+ else if (op == CEPH_MDS_OP_CREATE)
+ return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -277,10 +744,11 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
-static int parse_reply_info(struct ceph_msg *msg,
- struct ceph_mds_reply_info_parsed *info,
- int features)
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
+ struct ceph_mds_request *req, u64 features)
{
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
void *p, *end;
u32 len;
int err;
@@ -302,7 +770,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features);
+ err = parse_reply_info_extra(&p, p+len, req, features, s);
if (err < 0)
goto out_bad;
}
@@ -320,20 +788,110 @@ static int parse_reply_info(struct ceph_msg *msg,
bad:
err = -EIO;
out_bad:
- pr_err("mds parse_reply err %d\n", err);
+ pr_err_client(cl, "mds parse_reply err %d\n", err);
+ ceph_msg_dump(msg);
return err;
}
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- kfree(info->dir_in);
+ int i;
+
+ kfree(info->diri.fscrypt_auth);
+ kfree(info->diri.fscrypt_file);
+ kfree(info->targeti.fscrypt_auth);
+ kfree(info->targeti.fscrypt_file);
+ if (!info->dir_entries)
+ return;
+
+ for (i = 0; i < info->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+
+ kfree(rde->inode.fscrypt_auth);
+ kfree(rde->inode.fscrypt_file);
+ }
+ free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
+}
+
+/*
+ * In async unlink case the kclient won't wait for the first reply
+ * from MDS and just drop all the links and unhash the dentry and then
+ * succeeds immediately.
+ *
+ * For any new create/link/rename,etc requests followed by using the
+ * same file names we must wait for the first reply of the inflight
+ * unlink request, or the MDS possibly will fail these following
+ * requests with -EEXIST if the inflight async unlink request was
+ * delayed for some reasons.
+ *
+ * And the worst case is that for the none async openc request it will
+ * successfully open the file if the CDentry hasn't been unlinked yet,
+ * but later the previous delayed async unlink request will remove the
+ * CDentry. That means the just created file is possibly deleted later
+ * by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * when creating new files/directories by using the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
+ struct dentry *pdentry = dentry->d_parent;
+ struct dentry *udentry, *found = NULL;
+ struct ceph_dentry_info *di;
+ struct qstr dname;
+ u32 hash = dentry->d_name.hash;
+ int err;
+
+ dname.name = dentry->d_name.name;
+ dname.len = dentry->d_name.len;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+ hnode, hash) {
+ udentry = di->dentry;
+
+ spin_lock(&udentry->d_lock);
+ if (udentry->d_name.hash != hash)
+ goto next;
+ if (unlikely(udentry->d_parent != pdentry))
+ goto next;
+ if (!hash_hashed(&di->hnode))
+ goto next;
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
+
+ if (!d_same_name(udentry, pdentry, &dname))
+ goto next;
+
+ found = dget_dlock(udentry);
+ spin_unlock(&udentry->d_lock);
+ break;
+next:
+ spin_unlock(&udentry->d_lock);
+ }
+ rcu_read_unlock();
+
+ if (likely(!found))
+ return 0;
+
+ doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry,
+ found, found);
+
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+ TASK_KILLABLE);
+ dput(found);
+ return err;
}
/*
* sessions
*/
-static const char *session_state_name(int s)
+const char *ceph_session_state_name(int s)
{
switch (s) {
case CEPH_MDS_SESSION_NEW: return "new";
@@ -341,33 +899,31 @@ static const char *session_state_name(int s)
case CEPH_MDS_SESSION_OPEN: return "open";
case CEPH_MDS_SESSION_HUNG: return "hung";
case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_CLOSED: return "closed";
case CEPH_MDS_SESSION_RESTARTING: return "restarting";
case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ case CEPH_MDS_SESSION_REJECTED: return "rejected";
default: return "???";
}
}
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
{
- if (atomic_inc_not_zero(&s->s_ref)) {
- dout("mdsc get_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ if (refcount_inc_not_zero(&s->s_ref))
return s;
- } else {
- dout("mdsc get_session %p 0 -- FAIL", s);
- return NULL;
- }
+ return NULL;
}
void ceph_put_mds_session(struct ceph_mds_session *s)
{
- dout("mdsc put_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
- if (atomic_dec_and_test(&s->s_ref)) {
+ if (IS_ERR_OR_NULL(s))
+ return;
+
+ if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
- ceph_auth_destroy_authorizer(
- s->s_mdsc->fsc->client->monc.auth,
- s->s_auth.authorizer);
+ ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+ WARN_ON(mutex_is_locked(&s->s_mutex));
+ xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
@@ -378,22 +934,17 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
int mds)
{
- struct ceph_mds_session *session;
-
- if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return NULL;
- session = mdsc->sessions[mds];
- dout("lookup_mds_session %p %d\n", session,
- atomic_read(&session->s_ref));
- get_session(session);
- return session;
+ return ceph_get_mds_session(mdsc->sessions[mds]);
}
static bool __have_session(struct ceph_mds_client *mdsc, int mds)
{
- if (mds >= mdsc->max_sessions)
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return false;
- return mdsc->sessions[mds];
+ else
+ return true;
}
static int __verify_registered_session(struct ceph_mds_client *mdsc,
@@ -412,59 +963,63 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc,
static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *s;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+ return ERR_PTR(-EIO);
+
+ if (mds >= mdsc->mdsmap->possible_max_rank)
+ return ERR_PTR(-EINVAL);
+
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s)
return ERR_PTR(-ENOMEM);
+
+ if (mds >= mdsc->max_sessions) {
+ int newmax = 1 << get_count_order(mds + 1);
+ struct ceph_mds_session **sa;
+ size_t ptr_size = sizeof(struct ceph_mds_session *);
+
+ doutc(cl, "realloc to %d\n", newmax);
+ sa = kcalloc(newmax, ptr_size, GFP_NOFS);
+ if (!sa)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * ptr_size);
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+
+ doutc(cl, "mds%d\n", mds);
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
- s->s_ttl = 0;
- s->s_seq = 0;
mutex_init(&s->s_mutex);
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
- spin_lock_init(&s->s_gen_ttl_lock);
- s->s_cap_gen = 0;
+ atomic_set(&s->s_cap_gen, 1);
s->s_cap_ttl = jiffies - 1;
spin_lock_init(&s->s_cap_lock);
- s->s_renew_requested = 0;
- s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
- s->s_nr_caps = 0;
- s->s_trim_caps = 0;
- atomic_set(&s->s_ref, 1);
+ refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
- s->s_num_cap_releases = 0;
- s->s_cap_iterator = NULL;
+ xa_init(&s->s_delegated_inos);
INIT_LIST_HEAD(&s->s_cap_releases);
- INIT_LIST_HEAD(&s->s_cap_releases_done);
- INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+ INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
- dout("register_session mds%d\n", mds);
- if (mds >= mdsc->max_sessions) {
- int newmax = 1 << get_count_order(mds+1);
- struct ceph_mds_session **sa;
+ INIT_LIST_HEAD(&s->s_cap_dirty);
+ INIT_LIST_HEAD(&s->s_cap_flushing);
- dout("register_session realloc to %d\n", newmax);
- sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
- if (sa == NULL)
- goto fail_realloc;
- if (mdsc->sessions) {
- memcpy(sa, mdsc->sessions,
- mdsc->max_sessions * sizeof(void *));
- kfree(mdsc->sessions);
- }
- mdsc->sessions = sa;
- mdsc->max_sessions = newmax;
- }
mdsc->sessions[mds] = s;
- atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+ atomic_inc(&mdsc->num_sessions);
+ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
@@ -482,11 +1037,12 @@ fail_realloc:
static void __unregister_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
- dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
+ atomic_dec(&mdsc->num_sessions);
}
/*
@@ -502,28 +1058,59 @@ static void put_request_session(struct ceph_mds_request *req)
}
}
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state)
+{
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+ struct ceph_mds_session *s;
+
+ s = __ceph_lookup_mds_session(mdsc, mds);
+ if (!s)
+ continue;
+
+ if (check_state && !check_session_state(s)) {
+ ceph_put_mds_session(s);
+ continue;
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ cb(s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
void ceph_mdsc_release_request(struct kref *kref)
{
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ ceph_mdsc_release_dir_caps_async(req);
+ destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
}
- if (req->r_locked_dir)
- ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_target_inode)
- iput(req->r_target_inode);
+ if (req->r_parent) {
+ ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ iput(req->r_parent);
+ }
+ iput(req->r_target_inode);
+ iput(req->r_new_inode);
if (req->r_dentry)
dput(req->r_dentry);
- if (req->r_old_dentry) {
+ if (req->r_old_dentry)
+ dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
/*
* track (and drop pins for) r_old_dentry_dir
* separately, since r_old_dentry's d_parent may have
@@ -532,61 +1119,40 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- dput(req->r_old_dentry);
iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
kfree(req->r_path2);
+ put_cred(req->r_cred);
+ if (req->r_mnt_idmap)
+ mnt_idmap_put(req->r_mnt_idmap);
+ if (req->r_pagelist)
+ ceph_pagelist_release(req->r_pagelist);
+ kfree(req->r_fscrypt_auth);
+ kfree(req->r_altname);
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
- kfree(req);
+ WARN_ON_ONCE(!list_empty(&req->r_wait));
+ kmem_cache_free(ceph_mds_request_cachep, req);
}
+DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
+
/*
* lookup session, bump ref if found.
*
* called under mdsc->mutex.
*/
-static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
- u64 tid)
+static struct ceph_mds_request *
+lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
{
struct ceph_mds_request *req;
- struct rb_node *n = mdsc->request_tree.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_mds_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else {
- ceph_mdsc_get_request(req);
- return req;
- }
- }
- return NULL;
-}
-
-static void __insert_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *new)
-{
- struct rb_node **p = &mdsc->request_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_mds_request *req = NULL;
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_mds_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
+ req = lookup_request(&mdsc->request_tree, tid);
+ if (req)
+ ceph_mdsc_get_request(req);
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &mdsc->request_tree);
+ return req;
}
/*
@@ -599,23 +1165,38 @@ static void __register_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
struct inode *dir)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int ret = 0;
+
req->r_tid = ++mdsc->last_tid;
- if (req->r_num_caps)
- ceph_reserve_caps(mdsc, &req->r_caps_reservation,
- req->r_num_caps);
- dout("__register_request %p tid %lld\n", req, req->r_tid);
+ if (req->r_num_caps) {
+ ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+ req->r_num_caps);
+ if (ret < 0) {
+ pr_err_client(cl, "%p failed to reserve caps: %d\n",
+ req, ret);
+ /* set req->r_err to fail early from __do_request */
+ req->r_err = ret;
+ return;
+ }
+ }
+ doutc(cl, "%p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
- __insert_request(mdsc, req);
+ insert_request(&mdsc->request_tree, req);
+
+ req->r_cred = get_current_cred();
+ if (!req->r_mnt_idmap)
+ req->r_mnt_idmap = &nop_mnt_idmap;
- req->r_uid = current_fsuid();
- req->r_gid = current_fsgid();
+ if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+ mdsc->oldest_tid = req->r_tid;
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
+ spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
@@ -624,25 +1205,74 @@ static void __register_request(struct ceph_mds_client *mdsc,
static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &mdsc->request_tree);
- RB_CLEAR_NODE(&req->r_node);
+ doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid);
+
+ /* Never leave an unregistered request on an unsafe list! */
+ list_del_init(&req->r_unsafe_item);
+
+ if (req->r_tid == mdsc->oldest_tid) {
+ struct rb_node *p = rb_next(&req->r_node);
+ mdsc->oldest_tid = 0;
+ while (p) {
+ struct ceph_mds_request *next_req =
+ rb_entry(p, struct ceph_mds_request, r_node);
+ if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+ mdsc->oldest_tid = next_req->r_tid;
+ break;
+ }
+ p = rb_next(p);
+ }
+ }
+
+ erase_request(&mdsc->request_tree, req);
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
+ complete_all(&req->r_safe_completion);
+
ceph_mdsc_put_request(req);
}
/*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry)
+{
+ struct inode *inode = NULL;
+
+ while (dentry && !IS_ROOT(dentry)) {
+ inode = d_inode_rcu(dentry);
+ if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ dentry = dentry->d_parent;
+ }
+ if (inode)
+ inode = igrab(inode);
+ return inode;
+}
+
+/*
* Choose mds to send request to next. If there is a hint set in the
* request (e.g., due to a prior forward hint from the mds), use that.
* Otherwise, consult frag tree and/or caps to identify the
@@ -650,21 +1280,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
*
* Called under mdsc->mutex.
*/
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
- /*
- * we don't need to worry about protecting the d_parent access
- * here because we never renaming inside the snapped namespace
- * except to resplice to another snapdir, and either the old or new
- * result is a valid result.
- */
- while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
- dentry = dentry->d_parent;
- return dentry;
-}
-
static int __choose_mds(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+ struct ceph_mds_request *req,
+ bool *random)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -672,7 +1290,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
int mode = req->r_direct_mode;
int mds = -1;
u32 hash = req->r_direct_hash;
- bool is_hash = req->r_direct_is_hash;
+ bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ if (random)
+ *random = false;
/*
* is there a specific mds we should try? ignore hint if we have
@@ -681,8 +1303,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("choose_mds using resend_mds mds%d\n",
- req->r_resend_mds);
+ doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds);
return req->r_resend_mds;
}
@@ -691,36 +1312,57 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode = NULL;
if (req->r_inode) {
- inode = req->r_inode;
+ if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
+ inode = req->r_inode;
+ ihold(inode);
+ } else {
+ /* req->r_dentry is non-null for LSSNAP request */
+ rcu_read_lock();
+ inode = get_nonsnap_parent(req->r_dentry);
+ rcu_read_unlock();
+ doutc(cl, "using snapdir's parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
+ }
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
- struct dentry *parent = req->r_dentry->d_parent;
- struct inode *dir = parent->d_inode;
-
- if (dir->i_sb != mdsc->fsc->sb) {
- /* not this fs! */
- inode = req->r_dentry->d_inode;
+ struct dentry *parent;
+ struct inode *dir;
+
+ rcu_read_lock();
+ parent = READ_ONCE(req->r_dentry->d_parent);
+ dir = req->r_parent ? : d_inode_rcu(parent);
+
+ if (!dir || dir->i_sb != mdsc->fsc->sb) {
+ /* not this fs or parent went negative */
+ inode = d_inode(req->r_dentry);
+ if (inode)
+ ihold(inode);
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
- struct dentry *dn = get_nonsnap_parent(parent);
- inode = dn->d_inode;
- dout("__choose_mds using nonsnap parent %p\n", inode);
- } else if (req->r_dentry->d_inode) {
- /* dentry target */
- inode = req->r_dentry->d_inode;
+ inode = get_nonsnap_parent(parent);
+ doutc(cl, "using nonsnap parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
} else {
- /* dir + name */
- inode = dir;
- hash = ceph_dentry_hash(dir, req->r_dentry);
- is_hash = true;
+ /* dentry target */
+ inode = d_inode(req->r_dentry);
+ if (!inode || mode == USE_AUTH_MDS) {
+ /* dir + name */
+ inode = igrab(dir);
+ hash = ceph_dentry_hash(dir, req->r_dentry);
+ is_hash = true;
+ } else {
+ ihold(inode);
+ }
}
+ rcu_read_unlock();
}
- dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
- (int)hash, mode);
if (!inode)
goto random;
+
+ doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode,
+ ceph_vinop(inode), (int)is_hash, hash, mode);
ci = ceph_inode(inode);
if (is_hash && S_ISDIR(inode->i_mode)) {
@@ -736,30 +1378,31 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (%d/%d)\n",
- inode, ceph_vinop(inode),
- frag.frag, mds,
- (int)r, frag.ndist);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode), frag.frag,
+ mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- return mds;
+ CEPH_MDS_STATE_ACTIVE &&
+ !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
+ goto out;
}
/* since this file/dir wasn't known to be
* replicated, then we want to look for the
* authoritative mds. */
- mode = USE_AUTH_MDS;
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (auth)\n",
- inode, ceph_vinop(inode), frag.frag, mds);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- return mds;
+ CEPH_MDS_STATE_ACTIVE) {
+ if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ mds))
+ goto out;
+ }
}
+ mode = USE_AUTH_MDS;
}
}
@@ -771,18 +1414,24 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
if (!cap) {
spin_unlock(&ci->i_ceph_lock);
+ iput(inode);
goto random;
}
mds = cap->session->s_mds;
- dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
+ doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode,
+ ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
+out:
+ iput(inode);
return mds;
random:
+ if (random)
+ *random = true;
+
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("choose_mds chose random mds%d\n", mds);
+ doutc(cl, "chose random mds%d\n", mds);
return mds;
}
@@ -790,7 +1439,7 @@ random:
/*
* session messages
*/
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -798,12 +1447,210 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session %s msg\n",
+ ceph_session_op_name(op));
return NULL;
}
h = msg->front.iov_base;
h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
+
+ return msg;
+}
+
+static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
+#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
+static int encode_supported_features(void **p, void *end)
+{
+ static const size_t count = ARRAY_SIZE(feature_bits);
+
+ if (count > 0) {
+ size_t i;
+ size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
+
+ if (WARN_ON_ONCE(*p + 4 + size > end))
+ return -ERANGE;
+
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
+ *p += size;
+ } else {
+ if (WARN_ON_ONCE(*p + 4 > end))
+ return -ERANGE;
+
+ ceph_encode_32(p, 0);
+ }
+
+ return 0;
+}
+
+static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
+#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
+static int encode_metric_spec(void **p, void *end)
+{
+ static const size_t count = ARRAY_SIZE(metric_bits);
+
+ /* header */
+ if (WARN_ON_ONCE(*p + 2 > end))
+ return -ERANGE;
+
+ ceph_encode_8(p, 1); /* version */
+ ceph_encode_8(p, 1); /* compat */
+
+ if (count > 0) {
+ size_t i;
+ size_t size = METRIC_BYTES(count);
+
+ if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4 + size);
+
+ /* metric spec */
+ ceph_encode_32(p, size);
+ memset(*p, 0, size);
+ for (i = 0; i < count; i++)
+ ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
+ *p += size;
+ } else {
+ if (WARN_ON_ONCE(*p + 4 + 4 > end))
+ return -ERANGE;
+
+ /* metric spec info length */
+ ceph_encode_32(p, 4);
+ /* metric spec */
+ ceph_encode_32(p, 0);
+ }
+
+ return 0;
+}
+
+/*
+ * session message, specialization for CEPH_SESSION_REQUEST_OPEN
+ * to include additional client metadata fields.
+ */
+static struct ceph_msg *
+create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+ int i;
+ int extra_bytes = 0;
+ int metadata_key_count = 0;
+ struct ceph_options *opt = mdsc->fsc->client->options;
+ struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ struct ceph_client *cl = mdsc->fsc->client;
+ size_t size, count;
+ void *p, *end;
+ int ret;
+
+ const char* metadata[][2] = {
+ {"hostname", mdsc->nodename},
+ {"kernel_version", init_utsname()->release},
+ {"entity_id", opt->name ? : ""},
+ {"root", fsopt->server_path ? : "/"},
+ {NULL, NULL}
+ };
+
+ /* Calculate serialized length of metadata */
+ extra_bytes = 4; /* map length */
+ for (i = 0; metadata[i][0]; ++i) {
+ extra_bytes += 8 + strlen(metadata[i][0]) +
+ strlen(metadata[i][1]);
+ metadata_key_count++;
+ }
+
+ /* supported feature */
+ size = 0;
+ count = ARRAY_SIZE(feature_bits);
+ if (count > 0)
+ size = FEATURE_BYTES(count);
+ extra_bytes += 4 + size;
+
+ /* metric spec */
+ size = 0;
+ count = ARRAY_SIZE(metric_bits);
+ if (count > 0)
+ size = METRIC_BYTES(count);
+ extra_bytes += 2 + 4 + 4 + size;
+
+ /* flags, mds auth caps and oldest_client_tid */
+ extra_bytes += 4 + 4 + 8;
+
+ /* Allocate the message */
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
+ GFP_NOFS, false);
+ if (!msg) {
+ pr_err_client(cl, "ENOMEM creating session open msg\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ h = p;
+ h->op = cpu_to_le32(op);
+ h->seq = cpu_to_le64(seq);
+
+ /*
+ * Serialize client metadata into waiting buffer space, using
+ * the format that userspace expects for map<string, string>
+ *
+ * ClientSession messages with metadata are v7
+ */
+ msg->hdr.version = cpu_to_le16(7);
+ msg->hdr.compat_version = cpu_to_le16(1);
+
+ /* The write pointer, following the session_head structure */
+ p += sizeof(*h);
+
+ /* Number of entries in the map */
+ ceph_encode_32(&p, metadata_key_count);
+
+ /* Two length-prefixed strings for each entry in the map */
+ for (i = 0; metadata[i][0]; ++i) {
+ size_t const key_len = strlen(metadata[i][0]);
+ size_t const val_len = strlen(metadata[i][1]);
+
+ ceph_encode_32(&p, key_len);
+ memcpy(p, metadata[i][0], key_len);
+ p += key_len;
+ ceph_encode_32(&p, val_len);
+ memcpy(p, metadata[i][1], val_len);
+ p += val_len;
+ }
+
+ ret = encode_supported_features(&p, end);
+ if (ret) {
+ pr_err_client(cl, "encode_supported_features failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
+ ret = encode_metric_spec(&p, end);
+ if (ret) {
+ pr_err_client(cl, "encode_metric_spec failed!\n");
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+
+ /* version == 5, flags */
+ ceph_encode_32(&p, 0);
+
+ /* version == 6, mds auth caps */
+ ceph_encode_32(&p, 0);
+
+ /* version == 7, oldest_client_tid */
+ ceph_encode_64(&p, mdsc->oldest_tid);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
return msg;
}
@@ -819,17 +1666,21 @@ static int __open_session(struct ceph_mds_client *mdsc,
int mstate;
int mds = session->s_mds;
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
+ return -EIO;
+
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout("open_session to mds%d (%s)\n", mds,
- ceph_mds_state_name(mstate));
+ doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
session->s_state = CEPH_MDS_SESSION_OPENING;
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
- if (!msg)
- return -ENOMEM;
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,
+ session->s_seq);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -839,71 +1690,122 @@ static int __open_session(struct ceph_mds_client *mdsc,
*
* called under mdsc->mutex
*/
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+ int ret;
+
+ session = __ceph_lookup_mds_session(mdsc, target);
+ if (!session) {
+ session = register_session(mdsc, target);
+ if (IS_ERR(session))
+ return session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ ret = __open_session(mdsc, session);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "to mds%d\n", target);
+
+ mutex_lock(&mdsc->mutex);
+ session = __open_export_target_session(mdsc, target);
+ mutex_unlock(&mdsc->mutex);
+
+ return session;
+}
+
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- int target;
+ struct ceph_client *cl = mdsc->fsc->client;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->possible_max_rank)
return;
+
mi = &mdsc->mdsmap->m_info[mds];
- dout("open_export_target_sessions for mds%d (%d targets)\n",
- session->s_mds, mi->num_export_targets);
+ doutc(cl, "for mds%d (%d targets)\n", session->s_mds,
+ mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
- target = mi->export_targets[i];
- ts = __ceph_lookup_mds_session(mdsc, target);
- if (!ts) {
- ts = register_session(mdsc, target);
- if (IS_ERR(ts))
- return;
- }
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- else
- dout(" mds%d target mds%d %p is %s\n", session->s_mds,
- i, ts, session_state_name(ts->s_state));
+ ts = __open_export_target_session(mdsc, mi->export_targets[i]);
ceph_put_mds_session(ts);
}
}
-void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- mutex_lock(&mdsc->mutex);
- __open_export_target_sessions(mdsc, session);
- mutex_unlock(&mdsc->mutex);
-}
-
/*
* session caps
*/
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
+static void detach_cap_releases(struct ceph_mds_session *session,
+ struct list_head *target)
{
- struct ceph_msg *msg;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
+ lockdep_assert_held(&session->s_cap_lock);
+
+ list_splice_init(&session->s_cap_releases, target);
+ session->s_num_cap_releases = 0;
+ doutc(cl, "mds%d\n", session->s_mds);
+}
+
+static void dispose_cap_releases(struct ceph_mds_client *mdsc,
+ struct list_head *dispose)
+{
+ while (!list_empty(dispose)) {
+ struct ceph_cap *cap;
+ /* zero out the in-progress message */
+ cap = list_first_entry(dispose, struct ceph_cap, session_caps);
+ list_del(&cap->session_caps);
+ ceph_put_cap(mdsc, cap);
}
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
+}
+
+static void cleanup_session_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_request *req;
+ struct rb_node *p;
+
+ doutc(cl, "mds%d\n", session->s_mds);
+ mutex_lock(&mdsc->mutex);
+ while (!list_empty(&session->s_unsafe)) {
+ req = list_first_entry(&session->s_unsafe,
+ struct ceph_mds_request, r_unsafe_item);
+ pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n",
+ req->r_tid);
+ if (req->r_target_inode)
+ mapping_set_error(req->r_target_inode->i_mapping, -EIO);
+ if (req->r_unsafe_dir)
+ mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
+ __unregister_request(mdsc, req);
}
- spin_unlock(&session->s_cap_lock);
+ /* zero r_attempts, so kick_requests() will re-send requests */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds)
+ req->r_attempts = 0;
+ }
+ mutex_unlock(&mdsc->mutex);
}
/*
@@ -912,27 +1814,31 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
*
* Caller must hold session s_mutex.
*/
-static int iterate_session_caps(struct ceph_mds_session *session,
- int (*cb)(struct inode *, struct ceph_cap *,
- void *), void *arg)
+int ceph_iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, int mds, void *),
+ void *arg)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct list_head *p;
struct ceph_cap *cap;
struct inode *inode, *last_inode = NULL;
struct ceph_cap *old_cap = NULL;
int ret;
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ doutc(cl, "%p mds%d\n", session, session->s_mds);
spin_lock(&session->s_cap_lock);
p = session->s_caps.next;
while (p != &session->s_caps) {
+ int mds;
+
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
}
session->s_cap_iterator = cap;
+ mds = cap->mds;
spin_unlock(&session->s_cap_lock);
if (last_inode) {
@@ -944,19 +1850,22 @@ static int iterate_session_caps(struct ceph_mds_session *session,
old_cap = NULL;
}
- ret = cb(inode, cap, arg);
+ ret = cb(inode, mds, arg);
last_inode = inode;
spin_lock(&session->s_cap_lock);
p = p->next;
- if (cap->ci == NULL) {
- dout("iterate_session_caps finishing cap %p removal\n",
- cap);
+ if (!cap->ci) {
+ doutc(cl, "finishing cap %p removal\n", cap);
BUG_ON(cap->session != session);
+ cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- cap->session = NULL;
- old_cap = cap; /* put_cap it w/o locks held */
+ atomic64_dec(&session->s_mdsc->metric.total_caps);
+ if (cap->queue_release)
+ __ceph_queue_cap_release(session, cap);
+ else
+ old_cap = cap; /* put_cap it w/o locks held */
}
if (ret < 0)
goto out;
@@ -966,57 +1875,36 @@ out:
session->s_cap_iterator = NULL;
spin_unlock(&session->s_cap_lock);
- if (last_inode)
- iput(last_inode);
+ iput(last_inode);
if (old_cap)
ceph_put_cap(session->s_mdsc, old_cap);
return ret;
}
-static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int drop = 0;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ bool invalidate = false;
+ struct ceph_cap *cap;
+ int iputs = 0;
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- if (!__ceph_is_any_real_caps(ci)) {
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
-
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- pr_info(" dropping dirty %s state for %p %lld\n",
- ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
- ci->i_dirty_caps = 0;
- list_del_init(&ci->i_dirty_item);
- drop = 1;
- }
- if (!list_empty(&ci->i_flushing_item)) {
- pr_info(" dropping dirty+flushing %s state for %p %lld\n",
- ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
- ci->i_flushing_caps = 0;
- list_del_init(&ci->i_flushing_item);
- mdsc->num_cap_flushing--;
- drop = 1;
- }
- if (drop && ci->i_wrbuffer_ref) {
- pr_info(" dropping dirty data for %p %lld\n",
- inode, ceph_ino(inode));
- ci->i_wrbuffer_ref = 0;
- ci->i_wrbuffer_ref_head = 0;
- drop++;
- }
- spin_unlock(&mdsc->cap_dirty_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap) {
+ doutc(cl, " removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
+
+ iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
}
spin_unlock(&ci->i_ceph_lock);
- while (drop--)
+
+ if (cap)
+ wake_up_all(&ci->i_cap_wq);
+ if (invalidate)
+ ceph_queue_invalidate(inode);
+ while (iputs--)
iput(inode);
return 0;
}
@@ -1026,40 +1914,96 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
*/
static void remove_session_caps(struct ceph_mds_session *session)
{
- dout("remove_session_caps on %p\n", session);
- iterate_session_caps(session, remove_session_caps_cb, NULL);
+ struct ceph_fs_client *fsc = session->s_mdsc->fsc;
+ struct super_block *sb = fsc->sb;
+ LIST_HEAD(dispose);
+
+ doutc(fsc->client, "on %p\n", session);
+ ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
+
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
+ spin_lock(&session->s_cap_lock);
+ if (session->s_nr_caps > 0) {
+ struct inode *inode;
+ struct ceph_cap *cap, *prev = NULL;
+ struct ceph_vino vino;
+ /*
+ * iterate_session_caps() skips inodes that are being
+ * deleted, we need to wait until deletions are complete.
+ * __wait_on_freeing_inode() is designed for the job,
+ * but it is not exported, so use lookup inode function
+ * to access it.
+ */
+ while (!list_empty(&session->s_caps)) {
+ cap = list_entry(session->s_caps.next,
+ struct ceph_cap, session_caps);
+ if (cap == prev)
+ break;
+ prev = cap;
+ vino = cap->ci->i_vino;
+ spin_unlock(&session->s_cap_lock);
+
+ inode = ceph_find_inode(sb, vino);
+ iput(inode);
+
+ spin_lock(&session->s_cap_lock);
+ }
+ }
+
+ // drop cap expires and unlock s_cap_lock
+ detach_cap_releases(session, &dispose);
+
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
- cleanup_cap_releases(session);
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(session->s_mdsc, &dispose);
}
+enum {
+ RECONNECT,
+ RENEWCAPS,
+ FORCE_RO,
+};
+
/*
* wake up any threads waiting on this session's caps. if the cap is
* old (didn't get renewed on the client reconnect), remove it now.
*
* caller must hold s_mutex.
*/
-static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned long ev = (unsigned long)arg;
- wake_up_all(&ci->i_cap_wq);
- if (arg) {
+ if (ev == RECONNECT) {
spin_lock(&ci->i_ceph_lock);
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
+ } else if (ev == RENEWCAPS) {
+ struct ceph_cap *cap;
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ /* mds did not re-issue stale cap */
+ if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
+ cap->issued = cap->implemented = CEPH_CAP_PIN;
+ spin_unlock(&ci->i_ceph_lock);
+ } else if (ev == FORCE_RO) {
}
+ wake_up_all(&ci->i_cap_wq);
return 0;
}
-static void wake_up_session_caps(struct ceph_mds_session *session,
- int reconnect)
+static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
- dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
- iterate_session_caps(session, wake_up_session_cb,
- (void *)(unsigned long)reconnect);
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
+ doutc(cl, "session %p mds%d\n", session, session->s_mds);
+ ceph_iterate_session_caps(session, wake_up_session_cb,
+ (void *)(unsigned long)ev);
}
/*
@@ -1071,33 +2015,50 @@ static void wake_up_session_caps(struct ceph_mds_session *session,
static int send_renew_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
int state;
if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested))
- pr_info("mds%d caps stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps stale\n", session->s_mds);
session->s_renew_requested = jiffies;
/* do not try to renew caps until a recovering mds has reconnected
* with its clients. */
state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
if (state < CEPH_MDS_STATE_RECONNECT) {
- dout("send_renew_caps ignoring mds%d (%s)\n",
- session->s_mds, ceph_mds_state_name(state));
+ doutc(cl, "ignoring mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
return 0;
}
- dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
- ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
+ doutc(cl, "to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, u64 seq)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_msg *msg;
+
+ doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
return 0;
}
+
/*
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
*
@@ -1106,6 +2067,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
static void renewed_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, int is_renew)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int was_stale;
int wake = 0;
@@ -1117,37 +2079,39 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
if (was_stale) {
if (time_before(jiffies, session->s_cap_ttl)) {
- pr_info("mds%d caps renewed\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps renewed\n",
+ session->s_mds);
wake = 1;
} else {
- pr_info("mds%d caps still stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps still stale\n",
+ session->s_mds);
}
}
- dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
- session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
- time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds,
+ session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
spin_unlock(&session->s_cap_lock);
if (wake)
- wake_up_session_caps(session, 0);
+ wake_up_session_caps(session, RENEWCAPS);
}
/*
* send a session close request
*/
-static int request_close_session(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int request_close_session(struct ceph_mds_session *session)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
- dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, session_state_name(session->s_state),
- session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ doutc(cl, "mds%d state %s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), session->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+ session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
- return 0;
+ return 1;
}
/*
@@ -1159,7 +2123,30 @@ static int __close_session(struct ceph_mds_client *mdsc,
if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
return 0;
session->s_state = CEPH_MDS_SESSION_CLOSING;
- return request_close_session(mdsc, session);
+ return request_close_session(session);
+}
+
+static bool drop_negative_children(struct dentry *dentry)
+{
+ struct dentry *child;
+ bool all_negative = true;
+
+ if (!d_is_dir(dentry))
+ goto out;
+
+ spin_lock(&dentry->d_lock);
+ hlist_for_each_entry(child, &dentry->d_children, d_sib) {
+ if (d_really_is_positive(child)) {
+ all_negative = false;
+ break;
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (all_negative)
+ shrink_dcache_parent(dentry);
+out:
+ return all_negative;
}
/*
@@ -1172,40 +2159,77 @@ static int __close_session(struct ceph_mds_client *mdsc,
* Yes, this is a bit sloppy. Our only real goal here is to respond to
* memory pressure from the MDS, though, so it needn't be perfect.
*/
-static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+static int trim_caps_cb(struct inode *inode, int mds, void *arg)
{
- struct ceph_mds_session *session = arg;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
- int used, oissued, mine;
+ int used, wanted, oissued, mine;
+ struct ceph_cap *cap;
- if (session->s_trim_caps <= 0)
+ if (*remaining <= 0)
return -1;
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ return 0;
+ }
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
+ wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
- inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used));
- if (ci->i_dirty_caps)
- goto out; /* dirty caps */
- if ((used & ~oissued) & mine)
+ doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(mine),
+ ceph_cap_string(oissued), ceph_cap_string(used),
+ ceph_cap_string(wanted));
+ if (cap == ci->i_auth_cap) {
+ if (ci->i_dirty_caps || ci->i_flushing_caps ||
+ !list_empty(&ci->i_cap_snaps))
+ goto out;
+ if ((used | wanted) & CEPH_CAP_ANY_WR)
+ goto out;
+ /* Note: it's possible that i_filelock_ref becomes non-zero
+ * after dropping auth caps. It doesn't hurt because reply
+ * of lock mds request will re-add auth caps. */
+ if (atomic_read(&ci->i_filelock_ref) > 0)
+ goto out;
+ }
+ /* The inode has cached pages, but it's no longer used.
+ * we can safely drop it */
+ if (S_ISREG(inode->i_mode) &&
+ wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
+ if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
- session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
- __ceph_remove_cap(cap);
+ ceph_remove_cap(mdsc, cap, true);
+ (*remaining)--;
} else {
- /* try to drop referring dentries */
+ struct dentry *dentry;
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
- d_prune_aliases(inode);
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, atomic_read(&inode->i_count));
+ dentry = d_find_any_alias(inode);
+ if (dentry && drop_negative_children(dentry)) {
+ int count;
+ dput(dentry);
+ d_prune_aliases(inode);
+ count = icount_read(inode);
+ if (count == 1)
+ (*remaining)--;
+ doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
+ inode, ceph_vinop(inode), cap, count);
+ } else {
+ dput(dentry);
+ }
return 0;
}
@@ -1217,229 +2241,364 @@ out:
/*
* Trim session cap count down to some max number.
*/
-static int trim_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- int max_caps)
+int ceph_trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int trim_caps = session->s_nr_caps - max_caps;
- dout("trim_caps mds%d start: %d / %d, trim %d\n",
- session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds,
+ session->s_nr_caps, max_caps, trim_caps);
if (trim_caps > 0) {
- session->s_trim_caps = trim_caps;
- iterate_session_caps(session, trim_caps_cb, session);
- dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
- session->s_mds, session->s_nr_caps, max_caps,
- trim_caps - session->s_trim_caps);
- session->s_trim_caps = 0;
+ int remaining = trim_caps;
+
+ ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
+ doutc(cl, "mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - remaining);
}
+
+ ceph_flush_session_cap_releases(mdsc, session);
return 0;
}
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int ret = 1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ doutc(cl, "still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
+ ret = 0;
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ return ret;
+}
+
/*
- * Allocate cap_release messages. If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
+ * flush all dirty inode data to disk.
*
- * Called under s_mutex.
+ * returns true if we've flushed through want_flush_tid
*/
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "want %llu\n", want_flush_tid);
+
+ wait_event(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid));
+
+ doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
+}
+
+/*
+ * called under s_mutex
+ */
+static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
- struct ceph_msg *msg, *partial = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
- int err = -ENOMEM;
- int extra = mdsc->fsc->mount_options->cap_release_safety;
- int num;
+ struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ struct ceph_cap *cap;
+ LIST_HEAD(tmp_list);
+ int num_cap_releases;
+ __le32 barrier, *cap_barrier;
- dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
- extra);
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
spin_lock(&session->s_cap_lock);
+again:
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ num_cap_releases = session->s_num_cap_releases;
+ session->s_num_cap_releases = 0;
+ spin_unlock(&session->s_cap_lock);
- if (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg,
- list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- if (num) {
- dout(" partial %p with (%d/%d)\n", msg, num,
- (int)CEPH_CAPS_PER_RELEASE);
- extra += CEPH_CAPS_PER_RELEASE - num;
- partial = msg;
+ while (!list_empty(&tmp_list)) {
+ if (!msg) {
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
+ PAGE_SIZE, GFP_NOFS, false);
+ if (!msg)
+ goto out_err;
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
}
- }
- while (session->s_num_cap_releases < session->s_nr_caps + extra) {
- spin_unlock(&session->s_cap_lock);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS, false);
- if (!msg)
- goto out_unlocked;
- dout("add_cap_releases %p msg %p now %d\n", session, msg,
- (int)msg->front.iov_len);
+
+ cap = list_first_entry(&tmp_list, struct ceph_cap,
+ session_caps);
+ list_del(&cap->session_caps);
+ num_cap_releases--;
+
head = msg->front.iov_base;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- spin_lock(&session->s_cap_lock);
- list_add(&msg->list_head, &session->s_cap_releases);
- session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+ put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
+ &head->num);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(cap->cap_ino);
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->issue_seq = cpu_to_le32(cap->issue_seq);
+ msg->front.iov_len += sizeof(*item);
+
+ ceph_put_cap(mdsc, cap);
+
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ msg = NULL;
+ }
}
- if (partial) {
- head = partial->front.iov_base;
- num = le32_to_cpu(head->num);
- dout(" queueing partial %p with %d/%d\n", partial, num,
- (int)CEPH_CAPS_PER_RELEASE);
- list_move_tail(&partial->list_head,
- &session->s_cap_releases_done);
- session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+ BUG_ON(num_cap_releases != 0);
+
+ spin_lock(&session->s_cap_lock);
+ if (!list_empty(&session->s_cap_releases))
+ goto again;
+ spin_unlock(&session->s_cap_lock);
+
+ if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
}
- err = 0;
+ return;
+out_err:
+ pr_err_client(cl, "mds%d, failed to allocate message\n",
+ session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ list_splice(&tmp_list, &session->s_cap_releases);
+ session->s_num_cap_releases += num_cap_releases;
spin_unlock(&session->s_cap_lock);
-out_unlocked:
- return err;
}
-/*
- * flush all dirty inode data to disk.
- *
- * returns true if we've flushed through want_flush_seq
- */
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void ceph_cap_release_work(struct work_struct *work)
{
- int mds, ret = 1;
+ struct ceph_mds_session *session =
+ container_of(work, struct ceph_mds_session, s_cap_release_work);
- dout("check_cap_flush want %lld\n", want_flush_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
+ mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_OPEN ||
+ session->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(session->s_mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+}
- if (!session)
- continue;
- get_session(session);
- mutex_unlock(&mdsc->mutex);
+void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
-
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
- }
- spin_unlock(&ci->i_ceph_lock);
- }
- mutex_unlock(&session->s_mutex);
+ ceph_get_mds_session(session);
+ if (queue_work(mdsc->fsc->cap_wq,
+ &session->s_cap_release_work)) {
+ doutc(cl, "cap release work queued\n");
+ } else {
ceph_put_mds_session(session);
-
- if (!ret)
- return ret;
- mutex_lock(&mdsc->mutex);
+ doutc(cl, "failed to queue cap release work\n");
}
-
- mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
}
/*
- * called under s_mutex
+ * caller holds session->s_cap_lock
*/
-void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap)
{
- struct ceph_msg *msg;
+ list_add_tail(&cap->session_caps, &session->s_cap_releases);
+ session->s_num_cap_releases++;
- dout("send_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- spin_unlock(&session->s_cap_lock);
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
- ceph_con_send(&session->s_con, msg);
- spin_lock(&session->s_cap_lock);
- }
- spin_unlock(&session->s_cap_lock);
+ if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
+ ceph_flush_session_cap_releases(session->s_mdsc, session);
}
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static void ceph_cap_reclaim_work(struct work_struct *work)
{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- unsigned num;
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_reclaim_work);
+ int ret = ceph_trim_dentries(mdsc);
+ if (ret == -EAGAIN)
+ ceph_queue_cap_reclaim_work(mdsc);
+}
- dout("discard_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
+void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
-
- /* requeue completed messages */
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
+ doutc(cl, "caps reclaim work queued\n");
+ } else {
+ doutc(cl, "failed to queue caps release work\n");
+ }
+}
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
- num);
- session->s_num_cap_releases += num;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- list_add(&msg->list_head, &session->s_cap_releases);
+void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
+{
+ int val;
+ if (!nr)
+ return;
+ val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
+ if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
+ atomic_set(&mdsc->cap_reclaim_pending, 0);
+ ceph_queue_cap_reclaim_work(mdsc);
}
+}
- spin_unlock(&session->s_cap_lock);
+void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
+ doutc(cl, "caps unlink work queued\n");
+ } else {
+ doutc(cl, "failed to queue caps unlink work\n");
+ }
+}
+
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_unlink_work);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&mdsc->cap_delay_lock);
+ while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->netfs.inode);
+ if (inode) {
+ spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+ iput(inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ }
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "done\n");
}
/*
* requests
*/
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(struct ceph_mds_reply_dir_entry);
+ unsigned int num_entries;
+ u64 bytes_count;
+ int order;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1U);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ bytes_count = (u64)size * num_entries;
+ if (unlikely(bytes_count > ULONG_MAX))
+ bytes_count = ULONG_MAX;
+
+ order = get_order((unsigned long)bytes_count);
+ while (order >= 0) {
+ rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_ZERO,
+ order);
+ if (rinfo->dir_entries)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_entries || unlikely(order < 0))
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
/*
* Create an mds request.
*/
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
- struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct ceph_mds_request *req;
+ req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
mutex_init(&req->r_fill_mutex);
req->r_mdsc = mdsc;
req->r_started = jiffies;
+ req->r_start_latency = ktime_get();
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
+ req->r_feature_needed = -1;
kref_init(&req->r_kref);
+ RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_wait);
init_completion(&req->r_completion);
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
+ ktime_get_coarse_real_ts64(&req->r_stamp);
+
req->r_op = op;
req->r_direct_mode = mode;
return req;
@@ -1458,145 +2617,275 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
struct ceph_mds_request, r_node);
}
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
{
- struct ceph_mds_request *req = __get_oldest_req(mdsc);
-
- if (req)
- return req->r_tid;
- return 0;
+ return mdsc->oldest_tid;
}
-/*
- * Build a dentry's path. Allocate on heap; caller must kfree. Based
- * on build_path_from_dentry in fs/cifs/dir.c.
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
+{
+ struct inode *dir = req->r_parent;
+ struct dentry *dentry = req->r_dentry;
+ const struct qstr *name = req->r_dname;
+ u8 *cryptbuf = NULL;
+ u32 len = 0;
+ int ret = 0;
+
+ /* only encode if we have parent and dentry */
+ if (!dir || !dentry)
+ goto success;
+
+ /* No-op unless this is encrypted */
+ if (!IS_ENCRYPTED(dir))
+ goto success;
+
+ ret = ceph_fscrypt_prepare_readdir(dir);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ /* No key? Just ignore it. */
+ if (!fscrypt_has_encryption_key(dir))
+ goto success;
+
+ if (!name)
+ name = &dentry->d_name;
+
+ if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) {
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+
+ /* No need to append altname if name is short enough */
+ if (len <= CEPH_NOHASH_NAME_MAX) {
+ len = 0;
+ goto success;
+ }
+
+ cryptbuf = kmalloc(len, GFP_KERNEL);
+ if (!cryptbuf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len);
+ if (ret) {
+ kfree(cryptbuf);
+ return ERR_PTR(ret);
+ }
+success:
+ *plen = len;
+ return cryptbuf;
+}
+#else
+static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
+{
+ *plen = 0;
+ return NULL;
+}
+#endif
+
+/**
+ * ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
+ * @dentry: dentry to which path should be built
+ * @path_info: output path, length, base ino+snap, and freepath ownership flag
+ * @for_wire: is this path going to be sent to the MDS?
*
- * If @stop_on_nosnap, generate path relative to the first non-snapped
- * inode.
+ * Build a string that represents the path to the dentry. This is mostly called
+ * for two different purposes:
+ *
+ * 1) we need to build a path string to send to the MDS (for_wire == true)
+ * 2) we need a path string for local presentation (e.g. debugfs)
+ * (for_wire == false)
+ *
+ * The path is built in reverse, starting with the dentry. Walk back up toward
+ * the root, building the path until the first non-snapped inode is reached
+ * (for_wire) or the root inode is reached (!for_wire).
*
* Encode hidden .snap dirs as a double /, i.e.
* foo/.snap/bar -> foo//bar
*/
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap)
+char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ struct ceph_path_info *path_info, int for_wire)
{
- struct dentry *temp;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct dentry *cur;
+ struct inode *inode;
char *path;
- int len, pos;
+ int pos;
unsigned seq;
+ u64 base;
- if (dentry == NULL)
+ if (!dentry)
return ERR_PTR(-EINVAL);
+ path = __getname();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
retry:
- len = 0;
- seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- for (temp = dentry; !IS_ROOT(temp);) {
- struct inode *inode = temp->d_inode;
- if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
- len++; /* slash only */
- else if (stop_on_nosnap && inode &&
- ceph_snap(inode) == CEPH_NOSNAP)
- break;
- else
- len += 1 + temp->d_name.len;
- temp = temp->d_parent;
- }
- rcu_read_unlock();
- if (len)
- len--; /* no leading '/' */
+ pos = PATH_MAX - 1;
+ path[pos] = '\0';
- path = kmalloc(len+1, GFP_NOFS);
- if (path == NULL)
- return ERR_PTR(-ENOMEM);
- pos = len;
- path[pos] = 0; /* trailing null */
- rcu_read_lock();
- for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
- struct inode *inode;
+ seq = read_seqbegin(&rename_lock);
+ cur = dget(dentry);
+ for (;;) {
+ struct dentry *parent;
- spin_lock(&temp->d_lock);
- inode = temp->d_inode;
+ spin_lock(&cur->d_lock);
+ inode = d_inode(cur);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path path+%d: %p SNAPDIR\n",
- pos, temp);
- } else if (stop_on_nosnap && inode &&
+ doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur);
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
+ } else if (for_wire && inode && dentry != cur &&
ceph_snap(inode) == CEPH_NOSNAP) {
- spin_unlock(&temp->d_lock);
+ spin_unlock(&cur->d_lock);
+ pos++; /* get rid of any prepended '/' */
break;
+ } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {
+ pos -= cur->d_name.len;
+ if (pos < 0) {
+ spin_unlock(&cur->d_lock);
+ break;
+ }
+ memcpy(path + pos, cur->d_name.name, cur->d_name.len);
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
} else {
- pos -= temp->d_name.len;
+ int len, ret;
+ char buf[NAME_MAX];
+
+ /*
+ * Proactively copy name into buf, in case we need to
+ * present it as-is.
+ */
+ memcpy(buf, cur->d_name.name, cur->d_name.len);
+ len = cur->d_name.len;
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
+
+ ret = ceph_fscrypt_prepare_readdir(d_inode(parent));
+ if (ret < 0) {
+ dput(parent);
+ dput(cur);
+ return ERR_PTR(ret);
+ }
+
+ if (fscrypt_has_encryption_key(d_inode(parent))) {
+ len = ceph_encode_encrypted_dname(d_inode(parent),
+ buf, len);
+ if (len < 0) {
+ dput(parent);
+ dput(cur);
+ return ERR_PTR(len);
+ }
+ }
+ pos -= len;
if (pos < 0) {
- spin_unlock(&temp->d_lock);
+ dput(parent);
break;
}
- strncpy(path + pos, temp->d_name.name,
- temp->d_name.len);
+ memcpy(path + pos, buf, len);
}
- spin_unlock(&temp->d_lock);
- if (pos)
- path[--pos] = '/';
- temp = temp->d_parent;
+ dput(cur);
+ cur = parent;
+
+ /* Are we at the root? */
+ if (IS_ROOT(cur))
+ break;
+
+ /* Are we out of buffer? */
+ if (--pos < 0)
+ break;
+
+ path[pos] = '/';
}
- rcu_read_unlock();
- if (pos != 0 || read_seqretry(&rename_lock, seq)) {
- pr_err("build_path did not end path lookup where "
- "expected, namelen is %d, pos is %d\n", len, pos);
- /* presumably this is only possible if racing with a
- rename of one of the parent directories (we can not
- lock the dentries above us to prevent this, but
- retrying should be harmless) */
- kfree(path);
+ inode = d_inode(cur);
+ base = inode ? ceph_ino(inode) : 0;
+ dput(cur);
+
+ if (read_seqretry(&rename_lock, seq))
goto retry;
+
+ if (pos < 0) {
+ /*
+ * The path is longer than PATH_MAX and this function
+ * cannot ever succeed. Creating paths that long is
+ * possible with Ceph, but Linux cannot use them.
+ */
+ return ERR_PTR(-ENAMETOOLONG);
}
- *base = ceph_ino(temp->d_inode);
- *plen = len;
- dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, d_count(dentry), *base, len, path);
- return path;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
+ doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
+ base, PATH_MAX - 1 - pos, path + pos);
+ return path + pos;
}
-static int build_dentry_path(struct dentry *dentry,
- const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(dentry->d_parent->d_inode);
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ rcu_read_lock();
+ if (!dir)
+ dir = d_inode_rcu(dentry->d_parent);
+ if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
+ !IS_ENCRYPTED(dir)) {
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
+ rcu_read_unlock();
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ rcu_read_unlock();
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = 1;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = 1;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -1604,79 +2893,228 @@ static int build_inode_path(struct inode *inode,
* request arguments may be specified via an inode *, a dentry *, or
* an explicit ino+path.
*/
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- const char *rpath, u64 rino,
- const char **ppath, int *pathlen,
- u64 *ino, int *freepath)
+static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+ struct dentry *rdentry, struct inode *rdiri,
+ const char *rpath, u64 rino,
+ struct ceph_path_info *path_info,
+ bool parent_locked)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
- dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
- ceph_snap(rinode));
+ r = build_inode_path(rinode, path_info);
+ doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
- dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
- *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- dout(" path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
}
+static void encode_mclientrequest_tail(void **p,
+ const struct ceph_mds_request *req)
+{
+ struct ceph_timespec ts;
+ int i;
+
+ ceph_encode_timespec64(&ts, &req->r_stamp);
+ ceph_encode_copy(p, &ts, sizeof(ts));
+
+ /* v4: gid_list */
+ ceph_encode_32(p, req->r_cred->group_info->ngroups);
+ for (i = 0; i < req->r_cred->group_info->ngroups; i++)
+ ceph_encode_64(p, from_kgid(&init_user_ns,
+ req->r_cred->group_info->gid[i]));
+
+ /* v5: altname */
+ ceph_encode_32(p, req->r_altname_len);
+ ceph_encode_copy(p, req->r_altname, req->r_altname_len);
+
+ /* v6: fscrypt_auth and fscrypt_file */
+ if (req->r_fscrypt_auth) {
+ u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+ ceph_encode_32(p, authlen);
+ ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
+ } else {
+ ceph_encode_32(p, 0);
+ }
+ if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {
+ ceph_encode_32(p, sizeof(__le64));
+ ceph_encode_64(p, req->r_fscrypt_file);
+ } else {
+ ceph_encode_32(p, 0);
+ }
+}
+
+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+ if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+ return 1;
+
+ if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+ return 2;
+
+ return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
+static struct ceph_mds_request_head_legacy *
+find_legacy_request_head(void *p, u64 features)
+{
+ bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+ struct ceph_mds_request_head *head;
+
+ if (legacy)
+ return (struct ceph_mds_request_head_legacy *)p;
+ head = (struct ceph_mds_request_head *)p;
+ return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
+}
+
/*
* called under mdsc->mutex
*/
-static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_mds_request *req,
- int mds)
+ bool drop_cap_releases)
{
+ int mds = session->s_mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
- struct ceph_mds_request_head *head;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- int freepath1 = 0, freepath2 = 0;
+ struct ceph_mds_request_head_legacy *lhead;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
+ struct dentry *old_dentry = NULL;
int len;
u16 releases;
void *p, *end;
int ret;
-
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
- req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1);
+ bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
+ u16 request_head_version = mds_supported_head_version(session);
+ kuid_t caller_fsuid = req->r_cred->fsuid;
+ kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
- ret = set_request_path_attr(NULL, req->r_old_dentry,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2);
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked && req->r_parent && path_info1.vino.ino &&
+ ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *old_parent = req->r_parent;
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(old_parent), path_info1.vino.ino);
+ /*
+ * Transfer CEPH_CAP_PIN from the old parent to the new one.
+ * The pin was taken earlier in ceph_mdsc_submit_request().
+ */
+ ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN);
+ iput(old_parent);
+ req->r_parent = correct_dir;
+ ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ }
+ }
+
+ /* If r_old_dentry is set, then assume that its parent is locked */
+ if (req->r_old_dentry &&
+ !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
+ old_dentry = req->r_old_dentry;
+ ret = set_request_path_attr(mdsc, NULL, old_dentry,
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
}
- len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+ req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
+ if (IS_ERR(req->r_altname)) {
+ msg = ERR_CAST(req->r_altname);
+ req->r_altname = NULL;
+ goto out_free2;
+ }
- /* calculate (max) length for cap releases */
+ /*
+ * For old cephs without supporting the 32bit retry/fwd feature
+ * it will copy the raw memories directly when decoding the
+ * requests. While new cephs will decode the head depending the
+ * version member, so we need to make sure it will be compatible
+ * with them both.
+ */
+ if (legacy)
+ len = sizeof(struct ceph_mds_request_head_legacy);
+ else if (request_head_version == 1)
+ len = offsetofend(struct ceph_mds_request_head, args);
+ else if (request_head_version == 2)
+ len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
+ else
+ len = sizeof(struct ceph_mds_request_head);
+
+ /* filepaths */
+ len += 2 * (1 + sizeof(u32) + sizeof(u64));
+ len += path_info1.pathlen + path_info2.pathlen;
+
+ /* cap releases */
len += sizeof(struct ceph_mds_request_release) *
(!!req->r_inode_drop + !!req->r_dentry_drop +
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+
if (req->r_dentry_drop)
- len += req->r_dentry->d_name.len;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += req->r_old_dentry->d_name.len;
+ len += path_info2.pathlen;
+
+ /* MClientRequest tail */
+
+ /* req->r_stamp */
+ len += sizeof(struct ceph_timespec);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+ /* gid list */
+ len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
+
+ /* alternate name */
+ len += sizeof(u32) + req->r_altname_len;
+
+ /* fscrypt_auth */
+ len += sizeof(u32); // fscrypt_auth
+ if (req->r_fscrypt_auth)
+ len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+ /* fscrypt_file */
+ len += sizeof(u32);
+ if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
+ len += sizeof(__le64);
+
+ msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -1684,18 +3122,90 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
msg->hdr.tid = cpu_to_le64(req->r_tid);
- head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(*head);
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
+
+ if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+ !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+ WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));
+
+ if (enable_unsafe_idmap) {
+ pr_warn_once_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. UID/GID-based restrictions may"
+ " not work properly.\n");
+
+ caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ } else {
+ pr_err_ratelimited_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. Fail request with -EIO.\n");
+
+ ret = -EIO;
+ goto out_err;
+ }
+ }
+
+ /*
+ * The ceph_mds_request_head_legacy didn't contain a version field, and
+ * one was added when we moved the message version from 3->4.
+ */
+ if (legacy) {
+ msg->hdr.version = cpu_to_le16(3);
+ p = msg->front.iov_base + sizeof(*lhead);
+ } else if (request_head_version == 1) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(4);
+ nhead->version = cpu_to_le16(1);
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args);
+ } else if (request_head_version == 2) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(2);
+
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
+ } else {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+ kuid_t owner_fsuid;
+ kgid_t owner_fsgid;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));
+
+ if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {
+ owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+ nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
+ } else {
+ nhead->owner_uid = cpu_to_le32(-1);
+ nhead->owner_gid = cpu_to_le32(-1);
+ }
+
+ p = msg->front.iov_base + sizeof(*nhead);
+ }
+
end = msg->front.iov_base + msg->front.iov_len;
- head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
- head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
- head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
- head->args = req->r_args;
+ lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ lhead->op = cpu_to_le32(req->r_op);
+ lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+ caller_fsuid));
+ lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+ caller_fsgid));
+ lhead->ino = cpu_to_le64(req->r_deleg_ino);
+ lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -1704,41 +3214,69 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
releases = 0;
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
- req->r_inode ? req->r_inode : req->r_dentry->d_inode,
- mds, req->r_inode_drop, req->r_inode_unless, 0);
- if (req->r_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_dentry,
- mds, req->r_dentry_drop, req->r_dentry_unless);
- if (req->r_old_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
- mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+ req->r_inode ? req->r_inode : d_inode(req->r_dentry),
+ mds, req->r_inode_drop, req->r_inode_unless,
+ req->r_op == CEPH_MDS_OP_READDIR);
+ if (req->r_dentry_drop) {
+ ret = ceph_encode_dentry_release(&p, req->r_dentry,
+ req->r_parent, mds, req->r_dentry_drop,
+ req->r_dentry_unless);
+ if (ret < 0)
+ goto out_err;
+ releases += ret;
+ }
+ if (req->r_old_dentry_drop) {
+ ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
+ req->r_old_dentry_dir, mds,
+ req->r_old_dentry_drop,
+ req->r_old_dentry_unless);
+ if (ret < 0)
+ goto out_err;
+ releases += ret;
+ }
if (req->r_old_inode_drop)
releases += ceph_encode_inode_release(&p,
- req->r_old_dentry->d_inode,
+ d_inode(req->r_old_dentry),
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
- head->num_releases = cpu_to_le16(releases);
- BUG_ON(p > end);
+ if (drop_cap_releases) {
+ releases = 0;
+ p = msg->front.iov_base + req->r_request_release_offset;
+ }
+
+ lhead->num_releases = cpu_to_le16(releases);
+
+ encode_mclientrequest_tail(&p, req);
+
+ if (WARN_ON_ONCE(p > end)) {
+ ceph_msg_put(msg);
+ msg = ERR_PTR(-ERANGE);
+ goto out_free2;
+ }
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- if (req->r_data_len) {
- /* outbound data set only by ceph_sync_setxattr() */
- BUG_ON(!req->r_pages);
- ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+ if (req->r_pagelist) {
+ struct ceph_pagelist *pagelist = req->r_pagelist;
+ ceph_msg_data_add_pagelist(msg, pagelist);
+ msg->hdr.data_len = cpu_to_le32(pagelist->length);
+ } else {
+ msg->hdr.data_len = 0;
}
- msg->hdr.data_len = cpu_to_le32(req->r_data_len);
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- kfree((char *)path2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- kfree((char *)path1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
+out_err:
+ ceph_msg_put(msg);
+ msg = ERR_PTR(ret);
+ goto out_free2;
}
/*
@@ -1748,22 +3286,46 @@ out:
static void complete_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ req->r_end_latency = ktime_get();
+
if (req->r_callback)
req->r_callback(mdsc, req);
- else
- complete_all(&req->r_completion);
+ complete_all(&req->r_completion);
}
/*
* called under mdsc->mutex
*/
-static int __prepare_send_request(struct ceph_mds_client *mdsc,
+static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_request *req,
- int mds)
+ bool drop_cap_releases)
{
- struct ceph_mds_request_head *rhead;
+ int mds = session->s_mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_mds_request_head_legacy *lhead;
+ struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
- int flags = 0;
+ int flags = 0, old_max_retry;
+ bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ &session->s_features);
+
+ /*
+ * Avoid infinite retrying after overflow. The client will
+ * increase the retry count and if the MDS is old version,
+ * so we limit to retry at most 256 times.
+ */
+ if (req->r_attempts) {
+ old_max_retry = sizeof_field(struct ceph_mds_request_head,
+ num_retry);
+ old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+ if ((old_version && req->r_attempts >= old_max_retry) ||
+ ((uint32_t)req->r_attempts >= U32_MAX)) {
+ pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
+ req->r_tid);
+ return -EMULTIHOP;
+ }
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -1775,10 +3337,12 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
else
req->r_sent_on_mseq = -1;
}
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
- req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid,
+ ceph_mds_op_name(req->r_op), req->r_attempts);
+
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ void *p;
- if (req->r_got_unsafe) {
/*
* Replay. Do not regenerate message (and rebuild
* paths, etc.); just use the original message.
@@ -1786,21 +3350,30 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
* d_move mangles the src name.
*/
msg = req->r_request;
- rhead = msg->front.iov_base;
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
- flags = le32_to_cpu(rhead->flags);
+ flags = le32_to_cpu(lhead->flags);
flags |= CEPH_MDS_FLAG_REPLAY;
- rhead->flags = cpu_to_le32(flags);
+ lhead->flags = cpu_to_le32(flags);
if (req->r_target_inode)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+ lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
- rhead->num_retry = req->r_attempts - 1;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
/* remove cap/dentry releases from message */
- rhead->num_releases = 0;
- msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
- msg->front.iov_len = req->r_request_release_offset;
+ lhead->num_releases = 0;
+
+ p = msg->front.iov_base + req->r_request_release_offset;
+ encode_mclientrequest_tail(&p, req);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
return 0;
}
@@ -1808,57 +3381,118 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
ceph_msg_put(req->r_request);
req->r_request = NULL;
}
- msg = create_request_message(mdsc, req, mds);
+ msg = create_request_message(session, req, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
- complete_request(mdsc, req);
return PTR_ERR(msg);
}
req->r_request = msg;
- rhead = msg->front.iov_base;
- rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
- if (req->r_got_unsafe)
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
+ lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
- if (req->r_locked_dir)
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+ flags |= CEPH_MDS_FLAG_ASYNC;
+ if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
- rhead->flags = cpu_to_le32(flags);
- rhead->num_fwd = req->r_num_fwd;
- rhead->num_retry = req->r_attempts - 1;
- rhead->ino = 0;
+ lhead->flags = cpu_to_le32(flags);
+ lhead->num_fwd = req->r_num_fwd;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
- dout(" r_locked_dir = %p\n", req->r_locked_dir);
+ doutc(cl, " r_parent = %p\n", req->r_parent);
return 0;
}
/*
+ * called under mdsc->mutex
+ */
+static int __send_request(struct ceph_mds_session *session,
+ struct ceph_mds_request *req,
+ bool drop_cap_releases)
+{
+ int err;
+
+ err = __prepare_send_request(session, req, drop_cap_releases);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+ return err;
+}
+
+/*
* send request, or put it on the appropriate wait list.
*/
-static int __do_request(struct ceph_mds_client *mdsc,
+static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session = NULL;
int mds = -1;
- int err = -EAGAIN;
+ int err = 0;
+ bool random;
- if (req->r_err || req->r_got_result)
- goto out;
+ if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
+ __unregister_request(mdsc, req);
+ return;
+ }
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
+ doutc(cl, "metadata corrupted\n");
+ err = -EIO;
+ goto finish;
+ }
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
- dout("do_request timed out\n");
+ doutc(cl, "timed out\n");
+ err = -ETIMEDOUT;
+ goto finish;
+ }
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ doutc(cl, "forced umount\n");
err = -EIO;
goto finish;
}
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ doutc(cl, "mdsmap err %d\n", err);
+ goto finish;
+ }
+ if (mdsc->mdsmap->m_epoch == 0) {
+ doutc(cl, "no mdsmap, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ return;
+ }
+ if (!(mdsc->fsc->mount_options->flags &
+ CEPH_MOUNT_OPT_MOUNTWAIT) &&
+ !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
+ err = -EHOSTUNREACH;
+ goto finish;
+ }
+ }
put_request_session(req);
- mds = __choose_mds(mdsc, req);
+ mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
- dout("do_request no mds or not active, waiting for map\n");
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto finish;
+ }
+ doutc(cl, "no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
- goto out;
+ return;
}
/* get, open session */
@@ -1870,15 +3504,54 @@ static int __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
}
- req->r_session = get_session(session);
+ req->r_session = ceph_get_mds_session(session);
+
+ doutc(cl, "mds%d session %p state %s\n", mds, session,
+ ceph_session_state_name(session->s_state));
+
+ /*
+ * The old ceph will crash the MDSs when see unknown OPs
+ */
+ if (req->r_feature_needed > 0 &&
+ !test_bit(req->r_feature_needed, &session->s_features)) {
+ err = -EOPNOTSUPP;
+ goto out_session;
+ }
- dout("do_request mds%d session %p state %s\n", mds, session,
- session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
session->s_state != CEPH_MDS_SESSION_HUNG) {
+ /*
+ * We cannot queue async requests since the caps and delegated
+ * inodes are bound to the session. Just return -EJUKEBOX and
+ * let the caller retry a sync request in that case.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+ err = -EJUKEBOX;
+ goto out_session;
+ }
+
+ /*
+ * If the session has been REJECTED, then return a hard error,
+ * unless it's a CLEANRECOVER mount, in which case we'll queue
+ * it to the mdsc queue.
+ */
+ if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
+ if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ else
+ err = -EACCES;
+ goto out_session;
+ }
+
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ err = __open_session(mdsc, session);
+ if (err)
+ goto out_session;
+ /* retry the same mds later */
+ if (random)
+ req->r_resend_mds = mds;
+ }
list_add(&req->r_wait, &session->s_waiting);
goto out_session;
}
@@ -1889,21 +3562,76 @@ static int __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
- err = __prepare_send_request(mdsc, req, mds);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
+ /*
+ * For async create we will choose the auth MDS of frag in parent
+ * directory to send the request and usually this works fine, but
+ * if the migrated the dirtory to another MDS before it could handle
+ * it the request will be forwarded.
+ *
+ * And then the auth cap will be changed.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
+ struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ /*
+ * The request maybe handled very fast and the new inode
+ * hasn't been linked to the dentry yet. We need to wait
+ * for the ceph_finish_async_create(), which shouldn't be
+ * stuck too long or fail in thoery, to finish when forwarding
+ * the request.
+ */
+ if (!d_inode(req->r_dentry)) {
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+ if (err) {
+ mutex_lock(&req->r_fill_mutex);
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ goto out_session;
+ }
+ }
+
+ ci = ceph_inode(d_inode(req->r_dentry));
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
+ doutc(cl, "session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
+
+ /* Remove the auth cap from old session */
+ spin_lock(&cap->session->s_cap_lock);
+ cap->session->s_nr_caps--;
+ list_del_init(&cap->session_caps);
+ spin_unlock(&cap->session->s_cap_lock);
+
+ /* Add the auth cap to the new session */
+ cap->mds = mds;
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ session->s_nr_caps++;
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ change_auth_cap_ses(ci, session);
+ }
+ spin_unlock(&ci->i_ceph_lock);
}
+ err = __send_request(session, req, false);
+
out_session:
ceph_put_mds_session(session);
-out:
- return err;
-
finish:
- req->r_err = err;
- complete_request(mdsc, req);
- goto out;
+ if (err) {
+ doutc(cl, "early error %d\n", err);
+ req->r_err = err;
+ complete_request(mdsc, req);
+ __unregister_request(mdsc, req);
+ }
+ return;
}
/*
@@ -1912,6 +3640,7 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
LIST_HEAD(tmp_list);
@@ -1921,7 +3650,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
req = list_entry(tmp_list.next,
struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
- dout(" wake request %p tid %llu\n", req, req->r_tid);
+ doutc(cl, " wake request %p tid %llu\n", req,
+ req->r_tid);
__do_request(mdsc, req);
}
}
@@ -1932,84 +3662,104 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
*/
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
- struct rb_node *p;
+ struct rb_node *p = rb_first(&mdsc->request_tree);
- dout("kick_requests mds%d\n", mds);
- for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+ doutc(cl, "kick_requests mds%d\n", mds);
+ while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
- if (req->r_got_unsafe)
+ p = rb_next(p);
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
continue;
+ if (req->r_attempts > 0)
+ continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
- dout(" kicking tid %llu\n", req->r_tid);
+ doutc(cl, " kicking tid %llu\n", req->r_tid);
+ list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
}
}
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
- dout("submit_request on %p\n", req);
- mutex_lock(&mdsc->mutex);
- __register_request(mdsc, req, NULL);
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchrously perform an mds request. Take care of all of the
- * session setup, forwarding, retry details.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
- struct inode *dir,
- struct ceph_mds_request *req)
-{
- int err;
-
- dout("do_request on %p\n", req);
+ struct ceph_client *cl = mdsc->fsc->client;
+ int err = 0;
- /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+ /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_locked_dir)
- ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
+ if (req->r_parent) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+ int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+ CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+ spin_lock(&ci->i_ceph_lock);
+ ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+ __ceph_touch_fmode(ci, mdsc, fmode);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- /* issue */
+ if (req->r_inode) {
+ err = ceph_wait_on_async_create(req->r_inode);
+ if (err) {
+ doutc(cl, "wait for async create returned: %d\n", err);
+ return err;
+ }
+ }
+
+ if (!err && req->r_old_inode) {
+ err = ceph_wait_on_async_create(req->r_old_inode);
+ if (err) {
+ doutc(cl, "wait for async create returned: %d\n", err);
+ return err;
+ }
+ }
+
+ doutc(cl, "submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
__do_request(mdsc, req);
+ err = req->r_err;
+ mutex_unlock(&mdsc->mutex);
+ return err;
+}
- if (req->r_err) {
- err = req->r_err;
- __unregister_request(mdsc, req);
- dout("do_request early error %d\n", err);
- goto out;
- }
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int err;
/* wait */
- mutex_unlock(&mdsc->mutex);
- dout("do_request waiting\n");
- if (req->r_timeout) {
- err = (long)wait_for_completion_killable_timeout(
- &req->r_completion, req->r_timeout);
- if (err == 0)
- err = -EIO;
+ doutc(cl, "do_request waiting\n");
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
- err = wait_for_completion_killable(&req->r_completion);
+ long timeleft = wait_for_completion_killable_timeout(
+ &req->r_completion,
+ ceph_timeout_jiffies(req->r_timeout));
+ if (timeleft > 0)
+ err = 0;
+ else if (!timeleft)
+ err = -ETIMEDOUT; /* timed out */
+ else
+ err = timeleft; /* killed */
}
- dout("do_request waited, got %d\n", err);
+ doutc(cl, "do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */
- if (req->r_got_result) {
+ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) {
- dout("aborted request %lld with %d\n", req->r_tid, err);
+ doutc(cl, "aborted request %lld with %d\n", req->r_tid, err);
/*
* ensure we aren't running concurrently with
@@ -2018,19 +3768,38 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
*/
mutex_lock(&req->r_fill_mutex);
req->r_err = err;
- req->r_aborted = true;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
- if (req->r_locked_dir &&
+ if (req->r_parent &&
(req->r_op & CEPH_MDS_OP_WRITE))
ceph_invalidate_dir_request(req);
} else {
err = req->r_err;
}
-out:
mutex_unlock(&mdsc->mutex);
- dout("do_request %p done, result %d\n", req, err);
+ return err;
+}
+
+/*
+ * Synchrously perform an mds request. Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ int err;
+
+ doutc(cl, "do_request on %p\n", req);
+
+ /* issue */
+ err = ceph_mdsc_submit_request(mdsc, dir, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
+ doutc(cl, "do_request %p done, result %d\n", req, err);
return err;
}
@@ -2040,11 +3809,16 @@ out:
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
- struct inode *inode = req->r_locked_dir;
+ struct inode *dir = req->r_parent;
+ struct inode *old_dir = req->r_old_dentry_dir;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
- dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
+ doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n",
+ dir, old_dir);
- ceph_dir_clear_complete(inode);
+ ceph_dir_clear_complete(dir);
+ if (old_dir)
+ ceph_dir_clear_complete(old_dir);
if (req->r_dentry)
ceph_invalidate_dentry_lease(req->r_dentry);
if (req->r_old_dentry)
@@ -2061,15 +3835,18 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ struct ceph_snap_realm *realm;
u64 tid;
int err, result;
int mds = session->s_mds;
+ bool close_sessions = false;
if (msg->front.iov_len < sizeof(*head)) {
- pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ pr_err_client(cl, "got corrupt (short) reply\n");
ceph_msg_dump(msg);
return;
}
@@ -2077,84 +3854,49 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* get request, session */
tid = le64_to_cpu(msg->hdr.tid);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
- dout("handle_reply on unknown tid %llu\n", tid);
+ doutc(cl, "on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
return;
}
- dout("handle_reply %p\n", req);
+ doutc(cl, "handle_reply %p\n", req);
/* correct session? */
if (req->r_session != session) {
- pr_err("mdsc_handle_reply got %llu on session mds%d"
- " not mds%d\n", tid, session->s_mds,
- req->r_session ? req->r_session->s_mds : -1);
+ pr_err_client(cl, "got %llu on session mds%d not mds%d\n",
+ tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
mutex_unlock(&mdsc->mutex);
goto out;
}
/* dup? */
- if ((req->r_got_unsafe && !head->safe) ||
- (req->r_got_safe && head->safe)) {
- pr_warning("got a dup %s reply on %llu from mds%d\n",
- head->safe ? "safe" : "unsafe", tid, mds);
+ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
+ (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
+ pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
- if (req->r_got_safe && !head->safe) {
- pr_warning("got unsafe after safe on %llu from mds%d\n",
- tid, mds);
+ if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
+ pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu", req->r_tid);
- if (!req->r_inode) {
- /* do nothing; not an authority problem */
- } else if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap = NULL;
-
- if (req->r_session)
- cap = ceph_get_cap_for_mds(ci,
- req->r_session->s_mds);
-
- dout("already using auth");
- if ((!cap || cap != ci->i_auth_cap) ||
- (cap->mseq != req->r_sent_on_mseq)) {
- dout("but cap changed, so resending");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu", req->r_tid);
- }
-
-
if (head->safe) {
- req->r_got_safe = true;
+ set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
- complete_all(&req->r_safe_completion);
- if (req->r_got_unsafe) {
+ /* last request during umount? */
+ if (mdsc->stopping && !__get_oldest_req(mdsc))
+ complete_all(&mdsc->safe_umount_waiters);
+
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
/*
* We already handled the unsafe response, now do the
* cleanup. No need to examine the response; the MDS
@@ -2162,38 +3904,77 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* response. And even if it did, there is nothing
* useful we could do with a revised return value.
*/
- dout("got safe reply %llu, mds%d\n", tid, mds);
- list_del_init(&req->r_unsafe_item);
+ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds);
- /* last unsafe request during umount? */
- if (mdsc->stopping && !__get_oldest_req(mdsc))
- complete_all(&mdsc->safe_umount_waiters);
mutex_unlock(&mdsc->mutex);
goto out;
}
} else {
- req->r_got_unsafe = true;
+ set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
}
- dout("handle_reply tid %lld result %d\n", tid, result);
- rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+ doutc(cl, "tid %lld result %d\n", tid, result);
+ if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
+ err = parse_reply_info(session, msg, req, (u64)-1);
+ else
+ err = parse_reply_info(session, msg, req,
+ session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
+ /* Must find target inode outside of mutexes to avoid deadlocks */
+ rinfo = &req->r_reply_info;
+ if ((err >= 0) && rinfo->head->is_target) {
+ struct inode *in = xchg(&req->r_new_inode, NULL);
+ struct ceph_vino tvino = {
+ .ino = le64_to_cpu(rinfo->targeti.in->ino),
+ .snap = le64_to_cpu(rinfo->targeti.in->snapid)
+ };
+
+ /*
+ * If we ended up opening an existing inode, discard
+ * r_new_inode
+ */
+ if (req->r_op == CEPH_MDS_OP_CREATE &&
+ !req->r_reply_info.has_create_ino) {
+ /* This should never happen on an async create */
+ WARN_ON_ONCE(req->r_deleg_ino);
+ iput(in);
+ in = NULL;
+ }
+
+ in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ mutex_lock(&session->s_mutex);
+ goto out_err;
+ }
+ req->r_target_inode = in;
+ }
+
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+ pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
+ mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
/* snap trace */
+ realm = NULL;
if (rinfo->snapblob_len) {
down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, rinfo->snapblob,
- rinfo->snapblob + rinfo->snapblob_len,
- le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+ &realm);
+ if (err) {
+ up_write(&mdsc->snap_rwsem);
+ close_sessions = true;
+ if (err == -EIO)
+ ceph_msg_dump(msg);
+ goto out_err;
+ }
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
@@ -2201,39 +3982,60 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
- err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
+ current->journal_info = req;
+ err = ceph_fill_trace(mdsc->fsc->sb, req);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
- req->r_op == CEPH_MDS_OP_LSSNAP) &&
- rinfo->dir_nr)
- ceph_readdir_prepopulate(req, req->r_session);
- ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
+ req->r_op == CEPH_MDS_OP_LSSNAP))
+ err = ceph_readdir_prepopulate(req, req->r_session);
}
+ current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0) {
+ if (req->r_target_inode &&
+ test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item,
+ &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
+ }
out_err:
mutex_lock(&mdsc->mutex);
- if (!req->r_aborted) {
+ if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
if (err) {
req->r_err = err;
} else {
- req->r_reply = msg;
- ceph_msg_get(msg);
- req->r_got_result = true;
+ req->r_reply = ceph_msg_get(msg);
+ set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
}
} else {
- dout("reply arrived after request %lld was aborted\n", tid);
+ doutc(cl, "reply arrived after request %lld was aborted\n", tid);
}
mutex_unlock(&mdsc->mutex);
- ceph_add_cap_releases(mdsc, req->r_session);
mutex_unlock(&session->s_mutex);
/* kick calling process */
complete_request(mdsc, req);
+
+ ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
+ req->r_end_latency, err);
out:
ceph_mdsc_put_request(req);
+
+ /* Defer closing the sessions after s_mutex lock being released */
+ if (close_sessions)
+ ceph_mdsc_close_sessions(mdsc);
return;
}
@@ -2246,6 +4048,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 next_mds;
@@ -2253,41 +4056,89 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
fwd_seq = ceph_decode_32(&p);
mutex_lock(&mdsc->mutex);
- req = __lookup_request(mdsc, tid);
+ req = lookup_get_request(mdsc, tid);
if (!req) {
- dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
- goto out; /* dup reply? */
+ mutex_unlock(&mdsc->mutex);
+ doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds);
+ return; /* dup reply? */
}
- if (req->r_aborted) {
- dout("forward tid %llu aborted, unregistering\n", tid);
+ if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
- } else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
+ /*
+ * Avoid infinite retrying after overflow.
+ *
+ * The MDS will increase the fwd count and in client side
+ * if the num_fwd is less than the one saved in request
+ * that means the MDS is an old version and overflowed of
+ * 8 bits.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n",
+ tid);
} else {
/* resend. forward race not possible; mds would drop */
- dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
- BUG_ON(req->r_got_result);
+ BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
+ req->r_attempts = 0;
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
-out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad:
- pr_err("mdsc_handle_forward decode error err=%d\n", err);
+ pr_err_client(cl, "decode error err=%d\n", err);
+ ceph_msg_dump(msg);
+}
+
+static int __decode_session_metadata(void **p, void *end,
+ bool *blocklisted)
+{
+ /* map<string,string> */
+ u32 n;
+ bool err_str;
+ ceph_decode_32_safe(p, end, n, bad);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ err_str = !strncmp(*p, "error_string", len);
+ *p += len;
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+ /*
+ * Match "blocklisted (blacklisted)" from newer MDSes,
+ * or "blacklisted" from older MDSes.
+ */
+ if (err_str && strnstr(*p, "blacklisted", len))
+ *blocklisted = true;
+ *p += len;
+ }
+ return 0;
+bad:
+ return -1;
}
/*
@@ -2297,42 +4148,203 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
- u32 op;
- u64 seq;
+ struct ceph_client *cl = mdsc->fsc->client;
int mds = session->s_mds;
- struct ceph_mds_session_head *h = msg->front.iov_base;
+ int msg_version = le16_to_cpu(msg->hdr.version);
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mds_session_head *h;
+ struct ceph_mds_cap_auth *cap_auths = NULL;
+ u32 op, cap_auths_num = 0;
+ u64 seq, features = 0;
int wake = 0;
+ bool blocklisted = false;
+ u32 i;
+
/* decode */
- if (msg->front.iov_len != sizeof(*h))
- goto bad;
+ ceph_decode_need(&p, end, sizeof(*h), bad);
+ h = p;
+ p += sizeof(*h);
+
op = le32_to_cpu(h->op);
seq = le64_to_cpu(h->seq);
+ if (msg_version >= 3) {
+ u32 len;
+ /* version >= 2 and < 5, decode metadata, skip otherwise
+ * as it's handled via flags.
+ */
+ if (msg_version >= 5)
+ ceph_decode_skip_map(&p, end, string, string, bad);
+ else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
+ goto bad;
+
+ /* version >= 3, feature bits */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len) {
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
+ }
+ }
+
+ if (msg_version >= 5) {
+ u32 flags, len;
+
+ /* version >= 4 */
+ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+ ceph_decode_32_safe(&p, end, len, bad); /* len */
+ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
+ /* version >= 5, flags */
+ ceph_decode_32_safe(&p, end, flags, bad);
+ if (flags & CEPH_SESSION_BLOCKLISTED) {
+ pr_warn_client(cl, "mds%d session blocklisted\n",
+ session->s_mds);
+ blocklisted = true;
+ }
+ }
+
+ if (msg_version >= 6) {
+ ceph_decode_32_safe(&p, end, cap_auths_num, bad);
+ doutc(cl, "cap_auths_num %d\n", cap_auths_num);
+
+ if (cap_auths_num && op != CEPH_SESSION_OPEN) {
+ WARN_ON_ONCE(op != CEPH_SESSION_OPEN);
+ goto skip_cap_auths;
+ }
+
+ cap_auths = kcalloc(cap_auths_num,
+ sizeof(struct ceph_mds_cap_auth),
+ GFP_KERNEL);
+ if (!cap_auths) {
+ pr_err_client(cl, "No memory for cap_auths\n");
+ return;
+ }
+
+ for (i = 0; i < cap_auths_num; i++) {
+ u32 _len, j;
+
+ /* struct_v, struct_compat, and struct_len in MDSCapAuth */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);
+
+ /* struct_v, struct_compat, and struct_len in MDSCapMatch */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad);
+ ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad);
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.gids = kcalloc(_len, sizeof(u32),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.gids) {
+ pr_err_client(cl, "No memory for gids\n");
+ goto fail;
+ }
+
+ cap_auths[i].match.num_gids = _len;
+ for (j = 0; j < _len; j++)
+ ceph_decode_32_safe(&p, end,
+ cap_auths[i].match.gids[j],
+ bad);
+ }
+
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.path) {
+ pr_err_client(cl, "No memory for path\n");
+ goto fail;
+ }
+ ceph_decode_copy(&p, cap_auths[i].match.path, _len);
+
+ /* Remove the tailing '/' */
+ while (_len && cap_auths[i].match.path[_len - 1] == '/') {
+ cap_auths[i].match.path[_len - 1] = '\0';
+ _len -= 1;
+ }
+ }
+
+ ceph_decode_32_safe(&p, end, _len, bad);
+ if (_len) {
+ cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char),
+ GFP_KERNEL);
+ if (!cap_auths[i].match.fs_name) {
+ pr_err_client(cl, "No memory for fs_name\n");
+ goto fail;
+ }
+ ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len);
+ }
+
+ ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad);
+ ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad);
+ ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad);
+ doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n",
+ cap_auths[i].match.uid, cap_auths[i].match.num_gids,
+ cap_auths[i].match.path, cap_auths[i].match.fs_name,
+ cap_auths[i].match.root_squash,
+ cap_auths[i].readable, cap_auths[i].writeable);
+ }
+ }
+
+skip_cap_auths:
mutex_lock(&mdsc->mutex);
- if (op == CEPH_SESSION_CLOSE)
+ if (op == CEPH_SESSION_OPEN) {
+ if (mdsc->s_cap_auths) {
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ kfree(mdsc->s_cap_auths[i].match.gids);
+ kfree(mdsc->s_cap_auths[i].match.path);
+ kfree(mdsc->s_cap_auths[i].match.fs_name);
+ }
+ kfree(mdsc->s_cap_auths);
+ }
+ mdsc->s_cap_auths_num = cap_auths_num;
+ mdsc->s_cap_auths = cap_auths;
+ }
+ if (op == CEPH_SESSION_CLOSE) {
+ ceph_get_mds_session(session);
__unregister_session(mdsc, session);
+ }
/* FIXME: this ttl calculation is generous */
session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- dout("handle_session mds%d %s %p state %s seq %llu\n",
- mds, ceph_session_op_name(op), session,
- session_state_name(session->s_state), seq);
+ doutc(cl, "mds%d %s %p state %s seq %llu\n", mds,
+ ceph_session_op_name(op), session,
+ ceph_session_state_name(session->s_state), seq);
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
- pr_info("mds%d came back\n", session->s_mds);
+ pr_info_client(cl, "mds%d came back\n", session->s_mds);
}
switch (op) {
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect success\n", session->s_mds);
- session->s_state = CEPH_MDS_SESSION_OPEN;
- renewed_caps(mdsc, session, 0);
+ pr_info_client(cl, "mds%d reconnect success\n",
+ session->s_mds);
+
+ session->s_features = features;
+ if (session->s_state == CEPH_MDS_SESSION_OPEN) {
+ pr_notice_client(cl, "mds%d is already opened\n",
+ session->s_mds);
+ } else {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
+ &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
+ }
+
+ /*
+ * The connection maybe broken and the session in client
+ * side has been reinitialized, need to update the seq
+ * anyway.
+ */
+ if (!session->s_seq && seq)
+ session->s_seq = seq;
+
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -2345,29 +4357,59 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect denied\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect denied\n",
+ session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_CLOSED;
+ cleanup_session_requests(mdsc, session);
remove_session_caps(session);
- wake = 1; /* for good measure */
+ wake = 2; /* for good measure */
wake_up_all(&mdsc->session_close_wq);
- kick_requests(mdsc, mds);
break;
case CEPH_SESSION_STALE:
- pr_info("mds%d caps went stale, renewing\n",
- session->s_mds);
- spin_lock(&session->s_gen_ttl_lock);
- session->s_cap_gen++;
+ pr_info_client(cl, "mds%d caps went stale, renewing\n",
+ session->s_mds);
+ atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
- spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
case CEPH_SESSION_RECALL_STATE:
- trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+ ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+ break;
+
+ case CEPH_SESSION_FLUSHMSG:
+ /* flush cap releases */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_num_cap_releases)
+ ceph_flush_session_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
+
+ send_flushmsg_ack(mdsc, session, seq);
+ break;
+
+ case CEPH_SESSION_FORCE_RO:
+ doutc(cl, "force_session_readonly %p\n", session);
+ spin_lock(&session->s_cap_lock);
+ session->s_readonly = true;
+ spin_unlock(&session->s_cap_lock);
+ wake_up_session_caps(session, FORCE_RO);
+ break;
+
+ case CEPH_SESSION_REJECT:
+ WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_REJECTED;
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ if (blocklisted)
+ mdsc->fsc->blocklisted = true;
+ wake = 2; /* for good measure */
break;
default:
- pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ pr_err_client(cl, "bad op %d mds%d\n", op, mds);
WARN_ON(1);
}
@@ -2375,17 +4417,51 @@ static void handle_session(struct ceph_mds_session *session,
if (wake) {
mutex_lock(&mdsc->mutex);
__wake_requests(mdsc, &session->s_waiting);
+ if (wake == 2)
+ kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex);
}
+ if (op == CEPH_SESSION_CLOSE)
+ ceph_put_mds_session(session);
return;
bad:
- pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
- (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
ceph_msg_dump(msg);
+fail:
+ for (i = 0; i < cap_auths_num; i++) {
+ kfree(cap_auths[i].match.gids);
+ kfree(cap_auths[i].match.path);
+ kfree(cap_auths[i].match.fs_name);
+ }
+ kfree(cap_auths);
return;
}
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+ }
+}
+
+void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req)
+{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+ int dcaps;
+
+ dcaps = xchg(&req->r_dir_caps, 0);
+ if (dcaps) {
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps);
+ }
+}
/*
* called under session->mutex.
@@ -2394,134 +4470,394 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_request *req, *nreq;
- int err;
+ struct rb_node *p;
- dout("replay_unsafe_requests mds%d\n", session->s_mds);
+ doutc(mdsc->fsc->client, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
- list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
- err = __prepare_send_request(mdsc, req, session->s_mds);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
+ __send_request(session, req, true);
+
+ /*
+ * also re-send old requests when MDS enters reconnect stage. So that MDS
+ * can process completed request in clientreplay stage.
+ */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
+ continue;
+ if (req->r_attempts == 0)
+ continue; /* only old requests */
+ if (!req->r_session)
+ continue;
+ if (req->r_session->s_mds != session->s_mds)
+ continue;
+
+ ceph_mdsc_release_dir_caps_async(req);
+
+ __send_request(session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
+static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
+{
+ struct ceph_msg *reply;
+ struct ceph_pagelist *_pagelist;
+ struct page *page;
+ __le32 *addr;
+ int err = -ENOMEM;
+
+ if (!recon_state->allow_multi)
+ return -ENOSPC;
+
+ /* can't handle message that contains both caps and realm */
+ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
+
+ /* pre-allocate new pagelist */
+ _pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!_pagelist)
+ return -ENOMEM;
+
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
+ if (!reply)
+ goto fail_msg;
+
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
+ if (err)
+ goto fail;
+ } else {
+ /* placeholder for nr_realms (currently encoding relams) */
+ err = ceph_pagelist_encode_32(_pagelist, 0);
+ if (err < 0)
+ goto fail;
+ }
+
+ err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
+ if (err)
+ goto fail;
+
+ page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
+ addr = kmap_atomic(page);
+ if (recon_state->nr_caps) {
+ /* currently encoding caps */
+ *addr = cpu_to_le32(recon_state->nr_caps);
+ } else {
+ /* currently encoding relams */
+ *(addr + 1) = cpu_to_le32(recon_state->nr_realms);
+ }
+ kunmap_atomic(addr);
+
+ reply->hdr.version = cpu_to_le16(5);
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
+
+ ceph_con_send(&recon_state->session->s_con, reply);
+ ceph_pagelist_release(recon_state->pagelist);
+
+ recon_state->pagelist = _pagelist;
+ recon_state->nr_caps = 0;
+ recon_state->nr_realms = 0;
+ recon_state->msg_version = 5;
+ return 0;
+fail:
+ ceph_msg_put(reply);
+fail_msg:
+ ceph_pagelist_release(_pagelist);
+ return err;
+}
+
+static struct dentry* d_find_primary(struct inode *inode)
+{
+ struct dentry *alias, *dn = NULL;
+
+ if (hlist_empty(&inode->i_dentry))
+ return NULL;
+
+ spin_lock(&inode->i_lock);
+ if (hlist_empty(&inode->i_dentry))
+ goto out_unlock;
+
+ if (S_ISDIR(inode->i_mode)) {
+ alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ if (!IS_ROOT(alias))
+ dn = dget(alias);
+ goto out_unlock;
+ }
+
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ if (!d_unhashed(alias) &&
+ (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+ dn = dget_dlock(alias);
+ }
+ spin_unlock(&alias->d_lock);
+ if (dn)
+ break;
+ }
+out_unlock:
+ spin_unlock(&inode->i_lock);
+ return dn;
+}
+
/*
* Encode information about a cap for a reconnect with the MDS.
*/
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
union {
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- size_t reclen;
- struct ceph_inode_info *ci;
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
- char *path;
- int pathlen, err;
- u64 pathbase;
struct dentry *dentry;
+ struct ceph_cap *cap;
+ struct ceph_path_info path_info = {0};
+ int err;
+ u64 snap_follows;
- ci = cap->ci;
-
- dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
- inode, ceph_vinop(inode), cap, cap->cap_id,
- ceph_cap_string(cap->issued));
- err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- if (err)
- return err;
-
- dentry = d_find_alias(inode);
+ dentry = d_find_primary(inode);
if (dentry) {
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+ /* set pathbase to parent dir when msg_version >= 2 */
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
+ recon_state->msg_version >= 2);
+ dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- goto out_dput;
+ goto out_err;
}
- } else {
- path = NULL;
- pathlen = 0;
}
- err = ceph_pagelist_encode_string(pagelist, path, pathlen);
- if (err)
- goto out_free;
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ err = 0;
+ goto out_err;
+ }
+ doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode,
+ ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
+
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
+ cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
+
+ /* These are lost when the session goes away */
+ if (S_ISDIR(inode->i_mode)) {
+ if (cap->issued & CEPH_CAP_DIR_CREATE) {
+ ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+ memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+ }
+ cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+ }
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
- rec.v2.flock_len = 0;
- reclen = sizeof(rec.v2);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
+ rec.v2.flock_len = (__force __le32)
+ ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
+ struct timespec64 ts;
+
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
- rec.v1.size = cpu_to_le64(inode->i_size);
- ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
- ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
+ rec.v1.size = cpu_to_le64(i_size_read(inode));
+ ts = inode_get_mtime(inode);
+ ceph_encode_timespec64(&rec.v1.mtime, &ts);
+ ts = inode_get_atime(inode);
+ ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
- reclen = sizeof(rec.v1);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
}
spin_unlock(&ci->i_ceph_lock);
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
- struct ceph_filelock *flocks;
+ struct ceph_filelock *flocks = NULL;
+ size_t struct_len, total_len = sizeof(u64);
+ u8 struct_v = 0;
encode_again:
- spin_lock(&inode->i_lock);
- ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
- spin_unlock(&inode->i_lock);
- flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock), GFP_NOFS);
- if (!flocks) {
- err = -ENOMEM;
- goto out_free;
- }
- spin_lock(&inode->i_lock);
- err = ceph_encode_locks_to_buffer(inode, flocks,
- num_fcntl_locks,
- num_flock_locks);
- spin_unlock(&inode->i_lock);
- if (err) {
+ if (rec.v2.flock_len) {
+ ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+ } else {
+ num_fcntl_locks = 0;
+ num_flock_locks = 0;
+ }
+ if (num_fcntl_locks + num_flock_locks > 0) {
+ flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
+ sizeof(struct ceph_filelock),
+ GFP_NOFS);
+ if (!flocks) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+ err = ceph_encode_locks_to_buffer(inode, flocks,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (err) {
+ kfree(flocks);
+ flocks = NULL;
+ if (err == -ENOSPC)
+ goto encode_again;
+ goto out_err;
+ }
+ } else {
kfree(flocks);
- if (err == -ENOSPC)
- goto encode_again;
- goto out_free;
+ flocks = NULL;
+ }
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len += 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
}
/*
* number of encoded locks is stable, so copy to pagelist
*/
- rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+
+ if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto out_freeflocks;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, total_len);
+ if (err)
+ goto out_freeflocks;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks, num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+out_freeflocks:
kfree(flocks);
} else {
- err = ceph_pagelist_append(pagelist, &rec, reclen);
+ err = ceph_pagelist_reserve(pagelist,
+ sizeof(u64) + sizeof(u32) +
+ path_info.pathlen + sizeof(rec.v1));
+ if (err)
+ goto out_err;
+
+ ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
-out_free:
- kfree(path);
-out_dput:
- dput(dentry);
+
+out_err:
+ ceph_mdsc_free_path_info(&path_info);
+ if (!err)
+ recon_state->nr_caps++;
+ return err;
+}
+
+static int encode_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_reconnect_state *recon_state)
+{
+ struct rb_node *p;
+ struct ceph_pagelist *pagelist = recon_state->pagelist;
+ struct ceph_client *cl = mdsc->fsc->client;
+ int err = 0;
+
+ if (recon_state->msg_version >= 4) {
+ err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
+ if (err < 0)
+ goto fail;
+ }
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ if (recon_state->msg_version >= 4) {
+ size_t need = sizeof(u8) * 2 + sizeof(u32) +
+ sizeof(sr_rec);
+
+ if (pagelist->length + need > RECONNECT_MAX_SIZE) {
+ err = send_reconnect_partial(recon_state);
+ if (err)
+ goto fail;
+ pagelist = recon_state->pagelist;
+ }
+
+ err = ceph_pagelist_reserve(pagelist, need);
+ if (err)
+ goto fail;
+
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
+ }
+
+ doutc(cl, " adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+
+ recon_state->nr_realms++;
+ }
+fail:
return err;
}
@@ -2535,34 +4871,58 @@ out_dput:
* recovering MDS might have.
*
* This is a relatively heavyweight operation, but it's rare.
- *
- * called with mdsc->mutex held.
*/
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
- struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
- struct ceph_pagelist *pagelist;
- struct ceph_reconnect_state recon_state;
+ struct ceph_reconnect_state recon_state = {
+ .session = session,
+ };
+ LIST_HEAD(dispose);
- pr_info("mds%d reconnect start\n", mds);
+ pr_info_client(cl, "mds%d reconnect start\n", mds);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
- if (!pagelist)
+ recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!recon_state.pagelist)
goto fail_nopagelist;
- ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
+ xa_destroy(&session->s_delegated_inos);
+
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
+ doutc(cl, "session %p state %s\n", session,
+ ceph_session_state_name(session->s_state));
+
+ atomic_inc(&session->s_cap_gen);
+
+ spin_lock(&session->s_cap_lock);
+ /* don't know if session is readonly */
+ session->s_readonly = 0;
+ /*
+ * notify __ceph_remove_cap() that we are composing cap reconnect.
+ * If a cap get released before being added to the cap reconnect,
+ * __ceph_remove_cap() should skip queuing cap release.
+ */
+ session->s_cap_reconnect = 1;
+ /* drop old cap expires; we're about to reestablish that state */
+ detach_cap_releases(session, &dispose);
+ spin_unlock(&session->s_cap_lock);
+ dispose_cap_releases(mdsc, &dispose);
+
+ /* trim unused caps to reduce MDS's cache rejoin time */
+ if (mdsc->fsc->sb->s_root)
+ shrink_dcache_parent(mdsc->fsc->sb->s_root);
+
ceph_con_close(&session->s_con);
ceph_con_open(&session->s_con,
CEPH_ENTITY_TYPE_MDS, mds,
@@ -2571,52 +4931,91 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
/* replay unsafe requests */
replay_unsafe_requests(mdsc, session);
- down_read(&mdsc->snap_rwsem);
-
- dout("session %p state %s\n", session,
- session_state_name(session->s_state));
+ ceph_early_kick_flushing_caps(mdsc, session);
- /* drop old cap expires; we're about to reestablish that state */
- discard_cap_releases(mdsc, session);
+ down_read(&mdsc->snap_rwsem);
- /* traverse this session's caps */
- err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+ /* placeholder for nr_caps */
+ err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
if (err)
goto fail;
- recon_state.pagelist = pagelist;
- recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
- err = iterate_session_caps(session, encode_caps_cb, &recon_state);
+ if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
+ recon_state.msg_version = 3;
+ recon_state.allow_multi = true;
+ } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
+ recon_state.msg_version = 3;
+ } else {
+ recon_state.msg_version = 2;
+ }
+ /* traverse this session's caps */
+ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
+
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_reconnect = 0;
+ spin_unlock(&session->s_cap_lock);
+
if (err < 0)
goto fail;
- /*
- * snaprealms. we provide mds with the ino, seq (version), and
- * parent for all of our realms. If the mds has any newer info,
- * it will tell us.
- */
- for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
- struct ceph_snap_realm *realm =
- rb_entry(p, struct ceph_snap_realm, node);
- struct ceph_mds_snaprealm_reconnect sr_rec;
+ /* check if all realms can be encoded into current message */
+ if (mdsc->num_snap_realms) {
+ size_t total_len =
+ recon_state.pagelist->length +
+ mdsc->num_snap_realms *
+ sizeof(struct ceph_mds_snaprealm_reconnect);
+ if (recon_state.msg_version >= 4) {
+ /* number of realms */
+ total_len += sizeof(u32);
+ /* version, compat_version and struct_len */
+ total_len += mdsc->num_snap_realms *
+ (2 * sizeof(u8) + sizeof(u32));
+ }
+ if (total_len > RECONNECT_MAX_SIZE) {
+ if (!recon_state.allow_multi) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ if (recon_state.nr_caps) {
+ err = send_reconnect_partial(&recon_state);
+ if (err)
+ goto fail;
+ }
+ recon_state.msg_version = 5;
+ }
+ }
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
- sr_rec.ino = cpu_to_le64(realm->ino);
- sr_rec.seq = cpu_to_le64(realm->seq);
- sr_rec.parent = cpu_to_le64(realm->parent_ino);
- err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
- if (err)
+ err = encode_snap_realms(mdsc, &recon_state);
+ if (err < 0)
+ goto fail;
+
+ if (recon_state.msg_version >= 5) {
+ err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
+ if (err < 0)
goto fail;
}
- if (recon_state.flock)
- reply->hdr.version = cpu_to_le16(2);
- if (pagelist->length) {
- /* set up outbound data if we have any */
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+ if (recon_state.nr_caps || recon_state.nr_realms) {
+ struct page *page =
+ list_first_entry(&recon_state.pagelist->head,
+ struct page, lru);
+ __le32 *addr = kmap_atomic(page);
+ if (recon_state.nr_caps) {
+ WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ } else if (recon_state.msg_version >= 4) {
+ *(addr + 1) = cpu_to_le32(recon_state.nr_realms);
+ }
+ kunmap_atomic(addr);
}
+
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
+ if (recon_state.msg_version >= 4)
+ reply->hdr.compat_version = cpu_to_le16(4);
+
+ reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
+ ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
+
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -2626,6 +5025,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
+ ceph_pagelist_release(recon_state.pagelist);
return;
fail:
@@ -2633,10 +5033,10 @@ fail:
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
fail_nomsg:
- ceph_pagelist_release(pagelist);
- kfree(pagelist);
+ ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
- pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+ pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
+ err, mds);
return;
}
@@ -2651,48 +5051,64 @@ static void check_new_map(struct ceph_mds_client *mdsc,
struct ceph_mdsmap *newmap,
struct ceph_mdsmap *oldmap)
{
- int i;
+ int i, j, err;
int oldstate, newstate;
struct ceph_mds_session *s;
+ unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch);
- dout("check_new_map new %u old %u\n",
- newmap->m_epoch, oldmap->m_epoch);
+ if (newmap->m_info) {
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
+ set_bit(newmap->m_info[i].export_targets[j], targets);
+ }
+ }
- for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
- if (mdsc->sessions[i] == NULL)
+ for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
+ if (!mdsc->sessions[i])
continue;
s = mdsc->sessions[i];
oldstate = ceph_mdsmap_get_state(oldmap, i);
newstate = ceph_mdsmap_get_state(newmap, i);
- dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
- i, ceph_mds_state_name(oldstate),
- ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
- ceph_mds_state_name(newstate),
- ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
- session_state_name(s->s_state));
+ doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+ ceph_mds_state_name(newstate),
+ ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+ ceph_session_state_name(s->s_state));
+
+ if (i >= newmap->possible_max_rank) {
+ /* force close session for stopped mds */
+ ceph_get_mds_session(s);
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
- if (i >= newmap->m_max_mds ||
- memcmp(ceph_mdsmap_get_addr(oldmap, i),
- ceph_mdsmap_get_addr(newmap, i),
- sizeof(struct ceph_entity_addr))) {
- if (s->s_state == CEPH_MDS_SESSION_OPENING) {
- /* the session never opened, just close it
- * out now */
- __wake_requests(mdsc, &s->s_waiting);
- __unregister_session(mdsc, s);
- } else {
- /* just close it */
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_lock(&mdsc->mutex);
- ceph_con_close(&s->s_con);
- mutex_unlock(&s->s_mutex);
- s->s_state = CEPH_MDS_SESSION_RESTARTING;
- }
+ ceph_put_mds_session(s);
- /* kick any requests waiting on the recovering mds */
+ mutex_lock(&mdsc->mutex);
kick_requests(mdsc, i);
+ continue;
+ }
+
+ if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ ceph_mdsmap_get_addr(newmap, i),
+ sizeof(struct ceph_entity_addr))) {
+ /* just close it */
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
+ ceph_con_close(&s->s_con);
+ mutex_unlock(&s->s_mutex);
+ s->s_state = CEPH_MDS_SESSION_RESTARTING;
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -2703,6 +5119,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
newstate >= CEPH_MDS_STATE_RECONNECT) {
mutex_unlock(&mdsc->mutex);
+ clear_bit(i, targets);
send_mds_reconnect(mdsc, s);
mutex_lock(&mdsc->mutex);
}
@@ -2714,14 +5131,65 @@ static void check_new_map(struct ceph_mds_client *mdsc,
newstate >= CEPH_MDS_STATE_ACTIVE) {
if (oldstate != CEPH_MDS_STATE_CREATING &&
oldstate != CEPH_MDS_STATE_STARTING)
- pr_info("mds%d recovery completed\n", s->s_mds);
+ pr_info_client(cl, "mds%d recovery completed\n",
+ s->s_mds);
kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
ceph_kick_flushing_caps(mdsc, s);
- wake_up_session_caps(s, 1);
+ mutex_unlock(&s->s_mutex);
+ wake_up_session_caps(s, RECONNECT);
+ }
+ }
+
+ /*
+ * Only open and reconnect sessions that don't exist yet.
+ */
+ for (i = 0; i < newmap->possible_max_rank; i++) {
+ /*
+ * In case the import MDS is crashed just after
+ * the EImportStart journal is flushed, so when
+ * a standby MDS takes over it and is replaying
+ * the EImportStart journal the new MDS daemon
+ * will wait the client to reconnect it, but the
+ * client may never register/open the session yet.
+ *
+ * Will try to reconnect that MDS daemon if the
+ * rank number is in the export targets array and
+ * is the up:reconnect state.
+ */
+ newstate = ceph_mdsmap_get_state(newmap, i);
+ if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
+ continue;
+
+ /*
+ * The session maybe registered and opened by some
+ * requests which were choosing random MDSes during
+ * the mdsc->mutex's unlock/lock gap below in rare
+ * case. But the related MDS daemon will just queue
+ * that requests and be still waiting for the client's
+ * reconnection request in up:reconnect state.
+ */
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (likely(!s)) {
+ s = __open_export_target_session(mdsc, i);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ pr_err_client(cl,
+ "failed to open export target session, err %d\n",
+ err);
+ continue;
+ }
}
+ doutc(cl, "send reconnect to export target mds.%d\n", i);
+ mutex_unlock(&mdsc->mutex);
+ send_mds_reconnect(mdsc, s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
}
- for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -2730,8 +5198,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG ||
s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout(" connecting to export targets of laggy mds%d\n",
- i);
+ doutc(cl, " connecting to export targets of laggy mds%d\n", i);
__open_export_target_sessions(mdsc, s);
}
}
@@ -2758,6 +5225,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct dentry *parent, *dentry;
@@ -2769,7 +5237,10 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct qstr dname;
int release = 0;
- dout("handle_lease from mds%d\n", mds);
+ doutc(cl, "from mds%d\n", mds);
+
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
/* decode */
if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
@@ -2777,32 +5248,30 @@ static void handle_lease(struct ceph_mds_client *mdsc,
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
seq = le32_to_cpu(h->seq);
- dname.name = (void *)h + sizeof(*h) + sizeof(u32);
- dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
- if (dname.len != get_unaligned_le32(h+1))
+ dname.len = get_unaligned_le32(h + 1);
+ if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
goto bad;
-
- mutex_lock(&session->s_mutex);
- session->s_seq++;
+ dname.name = (void *)(h + 1) + sizeof(u32);
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease %s, ino %llx %p %.*s\n",
- ceph_lease_op_name(h->action), vino.ino, inode,
- dname.len, dname.name);
- if (inode == NULL) {
- dout("handle_lease no inode %llx\n", vino.ino);
+ doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action),
+ vino.ino, inode, dname.len, dname.name);
+
+ mutex_lock(&session->s_mutex);
+ if (!inode) {
+ doutc(cl, "no inode %llx\n", vino.ino);
goto release;
}
/* dentry */
parent = d_find_alias(inode);
if (!parent) {
- dout("no parent dentry on inode %p\n", inode);
+ doutc(cl, "no parent dentry on inode %p\n", inode);
WARN_ON(1);
goto release; /* hrm... */
}
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
dentry = d_lookup(parent, &dname);
dput(parent);
if (!dentry)
@@ -2822,14 +5291,14 @@ static void handle_lease(struct ceph_mds_client *mdsc,
case CEPH_MDS_LEASE_RENEW:
if (di->lease_session == session &&
- di->lease_gen == session->s_cap_gen &&
+ di->lease_gen == atomic_read(&session->s_cap_gen) &&
di->lease_renew_from &&
di->lease_renew_after == 0) {
unsigned long duration =
- le32_to_cpu(h->duration_ms) * HZ / 1000;
+ msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
- dentry->d_time = di->lease_renew_from + duration;
+ di->time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
di->lease_renew_from = 0;
@@ -2849,136 +5318,153 @@ release:
ceph_con_send(&session->s_con, msg);
out:
- iput(inode);
mutex_unlock(&session->s_mutex);
+ iput(inode);
+
+ ceph_dec_mds_stopping_blocker(mdsc);
return;
bad:
- pr_err("corrupt lease message\n");
+ ceph_dec_mds_stopping_blocker(mdsc);
+
+ pr_err_client(cl, "corrupt lease message\n");
ceph_msg_dump(msg);
}
void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
struct dentry *dentry, char action,
u32 seq)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_lease *lease;
- int len = sizeof(*lease) + sizeof(u32);
- int dnamelen = 0;
+ struct inode *dir;
+ int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
- dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
- inode, dentry, ceph_lease_op_name(action), session->s_mds);
- dnamelen = dentry->d_name.len;
- len += dnamelen;
+ doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action),
+ session->s_mds);
msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
return;
lease = msg->front.iov_base;
lease->action = action;
- lease->ino = cpu_to_le64(ceph_vino(inode).ino);
- lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
lease->seq = cpu_to_le32(seq);
- put_unaligned_le32(dnamelen, lease + 1);
- memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+ spin_lock(&dentry->d_lock);
+ dir = d_inode(dentry->d_parent);
+ lease->ino = cpu_to_le64(ceph_ino(dir));
+ lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
+
+ put_unaligned_le32(dentry->d_name.len, lease + 1);
+ memcpy((void *)(lease + 1) + 4,
+ dentry->d_name.name, dentry->d_name.len);
+ spin_unlock(&dentry->d_lock);
ceph_con_send(&session->s_con, msg);
}
/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
+ * lock unlock the session, to wait ongoing session activities
*/
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry)
+static void lock_unlock_session(struct ceph_mds_session *s)
{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
+}
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
+static void maybe_recover_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_fs_client *fsc = mdsc->fsc;
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease\n",
- inode, dentry);
- spin_unlock(&dentry->d_lock);
+ if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
return;
- }
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
+ if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
+ return;
- dout("lease_release inode %p dentry %p to mds%d\n",
- inode, dentry, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
+ if (!READ_ONCE(fsc->blocklisted))
+ return;
+
+ pr_info_client(cl, "auto reconnect after blocklisted\n");
+ ceph_force_reconnect(fsc->sb);
}
-/*
- * drop all leases (and dentry refs) in preparation for umount
- */
-static void drop_leases(struct ceph_mds_client *mdsc)
+bool check_session_state(struct ceph_mds_session *s)
{
- int i;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
- dout("drop_leases\n");
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
+ switch (s->s_state) {
+ case CEPH_MDS_SESSION_OPEN:
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info_client(cl, "mds%d hung\n", s->s_mds);
+ }
+ break;
+ case CEPH_MDS_SESSION_CLOSING:
+ case CEPH_MDS_SESSION_NEW:
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_CLOSED:
+ case CEPH_MDS_SESSION_REJECTED:
+ return false;
}
- mutex_unlock(&mdsc->mutex);
+
+ return true;
}
+/*
+ * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
+ * then we need to retransmit that request.
+ */
+void inc_session_sequence(struct ceph_mds_session *s)
+{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
+ lockdep_assert_held(&s->s_mutex);
+
+ s->s_seq++;
+
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ int ret;
+ doutc(cl, "resending session close request for mds%d\n", s->s_mds);
+ ret = request_close_session(s);
+ if (ret < 0)
+ pr_err_client(cl, "unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
+ }
+}
/*
- * delayed work -- periodically trim expired leases, renew caps with mds
+ * delayed work -- periodically trim expired leases, renew caps with mds. If
+ * the @delay parameter is set to 0 or if it's more than 5 secs, the default
+ * workqueue delay value of 5 secs will be used.
*/
-static void schedule_delayed(struct ceph_mds_client *mdsc)
+static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
- schedule_delayed_work(&mdsc->delayed_work, hz);
+ unsigned long max_delay = HZ * 5;
+
+ /* 5 secs default delay */
+ if (!delay || (delay > max_delay))
+ delay = max_delay;
+ schedule_delayed_work(&mdsc->delayed_work,
+ round_jiffies_relative(delay));
}
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
- dout("mdsc delayed_work\n");
- ceph_check_delayed_caps(mdsc);
+ doutc(mdsc->fsc->client, "mdsc delayed_work\n");
+
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
+ return;
mutex_lock(&mdsc->mutex);
renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -2989,34 +5475,22 @@ static void delayed_work(struct work_struct *work)
for (i = 0; i < mdsc->max_sessions; i++) {
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (s == NULL)
- continue;
- if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout("resending session close request for mds%d\n",
- s->s_mds);
- request_close_session(mdsc, s);
- ceph_put_mds_session(s);
+ if (!s)
continue;
- }
- if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
- if (s->s_state == CEPH_MDS_SESSION_OPEN) {
- s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
- }
- }
- if (s->s_state < CEPH_MDS_SESSION_OPEN) {
- /* this mds is failed or recovering, just wait */
+
+ if (!check_session_state(s)) {
ceph_put_mds_session(s);
continue;
}
mutex_unlock(&mdsc->mutex);
+ ceph_flush_session_cap_releases(mdsc, s);
+
mutex_lock(&s->s_mutex);
if (renew_caps)
send_renew_caps(mdsc, s);
else
ceph_con_keepalive(&s->s_con);
- ceph_add_cap_releases(mdsc, s);
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG)
ceph_send_cap_releases(mdsc, s);
@@ -3027,57 +5501,95 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
+
+ ceph_queue_cap_reclaim_work(mdsc);
+
+ ceph_trim_snapid_map(mdsc);
+
+ maybe_recover_session(mdsc);
+
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc;
+ int err;
mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
if (!mdsc)
return -ENOMEM;
mdsc->fsc = fsc;
- fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL) {
- kfree(mdsc);
- return -ENOMEM;
+ if (!mdsc->mdsmap) {
+ err = -ENOMEM;
+ goto err_mdsc;
}
init_completion(&mdsc->safe_umount_waiters);
+ spin_lock_init(&mdsc->stopping_lock);
+ atomic_set(&mdsc->stopping_blockers, 0);
+ init_completion(&mdsc->stopping_waiter);
+ atomic64_set(&mdsc->dirty_folios, 0);
+ init_waitqueue_head(&mdsc->flush_end_wq);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
- mdsc->sessions = NULL;
- mdsc->max_sessions = 0;
- mdsc->stopping = 0;
+ mdsc->quotarealms_inodes = RB_ROOT;
+ mutex_init(&mdsc->quotarealms_inodes_mutex);
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
spin_lock_init(&mdsc->snap_empty_lock);
- mdsc->last_tid = 0;
mdsc->request_tree = RB_ROOT;
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
INIT_LIST_HEAD(&mdsc->cap_delay_list);
+#ifdef CONFIG_DEBUG_FS
+ INIT_LIST_HEAD(&mdsc->cap_wait_list);
+#endif
spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
- mdsc->cap_flush_seq = 0;
- INIT_LIST_HEAD(&mdsc->cap_dirty);
+ mdsc->last_cap_flush_tid = 1;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
- mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
- spin_lock_init(&mdsc->dentry_lru_lock);
- INIT_LIST_HEAD(&mdsc->dentry_lru);
+ INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
+ err = ceph_metric_init(&mdsc->metric);
+ if (err)
+ goto err_mdsmap;
+
+ spin_lock_init(&mdsc->dentry_list_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_leases);
+ INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
+
+ spin_lock_init(&mdsc->snapid_map_lock);
+ mdsc->snapid_map_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snapid_map_lru);
+
+ init_rwsem(&mdsc->pool_perm_rwsem);
+ mdsc->pool_perm_tree = RB_ROOT;
+
+ strscpy(mdsc->nodename, utsname()->nodename,
+ sizeof(mdsc->nodename));
+ fsc->mdsc = mdsc;
return 0;
+
+err_mdsmap:
+ kfree(mdsc->mdsmap);
+err_mdsc:
+ kfree(mdsc);
+ return err;
}
/*
@@ -3086,27 +5598,228 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
*/
static void wait_requests(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req;
- struct ceph_fs_client *fsc = mdsc->fsc;
mutex_lock(&mdsc->mutex);
if (__get_oldest_req(mdsc)) {
mutex_unlock(&mdsc->mutex);
- dout("wait_requests waiting for requests\n");
+ doutc(cl, "waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
- fsc->client->options->mount_timeout * HZ);
+ ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
while ((req = __get_oldest_req(mdsc))) {
- dout("wait_requests timed out on tid %llu\n",
- req->r_tid);
+ doutc(cl, "timed out on tid %llu\n", req->r_tid);
+ list_del_init(&req->r_wait);
__unregister_request(mdsc, req);
}
}
mutex_unlock(&mdsc->mutex);
- dout("wait_requests done\n");
+ doutc(cl, "done\n");
+}
+
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+ struct ceph_msg *msg;
+
+ /*
+ * Pre-luminous MDS crashes when it sees an unknown session request
+ */
+ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+ return;
+
+ mutex_lock(&s->s_mutex);
+ doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+ s->s_seq);
+ if (!msg) {
+ pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ } else {
+ ceph_con_send(&s->s_con, msg);
+ }
+ mutex_unlock(&s->s_mutex);
+}
+
+static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,
+ struct ceph_mds_cap_auth *auth,
+ const struct cred *cred,
+ char *tpath)
+{
+ u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);
+ u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);
+ struct ceph_client *cl = mdsc->fsc->client;
+ const char *fs_name = mdsc->fsc->mount_options->mds_namespace;
+ const char *spath = mdsc->fsc->mount_options->server_path;
+ bool gid_matched = false;
+ u32 gid, tlen, len;
+ int i, j;
+
+ doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n",
+ fs_name, auth->match.fs_name ? auth->match.fs_name : "");
+ if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) {
+ /* fsname mismatch, try next one */
+ return 0;
+ }
+
+ doutc(cl, "match.uid %lld\n", auth->match.uid);
+ if (auth->match.uid != MDS_AUTH_UID_ANY) {
+ if (auth->match.uid != caller_uid)
+ return 0;
+ if (auth->match.num_gids) {
+ for (i = 0; i < auth->match.num_gids; i++) {
+ if (caller_gid == auth->match.gids[i])
+ gid_matched = true;
+ }
+ if (!gid_matched && cred->group_info->ngroups) {
+ for (i = 0; i < cred->group_info->ngroups; i++) {
+ gid = from_kgid(&init_user_ns,
+ cred->group_info->gid[i]);
+ for (j = 0; j < auth->match.num_gids; j++) {
+ if (gid == auth->match.gids[j]) {
+ gid_matched = true;
+ break;
+ }
+ }
+ if (gid_matched)
+ break;
+ }
+ }
+ if (!gid_matched)
+ return 0;
+ }
+ }
+
+ /* path match */
+ if (auth->match.path) {
+ if (!tpath)
+ return 0;
+
+ tlen = strlen(tpath);
+ len = strlen(auth->match.path);
+ if (len) {
+ char *_tpath = tpath;
+ bool free_tpath = false;
+ int m, n;
+
+ doutc(cl, "server path %s, tpath %s, match.path %s\n",
+ spath, tpath, auth->match.path);
+ if (spath && (m = strlen(spath)) != 1) {
+ /* mount path + '/' + tpath + an extra space */
+ n = m + 1 + tlen + 1;
+ _tpath = kmalloc(n, GFP_NOFS);
+ if (!_tpath)
+ return -ENOMEM;
+ /* remove the leading '/' */
+ snprintf(_tpath, n, "%s/%s", spath + 1, tpath);
+ free_tpath = true;
+ tlen = strlen(_tpath);
+ }
+
+ /*
+ * Please note the tailing '/' for match.path has already
+ * been removed when parsing.
+ *
+ * Remove the tailing '/' for the target path.
+ */
+ while (tlen && _tpath[tlen - 1] == '/') {
+ _tpath[tlen - 1] = '\0';
+ tlen -= 1;
+ }
+ doutc(cl, "_tpath %s\n", _tpath);
+
+ /*
+ * In case first == _tpath && tlen == len:
+ * match.path=/foo --> /foo _path=/foo --> match
+ * match.path=/foo/ --> /foo _path=/foo --> match
+ *
+ * In case first == _tmatch.path && tlen > len:
+ * match.path=/foo/ --> /foo _path=/foo/ --> match
+ * match.path=/foo --> /foo _path=/foo/ --> match
+ * match.path=/foo/ --> /foo _path=/foo/d --> match
+ * match.path=/foo --> /foo _path=/food --> mismatch
+ *
+ * All the other cases --> mismatch
+ */
+ bool path_matched = true;
+ char *first = strstr(_tpath, auth->match.path);
+ if (first != _tpath ||
+ (tlen > len && _tpath[len] != '/')) {
+ path_matched = false;
+ }
+
+ if (free_tpath)
+ kfree(_tpath);
+
+ if (!path_matched)
+ return 0;
+ }
+ }
+
+ doutc(cl, "matched\n");
+ return 1;
+}
+
+int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask)
+{
+ const struct cred *cred = get_current_cred();
+ u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid);
+ u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid);
+ struct ceph_mds_cap_auth *rw_perms_s = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
+ bool root_squash_perms = true;
+ int i, err;
+
+ doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n",
+ tpath, mask, caller_uid, caller_gid);
+
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i];
+
+ err = ceph_mds_auth_match(mdsc, s, cred, tpath);
+ if (err < 0) {
+ put_cred(cred);
+ return err;
+ } else if (err > 0) {
+ /* always follow the last auth caps' permission */
+ root_squash_perms = true;
+ rw_perms_s = NULL;
+ if ((mask & MAY_WRITE) && s->writeable &&
+ s->match.root_squash && (!caller_uid || !caller_gid))
+ root_squash_perms = false;
+
+ if (((mask & MAY_WRITE) && !s->writeable) ||
+ ((mask & MAY_READ) && !s->readable))
+ rw_perms_s = s;
+ }
+ }
+
+ put_cred(cred);
+
+ doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms,
+ rw_perms_s);
+ if (root_squash_perms && rw_perms_s == NULL) {
+ doutc(cl, "access allowed\n");
+ return 0;
+ }
+
+ if (!root_squash_perms) {
+ doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write",
+ caller_uid, caller_gid);
+ }
+ if (rw_perms_s) {
+ doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d",
+ rw_perms_s->readable, rw_perms_s->writeable,
+ !!(mask & MAY_READ), !!(mask & MAY_WRITE));
+ }
+ doutc(cl, "access denied\n");
+ return -EACCES;
}
/*
@@ -3115,10 +5828,11 @@ static void wait_requests(struct ceph_mds_client *mdsc)
*/
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
- dout("pre_umount\n");
- mdsc->stopping = 1;
+ doutc(mdsc->fsc->client, "begin\n");
+ mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
- drop_leases(mdsc);
+ ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
+ ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
@@ -3127,18 +5841,24 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
* their inode/dcache refs
*/
ceph_msgr_flush();
+
+ ceph_cleanup_quotarealms_inodes(mdsc);
+ doutc(mdsc->fsc->client, "done\n");
}
/*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush.
*/
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ doutc(cl, "want %lld\n", want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -3148,15 +5868,34 @@ restart:
nextreq = rb_entry(n, struct ceph_mds_request, r_node);
else
nextreq = NULL;
- if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+ if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+ (req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
- req->r_tid, want_tid);
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ doutc(cl, "wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
+
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
if (!nextreq)
@@ -3171,58 +5910,63 @@ restart:
req = nextreq;
}
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ doutc(cl, "done\n");
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
return;
- dout("sync\n");
+ doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
ceph_flush_dirty_caps(mdsc);
+ ceph_flush_cap_releases(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
- wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+ wait_caps_flush(mdsc, want_flush);
}
/*
* true if all sessions are closed, or we force unmount
*/
-static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
{
- int i, n = 0;
-
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
+ return atomic_read(&mdsc->num_sessions) <= skipped;
}
/*
- * called after sb is ro.
+ * called after sb is ro or when metadata corrupted.
*/
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session;
int i;
- struct ceph_fs_client *fsc = mdsc->fsc;
- unsigned long timeout = fsc->client->options->mount_timeout * HZ;
+ int skipped = 0;
- dout("close_sessions\n");
+ doutc(cl, "begin\n");
/* close sessions */
mutex_lock(&mdsc->mutex);
@@ -3232,22 +5976,24 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
continue;
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- __close_session(mdsc, session);
+ if (__close_session(mdsc, session) <= 0)
+ skipped++;
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
- dout("waiting for sessions to close\n");
- wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
- timeout);
+ doutc(cl, "waiting for sessions to close\n");
+ wait_event_timeout(mdsc->session_close_wq,
+ done_closing_sessions(mdsc, skipped),
+ ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining sessions */
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i]) {
- session = get_session(mdsc->sessions[i]);
+ session = ceph_get_mds_session(mdsc->sessions[i]);
__unregister_session(mdsc, session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -3260,44 +6006,179 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
WARN_ON(!list_empty(&mdsc->cap_delay_list));
mutex_unlock(&mdsc->mutex);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_snapid_map(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
+ cancel_work_sync(&mdsc->cap_reclaim_work);
+ cancel_work_sync(&mdsc->cap_unlink_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- dout("stopped\n");
+ doutc(cl, "done\n");
+}
+
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ doutc(mdsc->fsc->client, "force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+
+ if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+ __unregister_session(mdsc, session);
+ __wake_requests(mdsc, &session->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
}
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
- dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ doutc(mdsc->fsc->client, "stop\n");
+ /*
+ * Make sure the delayed work stopped before releasing
+ * the resources.
+ *
+ * Because the cancel_delayed_work_sync() will only
+ * guarantee that the work finishes executing. But the
+ * delayed work will re-arm itself again after that.
+ */
+ flush_delayed_work(&mdsc->delayed_work);
+
if (mdsc->mdsmap)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
ceph_caps_finalize(mdsc);
+
+ if (mdsc->s_cap_auths) {
+ int i;
+
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ kfree(mdsc->s_cap_auths[i].match.gids);
+ kfree(mdsc->s_cap_auths[i].match.path);
+ kfree(mdsc->s_cap_auths[i].match.fs_name);
+ }
+ kfree(mdsc->s_cap_auths);
+ }
+
+ ceph_pool_perm_destroy(mdsc);
}
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
+ doutc(fsc->client, "%p\n", mdsc);
- dout("mdsc_destroy %p\n", mdsc);
- ceph_mdsc_stop(mdsc);
+ if (!mdsc)
+ return;
/* flush out any connection work with references to us */
ceph_msgr_flush();
+ ceph_mdsc_stop(mdsc);
+
+ ceph_metric_destroy(&mdsc->metric);
+
fsc->mdsc = NULL;
kfree(mdsc);
- dout("mdsc_destroy %p done\n", mdsc);
+ doutc(fsc->client, "%p done\n", mdsc);
}
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ struct ceph_client *cl = fsc->client;
+ const char *mds_namespace = fsc->mount_options->mds_namespace;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ u32 epoch;
+ u32 num_fs;
+ u32 mount_fscid = (u32)-1;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+
+ doutc(cl, "epoch %u\n", epoch);
+
+ /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
+ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
+
+ ceph_decode_32_safe(&p, end, num_fs, bad);
+ while (num_fs-- > 0) {
+ void *info_p, *info_end;
+ u32 info_len;
+ u32 fscid, namelen;
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ p += 2; // info_v, info_cv
+ info_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, info_len, bad);
+ info_p = p;
+ info_end = p + info_len;
+ p = info_end;
+
+ ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+ fscid = ceph_decode_32(&info_p);
+ namelen = ceph_decode_32(&info_p);
+ ceph_decode_need(&info_p, info_end, namelen, bad);
+
+ if (mds_namespace &&
+ strlen(mds_namespace) == namelen &&
+ !strncmp(mds_namespace, (char *)info_p, namelen)) {
+ mount_fscid = fscid;
+ break;
+ }
+ }
+
+ ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+ if (mount_fscid != (u32)-1) {
+ fsc->client->monc.fs_cluster_id = mount_fscid;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ ceph_monc_renew_subs(&fsc->client->monc);
+ } else {
+ err = -ENOENT;
+ goto err_out;
+ }
+ return;
+
+bad:
+ pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n",
+ err);
+ ceph_umount_begin(mdsc->fsc->sb);
+ ceph_msg_dump(msg);
+err_out:
+ mutex_lock(&mdsc->mutex);
+ mdsc->mdsmap_err = err;
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
/*
* handle mds map update.
*/
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 epoch;
u32 maplen;
void *p = msg->front.iov_base;
@@ -3312,19 +6193,17 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+ doutc(cl, "epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
+ doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
return;
}
- newmap = ceph_mdsmap_decode(&p, end);
+ newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad_unlock;
@@ -3339,38 +6218,40 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
} else {
mdsc->mdsmap = newmap; /* first mds map */
}
- mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+ mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
+ MAX_LFS_FILESIZE);
__wake_requests(mdsc, &mdsc->waiting_for_map);
+ ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
+ mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
- schedule_delayed(mdsc);
+ schedule_delayed(mdsc, 0);
return;
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d\n", err);
+ pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n",
+ err);
+ ceph_umount_begin(mdsc->fsc->sb);
+ ceph_msg_dump(msg);
return;
}
-static struct ceph_connection *con_get(struct ceph_connection *con)
+static struct ceph_connection *mds_get_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+ if (ceph_get_mds_session(s))
return con;
- }
- dout("mdsc con_get %p FAIL\n", s);
return NULL;
}
-static void con_put(struct ceph_connection *con)
+static void mds_put_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
@@ -3378,19 +6259,23 @@ static void con_put(struct ceph_connection *con)
* if the client is unresponsive for long enough, the mds will kill
* the session entirely.
*/
-static void peer_reset(struct ceph_connection *con)
+static void mds_peer_reset(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warning("mds%d closed our session\n", s->s_mds);
- send_mds_reconnect(mdsc, s);
+ pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
+ s->s_mds);
+ if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
+ send_mds_reconnect(mdsc, s);
}
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int type = le16_to_cpu(msg->hdr.type);
mutex_lock(&mdsc->mutex);
@@ -3402,7 +6287,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
break;
case CEPH_MSG_CLIENT_SESSION:
handle_session(s, msg);
@@ -3422,10 +6310,13 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
case CEPH_MSG_CLIENT_LEASE:
handle_lease(mdsc, s, msg);
break;
+ case CEPH_MSG_CLIENT_QUOTA:
+ ceph_handle_quota(mdsc, s, msg);
+ break;
default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
+ pr_err_client(cl, "received unknown message type %d %s\n",
+ type, ceph_msg_type_name(type));
}
out:
ceph_msg_put(msg);
@@ -3439,45 +6330,47 @@ out:
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
*/
-static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
- int *proto, int force_new)
+static struct ceph_auth_handshake *
+mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
- if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(ac, auth->authorizer);
- auth->authorizer = NULL;
- }
- if (!auth->authorizer) {
- int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
- auth);
- if (ret)
- return ERR_PTR(ret);
- } else {
- int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
- auth);
- if (ret)
- return ERR_PTR(ret);
- }
- *proto = ac->protocol;
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
return auth;
}
+static int mds_add_authorizer_challenge(struct ceph_connection *con,
+ void *challenge_buf, int challenge_buf_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
+ return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
+ challenge_buf, challenge_buf_len);
+}
+
+static int mds_verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
- return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
}
-static int invalidate_authorizer(struct ceph_connection *con)
+static int mds_invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -3488,6 +6381,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
+static int mds_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int mds_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int mds_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int mds_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
struct ceph_msg_header *hdr, int *skip)
{
@@ -3509,15 +6476,38 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
+static int mds_sign_message(struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = msg->con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_sign_message(auth, msg);
+}
+
+static int mds_check_message_signature(struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = msg->con->private;
+ struct ceph_auth_handshake *auth = &s->s_auth;
+
+ return ceph_auth_check_message_signature(auth, msg);
+}
+
static const struct ceph_connection_operations mds_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .peer_reset = peer_reset,
+ .get = mds_get_con,
+ .put = mds_put_con,
.alloc_msg = mds_alloc_msg,
+ .dispatch = mds_dispatch,
+ .peer_reset = mds_peer_reset,
+ .get_authorizer = mds_get_authorizer,
+ .add_authorizer_challenge = mds_add_authorizer_challenge,
+ .verify_authorizer_reply = mds_verify_authorizer_reply,
+ .invalidate_authorizer = mds_invalidate_authorizer,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
+ .get_auth_request = mds_get_auth_request,
+ .handle_auth_reply_more = mds_handle_auth_reply_more,
+ .handle_auth_done = mds_handle_auth_done,
+ .handle_auth_bad_method = mds_handle_auth_bad_method,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fbbe517..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_MDS_CLIENT_H
#define _FS_CEPH_MDS_CLIENT_H
@@ -7,12 +8,54 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/refcount.h>
+#include <linux/utsname.h>
+#include <linux/ktime.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
+#include "mdsmap.h"
+#include "metric.h"
+#include "super.h"
+
+/* The first 8 bits are reserved for old ceph releases */
+enum ceph_feature_type {
+ CEPHFS_FEATURE_MIMIC = 8,
+ CEPHFS_FEATURE_REPLY_ENCODING,
+ CEPHFS_FEATURE_RECLAIM_CLIENT,
+ CEPHFS_FEATURE_LAZY_CAP_WANTED,
+ CEPHFS_FEATURE_MULTI_RECONNECT,
+ CEPHFS_FEATURE_DELEG_INO,
+ CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_ALTERNATE_NAME,
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
+ CEPHFS_FEATURE_OP_GETVXATTR,
+ CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID,
+ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
+
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK,
+};
+
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ CEPHFS_FEATURE_MIMIC, \
+ CEPHFS_FEATURE_REPLY_ENCODING, \
+ CEPHFS_FEATURE_LAZY_CAP_WANTED, \
+ CEPHFS_FEATURE_MULTI_RECONNECT, \
+ CEPHFS_FEATURE_DELEG_INO, \
+ CEPHFS_FEATURE_METRIC_COLLECT, \
+ CEPHFS_FEATURE_ALTERNATE_NAME, \
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
+ CEPHFS_FEATURE_OP_GETVXATTR, \
+ CEPHFS_FEATURE_32BITS_RETRY_FWD, \
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
+ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \
+}
+
/*
* Some lock dependencies:
*
@@ -30,6 +73,24 @@
struct ceph_fs_client;
struct ceph_cap;
+#define MDS_AUTH_UID_ANY -1
+
+struct ceph_mds_cap_match {
+ s64 uid; /* default to MDS_AUTH_UID_ANY */
+ u32 num_gids;
+ u32 *gids; /* use these GIDs */
+ char *path; /* require path to be child of this
+ (may be "" or "/" for any) */
+ char *fs_name;
+ bool root_squash; /* default to false */
+};
+
+struct ceph_mds_cap_auth {
+ struct ceph_mds_cap_match match;
+ bool readable;
+ bool writeable;
+};
+
/*
* parsed info about a single inode. pointers are into the encoded
* on-wire structures within the mds reply message payload.
@@ -41,6 +102,37 @@ struct ceph_mds_reply_info_in {
char *symlink;
u32 xattr_len;
char *xattr_data;
+ u64 inline_version;
+ u32 inline_len;
+ char *inline_data;
+ u32 pool_ns_len;
+ char *pool_ns_data;
+ u64 max_bytes;
+ u64 max_files;
+ s32 dir_pin;
+ struct ceph_timespec btime;
+ struct ceph_timespec snap_btime;
+ u8 *fscrypt_auth;
+ u8 *fscrypt_file;
+ u32 fscrypt_auth_len;
+ u32 fscrypt_file_len;
+ u64 rsnaps;
+ u64 change_attr;
+};
+
+struct ceph_mds_reply_dir_entry {
+ bool is_nokey;
+ char *name;
+ u32 name_len;
+ u32 raw_hash;
+ struct ceph_mds_reply_lease *lease;
+ struct ceph_mds_reply_info_in inode;
+ loff_t offset;
+};
+
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
};
/*
@@ -56,8 +148,11 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in diri, targeti;
struct ceph_mds_reply_dirfrag *dirfrag;
char *dname;
+ u8 *altname;
u32 dname_len;
+ u32 altname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -67,12 +162,13 @@ struct ceph_mds_reply_info_parsed {
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
+ bool dir_end;
+ bool dir_complete;
+ bool hash_order;
+ bool offset_hash;
+ struct ceph_mds_reply_dir_entry *dir_entries;
};
/* for create results */
@@ -91,10 +187,13 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
+ sizeof(struct ceph_mds_cap_item))
/*
@@ -105,9 +204,11 @@ enum {
CEPH_MDS_SESSION_OPENING = 2,
CEPH_MDS_SESSION_OPEN = 3,
CEPH_MDS_SESSION_HUNG = 4,
- CEPH_MDS_SESSION_CLOSING = 5,
- CEPH_MDS_SESSION_RESTARTING = 6,
- CEPH_MDS_SESSION_RECONNECTING = 7,
+ CEPH_MDS_SESSION_RESTARTING = 5,
+ CEPH_MDS_SESSION_RECONNECTING = 6,
+ CEPH_MDS_SESSION_CLOSING = 7,
+ CEPH_MDS_SESSION_CLOSED = 8,
+ CEPH_MDS_SESSION_REJECTED = 9,
};
struct ceph_mds_session {
@@ -115,6 +216,7 @@ struct ceph_mds_session {
int s_mds;
int s_state;
unsigned long s_ttl; /* time until mds kills us */
+ unsigned long s_features;
u64 s_seq; /* incoming msg seq # */
struct mutex s_mutex; /* serialize session messages */
@@ -122,29 +224,33 @@ struct ceph_mds_session {
struct ceph_auth_handshake s_auth;
- /* protected by s_gen_ttl_lock */
- spinlock_t s_gen_ttl_lock;
- u32 s_cap_gen; /* inc each time we get mds stale msg */
- unsigned long s_cap_ttl; /* when session caps expire */
+ atomic_t s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
+ refcount_t s_ref;
struct list_head s_caps; /* all caps issued by this session */
- int s_nr_caps, s_trim_caps;
+ struct ceph_cap *s_cap_iterator;
+ int s_nr_caps;
int s_num_cap_releases;
+ int s_cap_reconnect;
+ int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct list_head s_cap_releases_done; /* ready to send */
- struct ceph_cap *s_cap_iterator;
+ struct work_struct s_cap_release_work;
+
+ /* See ceph_inode_info->i_dirty_item. */
+ struct list_head s_cap_dirty; /* inodes w/ dirty caps */
- /* protected by mutex */
+ /* See ceph_inode_info->i_flushing_item. */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
+
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- atomic_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
+ struct xarray s_delegated_inos;
};
/*
@@ -164,6 +270,11 @@ struct ceph_mds_client;
*/
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
struct ceph_mds_request *req);
+/*
+ * wait for request completion callback
+ */
+typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
/*
* an in-flight mds request
@@ -173,6 +284,7 @@ struct ceph_mds_request {
struct rb_node r_node;
struct ceph_mds_client *r_mdsc;
+ struct kref r_kref;
int r_op; /* mds op code */
/* operation on what? */
@@ -183,25 +295,45 @@ struct ceph_mds_request {
char *r_path1, *r_path2;
struct ceph_vino r_ino1, r_ino2;
- struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+ struct inode *r_parent; /* parent dir inode */
struct inode *r_target_inode; /* resulting inode */
+ struct inode *r_new_inode; /* new inode (for creates) */
+
+ const struct qstr *r_dname; /* stable name (for ->d_revalidate) */
+
+#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
+#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
+#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */
+#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */
+#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
+#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
+#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC (8) /* async request */
+#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */
+ unsigned long r_req_flags;
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args;
+
+ struct ceph_fscrypt_auth *r_fscrypt_auth;
+ u64 r_fscrypt_file;
+
+ u8 *r_altname; /* fscrypt binary crypttext for long filenames */
+ u32 r_altname_len; /* length of r_altname */
+
int r_fmode; /* file mode, if expecting cap */
- kuid_t r_uid;
- kgid_t r_gid;
+ int r_request_release_offset;
+ const struct cred *r_cred;
+ struct mnt_idmap *r_mnt_idmap;
+ struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
u32 r_direct_hash; /* choose dir frag based on this dentry hash */
- bool r_direct_is_hash; /* true if r_direct_hash is valid */
/* data payload is used for xattr ops */
- struct page **r_pages;
- int r_num_pages;
- int r_data_len;
+ struct ceph_pagelist *r_pagelist;
/* what caps shall we drop? */
int r_inode_drop, r_inode_unless;
@@ -211,14 +343,19 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
- int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
- bool r_aborted;
+ u32 r_readdir_offset;
+
+ struct page *r_locked_page;
+ int r_dir_caps;
+ int r_num_caps;
- unsigned long r_timeout; /* optional. jiffies */
+ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
+ unsigned long r_start_latency; /* start time to measure latency */
+ unsigned long r_end_latency; /* finish time to measure latency */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
@@ -226,26 +363,77 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
+ u64 r_deleg_ino;
- struct kref r_kref;
struct list_head r_wait;
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
struct list_head r_unsafe_item; /* per-session unsafe list item */
- bool r_got_unsafe, r_got_safe, r_got_result;
- bool r_did_prepopulate;
- u32 r_readdir_offset;
+ long long r_dir_release_cnt;
+ long long r_dir_ordered_cnt;
+ int r_readdir_cache_idx;
+
+ int r_feature_needed;
struct ceph_cap_reservation r_caps_reservation;
- int r_num_caps;
+};
+
+struct ceph_pool_perm {
+ struct rb_node node;
+ int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
+};
+
+struct ceph_snapid_map {
+ struct rb_node node;
+ struct list_head lru;
+ atomic_t ref;
+ dev_t dev;
+ u64 snap;
+ unsigned long last_used;
+};
+
+/*
+ * node for list of quotarealm inodes that are not visible from the filesystem
+ * mountpoint, but required to handle, e.g. quotas.
+ */
+struct ceph_quotarealm_inode {
+ struct rb_node node;
+ u64 ino;
+ unsigned long timeout; /* last time a lookup failed for this inode */
+ struct mutex mutex;
+ struct inode *inode;
+};
+
+#ifdef CONFIG_DEBUG_FS
+
+struct cap_wait {
+ struct list_head list;
+ u64 ino;
+ pid_t tgid;
+ int need;
+ int want;
+};
+
+#endif
+
+enum {
+ CEPH_MDSC_STOPPING_BEGIN = 1,
+ CEPH_MDSC_STOPPING_FLUSHING = 2,
+ CEPH_MDSC_STOPPING_FLUSHED = 3,
};
/*
@@ -259,10 +447,27 @@ struct ceph_mds_client {
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
+ int mdsmap_err;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
- int max_sessions; /* len of s_mds_sessions */
- int stopping; /* true if shutting down */
+ atomic_t num_sessions;
+ int max_sessions; /* len of sessions array */
+
+ spinlock_t stopping_lock; /* protect snap_empty */
+ int stopping; /* the stage of shutting down */
+ atomic_t stopping_blockers;
+ struct completion stopping_waiter;
+
+ atomic64_t dirty_folios;
+ wait_queue_head_t flush_end_wq;
+
+ atomic64_t quotarealms_count; /* # realms with quota */
+ /*
+ * We keep a list of inodes we don't see in the mountpoint but that we
+ * need to track quota realms.
+ */
+ struct rb_root quotarealms_inodes;
+ struct mutex quotarealms_inodes_mutex;
/*
* snap_rwsem will cover cap linkage into snaprealms, and
@@ -271,27 +476,37 @@ struct ceph_mds_client {
* references (implying they contain no inodes with caps) that
* should be destroyed.
*/
+ u64 last_snap_seq;
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
+ int num_snap_realms;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
+ u64 oldest_tid; /* oldest incomplete mds request,
+ excluding setfilelock requests */
struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
- u64 cap_flush_seq;
- struct list_head cap_dirty; /* inodes with dirty caps */
+ u64 last_cap_flush_tid;
+ struct list_head cap_flush_list;
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
+ struct work_struct cap_unlink_work;
+
/*
* Cap reservations
*
@@ -306,53 +521,72 @@ struct ceph_mds_client {
spinlock_t caps_list_lock;
struct list_head caps_list; /* unused (reserved or
unreserved) */
+#ifdef CONFIG_DEBUG_FS
+ struct list_head cap_wait_list;
+#endif
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
int caps_reserve_count; /* unused, reserved */
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ struct ceph_client_metric metric;
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
+
+ struct rw_semaphore pool_perm_rwsem;
+ struct rb_root pool_perm_tree;
+
+ u32 s_cap_auths_num;
+ struct ceph_mds_cap_auth *s_cap_auths;
+
+ char nodename[__NEW_UTS_LEN + 1];
};
extern const char *ceph_mds_op_name(int op);
+extern bool check_session_state(struct ceph_mds_session *s);
+void inc_session_sequence(struct ceph_mds_session *s);
+
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- atomic_inc(&s->s_ref);
- return s;
-}
+extern const char *ceph_session_state_name(int s);
+extern struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
-extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req);
+extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -363,26 +597,74 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
-extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
+extern void send_flush_mdlog(struct ceph_mds_session *s);
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state);
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
+extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
+ struct ceph_cap *cap);
+extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
+extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
+extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, int mds, void *),
+ void *arg);
+extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
+ int mask);
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap);
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
+{
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
+}
+
+extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+ struct dentry *dentry, struct ceph_path_info *path_info,
+ int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
- struct inode *inode,
struct dentry *dentry, char action,
u32 seq);
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
+
+extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+}
-extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
+extern bool enable_unsafe_idmap;
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 132b64eeecd4..2c7b151a7c95 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/bug.h>
@@ -6,64 +7,139 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
+#include "mdsmap.h"
+#include "mds_client.h"
#include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
{
int n = 0;
- int i;
-
- /* special case for one mds */
- if (1 == m->m_max_mds && m->m_info[0].state > 0)
- return 0;
+ int i, j;
/* count */
- for (i = 0; i < m->m_max_mds; i++)
- if (m->m_info[i].state > 0)
+ for (i = 0; i < m->possible_max_rank; i++)
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
n++;
if (n == 0)
return -1;
/* pick */
- n = prandom_u32() % n;
- i = 0;
- for (i = 0; n > 0; i++, n--)
- while (m->m_info[i].state <= 0)
- i++;
+ n = get_random_u32_below(n);
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
+ j++;
+ if (j > n)
+ break;
+ }
return i;
}
/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int mds;
+
+ mds = __mdsmap_get_random_mds(m, false);
+ if (mds == m->possible_max_rank || mds == -1)
+ mds = __mdsmap_get_random_mds(m, true);
+
+ return mds == m->possible_max_rank ? -1 : mds;
+}
+
+#define __decode_and_drop_type(p, end, type, bad) \
+ do { \
+ if (*p + sizeof(type) > end) \
+ goto bad; \
+ *p += sizeof(type); \
+ } while (0)
+
+#define __decode_and_drop_set(p, end, type, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = sizeof(type) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+#define __decode_and_drop_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = (sizeof(ktype) + sizeof(vtype)) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+
+static int __decode_and_drop_compat_set(void **p, void* end)
+{
+ int i;
+ /* compat, ro_compat, incompat*/
+ for (i = 0; i < 3; i++) {
+ u32 n;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ /* mask */
+ *p += sizeof(u64);
+ /* names (map<u64, string>) */
+ n = ceph_decode_32(p);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
+ bad);
+ *p += sizeof(u64);
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ }
+ return 0;
+bad:
+ return -1;
+}
+
+/*
* Decode an MDS map
*
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
- int err = -EINVAL;
- u16 version;
+ int err;
+ u8 mdsmap_v;
+ u16 mdsmap_ev;
+ u32 target;
m = kzalloc(sizeof(*m), GFP_NOFS);
- if (m == NULL)
+ if (!m)
return ERR_PTR(-ENOMEM);
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 3) {
- pr_warning("got mdsmap version %d > 3, failing", version);
- goto bad;
+ ceph_decode_need(p, end, 1 + 1, bad);
+ mdsmap_v = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
+ if (mdsmap_v >= 4) {
+ u32 mdsmap_len;
+ ceph_decode_32_safe(p, end, mdsmap_len, bad);
+ if (end < *p + mdsmap_len)
+ goto bad;
+ end = *p + mdsmap_len;
}
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -76,46 +152,77 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
- if (m->m_info == NULL)
- goto badmem;
+ /*
+ * pick out the active nodes as the m_num_active_mds, the
+ * m_num_active_mds maybe larger than m_max_mds when decreasing
+ * the max_mds in cluster side, in other case it should less
+ * than or equal to m_max_mds.
+ */
+ m->m_num_active_mds = n = ceph_decode_32(p);
+
+ /*
+ * the possible max rank, it maybe larger than the m_num_active_mds,
+ * for example if the mds_max == 2 in the cluster, when the MDS(0)
+ * was laggy and being replaced by a new MDS, we will temporarily
+ * receive a new mds map with n_num_mds == 1 and the active MDS(1),
+ * and the mds rank >= m_num_active_mds.
+ */
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
+ if (!m->m_info)
+ goto nomem;
/* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
s32 mds, inc, state;
- u64 state_seq;
- u8 infoversion;
+ u8 info_v;
+ void *info_end = NULL;
struct ceph_entity_addr addr;
u32 num_export_targets;
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
+ bool laggy;
- ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p);
- infoversion = ceph_decode_8(p);
+ info_v= ceph_decode_8(p);
+ if (info_v >= 4) {
+ u32 info_len;
+ ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+ *p += sizeof(u8); /* info_cv */
+ info_len = ceph_decode_32(p);
+ info_end = *p + info_len;
+ if (info_end > end)
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
*p += sizeof(u64);
namelen = ceph_decode_32(p); /* skip mds name */
*p += namelen;
- ceph_decode_need(p, end,
- 4*sizeof(u32) + sizeof(u64) +
- sizeof(addr) + sizeof(struct ceph_timespec),
- bad);
- mds = ceph_decode_32(p);
- inc = ceph_decode_32(p);
- state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
- ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+ ceph_decode_32_safe(p, end, mds, bad);
+ ceph_decode_32_safe(p, end, inc, bad);
+ ceph_decode_32_safe(p, end, state, bad);
+ *p += sizeof(u64); /* state_seq */
+ if (info_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ err = ceph_decode_entity_addr(p, end, &addr);
+ if (err)
+ goto corrupt;
+
+ ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
+ bad);
+ laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
- if (infoversion >= 2) {
+ if (info_v >= 2) {
ceph_decode_32_safe(p, end, num_export_targets, bad);
pexport_targets = *p;
*p += num_export_targets * sizeof(u32);
@@ -123,29 +230,42 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
num_export_targets = 0;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr.in_addr),
- ceph_mds_state_name(state));
+ if (info_end && *p != info_end) {
+ if (*p > info_end)
+ goto bad;
+ *p = info_end;
+ }
+
+ doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
+ mds, inc, ceph_pr_addr(&addr),
+ ceph_mds_state_name(state), laggy ? "(laggy)" : "");
- if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ if (mds < 0 || mds >= m->possible_max_rank) {
+ pr_warn_client(cl, "got incorrect mds(%d)\n", mds);
continue;
+ }
+
+ if (state <= 0) {
+ doutc(cl, "got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
+ continue;
+ }
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
info->addr = addr;
- info->laggy = (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
+ info->laggy = laggy;
info->num_export_targets = num_export_targets;
if (num_export_targets) {
info->export_targets = kcalloc(num_export_targets,
sizeof(u32), GFP_NOFS);
- if (info->export_targets == NULL)
- goto badmem;
- for (j = 0; j < num_export_targets; j++)
- info->export_targets[j] =
- ceph_decode_32(&pexport_targets);
+ if (!info->export_targets)
+ goto nomem;
+ for (j = 0; j < num_export_targets; j++) {
+ target = ceph_decode_32(&pexport_targets);
+ info->export_targets[j] = target;
+ }
} else {
info->export_targets = NULL;
}
@@ -156,34 +276,175 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_num_data_pg_pools = n;
m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools)
- goto badmem;
+ goto nomem;
ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++)
m->m_data_pg_pools[i] = ceph_decode_64(p);
m->m_cas_pg_pool = ceph_decode_64(p);
+ m->m_enabled = m->m_epoch > 1;
- /* ok, we don't care about the rest. */
- dout("mdsmap_decode success epoch %u\n", m->m_epoch);
- return m;
+ mdsmap_ev = 1;
+ if (mdsmap_v >= 2) {
+ ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext);
+ }
+ if (mdsmap_ev >= 3) {
+ if (__decode_and_drop_compat_set(p, end) < 0)
+ goto bad_ext;
+ }
+ /* metadata_pool */
+ if (mdsmap_ev < 5) {
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ } else {
+ __decode_and_drop_type(p, end, u64, bad_ext);
+ }
+
+ /* created + modified + tableserver */
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext);
+ __decode_and_drop_type(p, end, u32, bad_ext);
+
+ /* in */
+ {
+ int num_laggy = 0;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ ceph_decode_need(p, end, sizeof(u32) * n, bad_ext);
+
+ for (i = 0; i < n; i++) {
+ s32 mds = ceph_decode_32(p);
+ if (mds >= 0 && mds < m->possible_max_rank) {
+ if (m->m_info[mds].laggy)
+ num_laggy++;
+ }
+ }
+ m->m_num_laggy = num_laggy;
+
+ if (n > m->possible_max_rank) {
+ void *new_m_info = krealloc(m->m_info,
+ n * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ }
+ m->possible_max_rank = n;
+ }
+
+ /* inc */
+ __decode_and_drop_map(p, end, u32, u32, bad_ext);
+ /* up */
+ __decode_and_drop_map(p, end, u32, u64, bad_ext);
+ /* failed */
+ __decode_and_drop_set(p, end, u32, bad_ext);
+ /* stopped */
+ __decode_and_drop_set(p, end, u32, bad_ext);
-badmem:
+ if (mdsmap_ev >= 4) {
+ /* last_failure_osd_epoch */
+ __decode_and_drop_type(p, end, u32, bad_ext);
+ }
+ if (mdsmap_ev >= 6) {
+ /* ever_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ /* explicitly_allowed_snaps */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 7) {
+ /* inline_data_enabled */
+ __decode_and_drop_type(p, end, u8, bad_ext);
+ }
+ if (mdsmap_ev >= 8) {
+ u32 fsname_len;
+ /* enabled */
+ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
+ /* fs_name */
+ ceph_decode_32_safe(p, end, fsname_len, bad_ext);
+
+ /* validate fsname against mds_namespace */
+ if (!namespace_equals(mdsc->fsc->mount_options, *p,
+ fsname_len)) {
+ pr_warn_client(cl, "fsname %*pE doesn't match mds_namespace %s\n",
+ (int)fsname_len, (char *)*p,
+ mdsc->fsc->mount_options->mds_namespace);
+ goto bad;
+ }
+ /* skip fsname after validation */
+ ceph_decode_skip_n(p, end, fsname_len, bad);
+ }
+ /* damaged */
+ if (mdsmap_ev >= 9) {
+ size_t need;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ need = sizeof(u32) * n;
+ ceph_decode_need(p, end, need, bad_ext);
+ *p += need;
+ m->m_damaged = n > 0;
+ } else {
+ m->m_damaged = false;
+ }
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ /* bal_rank_mask */
+ ceph_decode_skip_string(p, end, bad_ext);
+ }
+ if (mdsmap_ev >= 18) {
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ }
+bad_ext:
+ doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
+ *p = end;
+ doutc(cl, "success epoch %u\n", m->m_epoch);
+ return m;
+nomem:
err = -ENOMEM;
-bad:
- pr_err("corrupt mdsmap\n");
+ goto out_err;
+corrupt:
+ pr_err_client(cl, "corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
+out_err:
ceph_mdsmap_destroy(m);
return ERR_PTR(err);
+bad:
+ err = -EINVAL;
+ goto corrupt;
}
void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_max_mds; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
kfree(m->m_data_pg_pools);
kfree(m);
}
+
+bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
+{
+ int i, nr_active = 0;
+ if (!m->m_enabled)
+ return false;
+ if (m->m_damaged)
+ return false;
+ if (m->m_num_laggy == m->m_num_active_mds)
+ return false;
+ for (i = 0; i < m->possible_max_rank; i++) {
+ if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
+ nr_active++;
+ }
+ return nr_active > 0;
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..1f2171dd01bf
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ /*
+ * maximum size for xattrs blob.
+ * Zeroed by default to force the usage of the (sync) SETXATTR Op.
+ */
+ u64 m_max_xattr_size;
+ u32 m_max_mds; /* expected up:active mds number */
+ u32 m_num_active_mds; /* actual up:active mds number */
+ u32 possible_max_rank; /* possible max rank index */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+
+ bool m_enabled;
+ bool m_damaged;
+ int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->possible_max_rank)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->possible_max_rank)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->possible_max_rank)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
new file mode 100644
index 000000000000..871c1090e520
--- /dev/null
+++ b/fs/ceph/metric.c
@@ -0,0 +1,362 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/percpu_counter.h>
+#include <linux/math64.h>
+
+#include "metric.h"
+#include "mds_client.h"
+
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ struct ceph_metric_head *head;
+ struct ceph_metric_cap *cap;
+ struct ceph_metric_read_latency *read;
+ struct ceph_metric_write_latency *write;
+ struct ceph_metric_metadata_latency *meta;
+ struct ceph_metric_dlease *dlease;
+ struct ceph_opened_files *files;
+ struct ceph_pinned_icaps *icaps;
+ struct ceph_opened_inodes *inodes;
+ struct ceph_read_io_size *rsize;
+ struct ceph_write_io_size *wsize;
+ struct ceph_client_metric *m = &mdsc->metric;
+ u64 nr_caps = atomic64_read(&m->total_caps);
+ u32 header_len = sizeof(struct ceph_metric_header);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_msg *msg;
+ s64 sum;
+ s32 items = 0;
+ s32 len;
+
+ /* Do not send the metrics until the MDS rank is ready */
+ mutex_lock(&mdsc->mutex);
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) {
+ mutex_unlock(&mdsc->mutex);
+ return false;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+ + sizeof(*meta) + sizeof(*dlease) + sizeof(*files)
+ + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize)
+ + sizeof(*wsize);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+ if (!msg) {
+ pr_err_client(cl, "to mds%d, failed to allocate message\n",
+ s->s_mds);
+ return false;
+ }
+
+ head = msg->front.iov_base;
+
+ /* encode the cap metric */
+ cap = (struct ceph_metric_cap *)(head + 1);
+ cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->header.ver = 1;
+ cap->header.compat = 1;
+ cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len);
+ cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
+ cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
+ cap->total = cpu_to_le64(nr_caps);
+ items++;
+
+ /* encode the read latency metric */
+ read = (struct ceph_metric_read_latency *)(cap + 1);
+ read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->header.ver = 2;
+ read->header.compat = 1;
+ read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
+ sum = m->metric[METRIC_READ].latency_sum;
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
+ items++;
+
+ /* encode the write latency metric */
+ write = (struct ceph_metric_write_latency *)(read + 1);
+ write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->header.ver = 2;
+ write->header.compat = 1;
+ write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
+ sum = m->metric[METRIC_WRITE].latency_sum;
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
+ items++;
+
+ /* encode the metadata latency metric */
+ meta = (struct ceph_metric_metadata_latency *)(write + 1);
+ meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->header.ver = 2;
+ meta->header.compat = 1;
+ meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
+ sum = m->metric[METRIC_METADATA].latency_sum;
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
+ items++;
+
+ /* encode the dentry lease metric */
+ dlease = (struct ceph_metric_dlease *)(meta + 1);
+ dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+ dlease->header.ver = 1;
+ dlease->header.compat = 1;
+ dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len);
+ dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit));
+ dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis));
+ dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
+ items++;
+
+ sum = percpu_counter_sum(&m->total_inodes);
+
+ /* encode the opened files metric */
+ files = (struct ceph_opened_files *)(dlease + 1);
+ files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES);
+ files->header.ver = 1;
+ files->header.compat = 1;
+ files->header.data_len = cpu_to_le32(sizeof(*files) - header_len);
+ files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files));
+ files->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the pinned icaps metric */
+ icaps = (struct ceph_pinned_icaps *)(files + 1);
+ icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS);
+ icaps->header.ver = 1;
+ icaps->header.compat = 1;
+ icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len);
+ icaps->pinned_icaps = cpu_to_le64(nr_caps);
+ icaps->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the opened inodes metric */
+ inodes = (struct ceph_opened_inodes *)(icaps + 1);
+ inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES);
+ inodes->header.ver = 1;
+ inodes->header.compat = 1;
+ inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len);
+ inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes));
+ inodes->total = cpu_to_le64(sum);
+ items++;
+
+ /* encode the read io size metric */
+ rsize = (struct ceph_read_io_size *)(inodes + 1);
+ rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES);
+ rsize->header.ver = 1;
+ rsize->header.compat = 1;
+ rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len);
+ rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total);
+ rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum);
+ items++;
+
+ /* encode the write io size metric */
+ wsize = (struct ceph_write_io_size *)(rsize + 1);
+ wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES);
+ wsize->header.ver = 1;
+ wsize->header.compat = 1;
+ wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len);
+ wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total);
+ wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum);
+ items++;
+
+ put_unaligned_le32(items, &head->num);
+ msg->front.iov_len = len;
+ msg->hdr.version = cpu_to_le16(1);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_send(&s->s_con, msg);
+
+ return true;
+}
+
+
+static void metric_get_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *s;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+
+ /*
+ * Skip it if MDS doesn't support the metric collection,
+ * or the MDS will close the session's socket connection
+ * directly when it get this message.
+ */
+ if (check_session_state(s) &&
+ test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) {
+ mdsc->metric.session = s;
+ break;
+ }
+
+ ceph_put_mds_session(s);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+ struct ceph_client_metric *m =
+ container_of(work, struct ceph_client_metric, delayed_work.work);
+ struct ceph_mds_client *mdsc =
+ container_of(m, struct ceph_mds_client, metric);
+
+ if (mdsc->stopping || disable_send_metrics)
+ return;
+
+ if (!m->session || !check_session_state(m->session)) {
+ if (m->session) {
+ ceph_put_mds_session(m->session);
+ m->session = NULL;
+ }
+ metric_get_session(mdsc);
+ }
+ if (m->session) {
+ ceph_mdsc_send_metrics(mdsc, m->session);
+ metric_schedule_delayed(m);
+ }
+}
+
+int ceph_metric_init(struct ceph_client_metric *m)
+{
+ struct ceph_metric *metric;
+ int ret, i;
+
+ if (!m)
+ return -EINVAL;
+
+ atomic64_set(&m->total_dentries, 0);
+ ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL);
+ if (ret)
+ goto err_d_lease_mis;
+
+ atomic64_set(&m->total_caps, 0);
+ ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL);
+ if (ret)
+ goto err_i_caps_hit;
+
+ ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL);
+ if (ret)
+ goto err_i_caps_mis;
+
+ for (i = 0; i < METRIC_MAX; i++) {
+ metric = &m->metric[i];
+ spin_lock_init(&metric->lock);
+ metric->size_sum = 0;
+ metric->size_min = U64_MAX;
+ metric->size_max = 0;
+ metric->total = 0;
+ metric->latency_sum = 0;
+ metric->latency_avg = 0;
+ metric->latency_sq_sum = 0;
+ metric->latency_min = KTIME_MAX;
+ metric->latency_max = 0;
+ }
+
+ atomic64_set(&m->opened_files, 0);
+ ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_opened_inodes;
+ ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
+ if (ret)
+ goto err_total_inodes;
+
+ m->session = NULL;
+ INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
+ return 0;
+
+err_total_inodes:
+ percpu_counter_destroy(&m->opened_inodes);
+err_opened_inodes:
+ percpu_counter_destroy(&m->i_caps_mis);
+err_i_caps_mis:
+ percpu_counter_destroy(&m->i_caps_hit);
+err_i_caps_hit:
+ percpu_counter_destroy(&m->d_lease_mis);
+err_d_lease_mis:
+ percpu_counter_destroy(&m->d_lease_hit);
+
+ return ret;
+}
+
+void ceph_metric_destroy(struct ceph_client_metric *m)
+{
+ if (!m)
+ return;
+
+ cancel_delayed_work_sync(&m->delayed_work);
+
+ percpu_counter_destroy(&m->total_inodes);
+ percpu_counter_destroy(&m->opened_inodes);
+ percpu_counter_destroy(&m->i_caps_mis);
+ percpu_counter_destroy(&m->i_caps_hit);
+ percpu_counter_destroy(&m->d_lease_mis);
+ percpu_counter_destroy(&m->d_lease_hit);
+
+ ceph_put_mds_session(m->session);
+}
+
+#define METRIC_UPDATE_MIN_MAX(min, max, new) \
+{ \
+ if (unlikely(new < min)) \
+ min = new; \
+ if (unlikely(new > max)) \
+ max = new; \
+}
+
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
+{
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
+}
+
+void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ktime_t lat = ktime_sub(r_end, r_start);
+ ktime_t total;
+
+ if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT))
+ return;
+
+ spin_lock(&m->lock);
+ total = ++m->total;
+ m->size_sum += size;
+ METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
+ m->latency_sum += lat;
+ METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
+ spin_unlock(&m->lock);
+}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
new file mode 100644
index 000000000000..0d0c44bd3332
--- /dev/null
+++ b/fs/ceph/metric.h
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDS_METRIC_H
+#define _FS_CEPH_MDS_METRIC_H
+
+#include <linux/ceph/types.h>
+#include <linux/percpu_counter.h>
+#include <linux/ktime.h>
+
+extern bool disable_send_metrics;
+
+enum ceph_metric_type {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+ CLIENT_METRIC_TYPE_OPENED_FILES,
+ CLIENT_METRIC_TYPE_PINNED_ICAPS,
+ CLIENT_METRIC_TYPE_OPENED_INODES,
+ CLIENT_METRIC_TYPE_READ_IO_SIZES,
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+};
+
+/*
+ * This will always have the highest metric bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
+}
+
+struct ceph_metric_header {
+ __le32 type; /* ceph metric type */
+ __u8 ver;
+ __u8 compat;
+ __le32 data_len; /* length of sizeof(hit + mis + total) */
+} __packed;
+
+/* metric caps header */
+struct ceph_metric_cap {
+ struct ceph_metric_header header;
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+ struct ceph_metric_header header;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
+} __packed;
+
+/* metric dentry lease header */
+struct ceph_metric_dlease {
+ struct ceph_metric_header header;
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __packed;
+
+/* metric opened files header */
+struct ceph_opened_files {
+ struct ceph_metric_header header;
+ __le64 opened_files;
+ __le64 total;
+} __packed;
+
+/* metric pinned i_caps header */
+struct ceph_pinned_icaps {
+ struct ceph_metric_header header;
+ __le64 pinned_icaps;
+ __le64 total;
+} __packed;
+
+/* metric opened inodes header */
+struct ceph_opened_inodes {
+ struct ceph_metric_header header;
+ __le64 opened_inodes;
+ __le64 total;
+} __packed;
+
+/* metric read io size header */
+struct ceph_read_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
+/* metric write io size header */
+struct ceph_write_io_size {
+ struct ceph_metric_header header;
+ __le64 total_ops;
+ __le64 total_size;
+} __packed;
+
+struct ceph_metric_head {
+ __le32 num; /* the number of metrics that will be sent */
+} __packed;
+
+enum metric_type {
+ METRIC_READ,
+ METRIC_WRITE,
+ METRIC_METADATA,
+ METRIC_COPYFROM,
+ METRIC_MAX
+};
+
+struct ceph_metric {
+ spinlock_t lock;
+ u64 total;
+ u64 size_sum;
+ u64 size_min;
+ u64 size_max;
+ ktime_t latency_sum;
+ ktime_t latency_avg;
+ ktime_t latency_sq_sum;
+ ktime_t latency_min;
+ ktime_t latency_max;
+};
+
+/* This is the global metrics */
+struct ceph_client_metric {
+ atomic64_t total_dentries;
+ struct percpu_counter d_lease_hit;
+ struct percpu_counter d_lease_mis;
+
+ atomic64_t total_caps;
+ struct percpu_counter i_caps_hit;
+ struct percpu_counter i_caps_mis;
+
+ struct ceph_metric metric[METRIC_MAX];
+
+ /* The total number of directories and files that are opened */
+ atomic64_t opened_files;
+
+ /* The total number of inodes that have opened files or directories */
+ struct percpu_counter opened_inodes;
+ struct percpu_counter total_inodes;
+
+ struct ceph_mds_session *session;
+ struct delayed_work delayed_work; /* delayed work */
+};
+
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+ if (disable_send_metrics)
+ return;
+
+ /* per second */
+ schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ));
+}
+
+extern int ceph_metric_init(struct ceph_client_metric *m);
+extern void ceph_metric_destroy(struct ceph_client_metric *m);
+
+static inline void ceph_update_cap_hit(struct ceph_client_metric *m)
+{
+ percpu_counter_inc(&m->i_caps_hit);
+}
+
+static inline void ceph_update_cap_mis(struct ceph_client_metric *m)
+{
+ percpu_counter_inc(&m->i_caps_mis);
+}
+
+extern void ceph_update_metrics(struct ceph_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc);
+
+static inline void ceph_update_read_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_READ],
+ r_start, r_end, size, rc);
+}
+static inline void ceph_update_write_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_WRITE],
+ r_start, r_end, size, rc);
+}
+static inline void ceph_update_metadata_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_METADATA],
+ r_start, r_end, 0, rc);
+}
+static inline void ceph_update_copyfrom_metrics(struct ceph_client_metric *m,
+ ktime_t r_start, ktime_t r_end,
+ unsigned int size, int rc)
+{
+ ceph_update_metrics(&m->metric[METRIC_COPYFROM],
+ r_start, r_end, size, rc);
+}
+#endif /* _FS_CEPH_MDS_METRIC_H */
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
new file mode 100644
index 000000000000..d90eda19bcc4
--- /dev/null
+++ b/fs/ceph/quota.c
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * quota.c - CephFS quota
+ *
+ * Copyright (C) 2017-2018 SUSE
+ */
+
+#include <linux/statfs.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ if (inc)
+ atomic64_inc(&mdsc->quotarealms_count);
+ else
+ atomic64_dec(&mdsc->quotarealms_count);
+}
+
+static inline bool ceph_has_realms_with_quotas(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct inode *root = d_inode(sb->s_root);
+
+ if (atomic64_read(&mdsc->quotarealms_count) > 0)
+ return true;
+ /* if root is the real CephFS root, we don't have quota realms */
+ if (root && ceph_ino(root) == CEPH_INO_ROOT)
+ return false;
+ /* MDS stray dirs have no quota realms */
+ if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino))
+ return false;
+ /* otherwise, we can't know for sure */
+ return true;
+}
+
+void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->fsc->sb;
+ struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_vino vino;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
+
+ if (msg->front.iov_len < sizeof(*h)) {
+ pr_err_client(cl, "corrupt message mds%d len %d\n",
+ session->s_mds, (int)msg->front.iov_len);
+ ceph_msg_dump(msg);
+ goto out;
+ }
+
+ /* lookup inode */
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ pr_warn_client(cl, "failed to find inode %llx\n", vino.ino);
+ goto out;
+ }
+ ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_rbytes = le64_to_cpu(h->rbytes);
+ ci->i_rfiles = le64_to_cpu(h->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(h->rsubdirs);
+ __ceph_update_quota(ci, le64_to_cpu(h->max_bytes),
+ le64_to_cpu(h->max_files));
+ spin_unlock(&ci->i_ceph_lock);
+
+ iput(inode);
+out:
+ ceph_dec_mds_stopping_blocker(mdsc);
+}
+
+static struct ceph_quotarealm_inode *
+find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
+{
+ struct ceph_quotarealm_inode *qri = NULL;
+ struct rb_node **node, *parent = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ mutex_lock(&mdsc->quotarealms_inodes_mutex);
+ node = &(mdsc->quotarealms_inodes.rb_node);
+ while (*node) {
+ parent = *node;
+ qri = container_of(*node, struct ceph_quotarealm_inode, node);
+
+ if (ino < qri->ino)
+ node = &((*node)->rb_left);
+ else if (ino > qri->ino)
+ node = &((*node)->rb_right);
+ else
+ break;
+ }
+ if (!qri || (qri->ino != ino)) {
+ /* Not found, create a new one and insert it */
+ qri = kmalloc(sizeof(*qri), GFP_KERNEL);
+ if (qri) {
+ qri->ino = ino;
+ qri->inode = NULL;
+ qri->timeout = 0;
+ mutex_init(&qri->mutex);
+ rb_link_node(&qri->node, parent, node);
+ rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
+ } else
+ pr_warn_client(cl, "Failed to alloc quotarealms_inode\n");
+ }
+ mutex_unlock(&mdsc->quotarealms_inodes_mutex);
+
+ return qri;
+}
+
+/*
+ * This function will try to lookup a realm inode which isn't visible in the
+ * filesystem mountpoint. A list of these kind of inodes (not visible) is
+ * maintained in the mdsc and freed only when the filesystem is umounted.
+ *
+ * Note that these inodes are kept in this list even if the lookup fails, which
+ * allows to prevent useless lookup requests.
+ */
+static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
+ struct super_block *sb,
+ struct ceph_snap_realm *realm)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_quotarealm_inode *qri;
+ struct inode *in;
+
+ qri = find_quotarealm_inode(mdsc, realm->ino);
+ if (!qri)
+ return NULL;
+
+ mutex_lock(&qri->mutex);
+ if (qri->inode && ceph_is_any_caps(qri->inode)) {
+ /* A request has already returned the inode */
+ mutex_unlock(&qri->mutex);
+ return qri->inode;
+ }
+ /* Check if this inode lookup has failed recently */
+ if (qri->timeout &&
+ time_before_eq(jiffies, qri->timeout)) {
+ mutex_unlock(&qri->mutex);
+ return NULL;
+ }
+ if (qri->inode) {
+ /* get caps */
+ int ret = __ceph_do_getattr(qri->inode, NULL,
+ CEPH_STAT_CAP_INODE, true);
+ if (ret >= 0)
+ in = qri->inode;
+ else
+ in = ERR_PTR(ret);
+ } else {
+ in = ceph_lookup_inode(sb, realm->ino);
+ }
+
+ if (IS_ERR(in)) {
+ doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino,
+ PTR_ERR(in));
+ qri->timeout = jiffies + secs_to_jiffies(60); /* XXX */
+ } else {
+ qri->timeout = 0;
+ qri->inode = in;
+ }
+ mutex_unlock(&qri->mutex);
+
+ return in;
+}
+
+void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
+{
+ struct ceph_quotarealm_inode *qri;
+ struct rb_node *node;
+
+ /*
+ * It should now be safe to clean quotarealms_inode tree without holding
+ * mdsc->quotarealms_inodes_mutex...
+ */
+ mutex_lock(&mdsc->quotarealms_inodes_mutex);
+ while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) {
+ node = rb_first(&mdsc->quotarealms_inodes);
+ qri = rb_entry(node, struct ceph_quotarealm_inode, node);
+ rb_erase(node, &mdsc->quotarealms_inodes);
+ iput(qri->inode);
+ kfree(qri);
+ }
+ mutex_unlock(&mdsc->quotarealms_inodes_mutex);
+}
+
+/*
+ * This function walks through the snaprealm for an inode and set the
+ * realmp with the first snaprealm that has quotas set (max_files,
+ * max_bytes, or any, depending on the 'which_quota' argument). If the root is
+ * reached, set the realmp with the root ceph_snap_realm instead.
+ *
+ * Note that the caller is responsible for calling ceph_put_snap_realm() on the
+ * returned realm.
+ *
+ * Callers of this function need to hold mdsc->snap_rwsem. However, if there's
+ * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence
+ * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
+ * this function will return -EAGAIN; otherwise, the snaprealms walk-through
+ * will be restarted.
+ */
+static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
+ enum quota_get_realm which_quota,
+ struct ceph_snap_realm **realmp, bool retry)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_inode_info *ci = NULL;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ bool has_quota;
+
+ if (realmp)
+ *realmp = NULL;
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return 0;
+
+restart:
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
+ while (realm) {
+ bool has_inode;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ has_inode = realm->inode;
+ in = has_inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (has_inode && !in)
+ break;
+ if (!in) {
+ up_read(&mdsc->snap_rwsem);
+ in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+ down_read(&mdsc->snap_rwsem);
+ if (IS_ERR_OR_NULL(in))
+ break;
+ ceph_put_snap_realm(mdsc, realm);
+ if (!retry)
+ return -EAGAIN;
+ goto restart;
+ }
+
+ ci = ceph_inode(in);
+ has_quota = __ceph_has_quota(ci, which_quota);
+ iput(in);
+
+ next = realm->parent;
+ if (has_quota || !next) {
+ if (realmp)
+ *realmp = realm;
+ return 0;
+ }
+
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+
+ return 0;
+}
+
+bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
+ struct ceph_snap_realm *old_realm, *new_realm;
+ bool is_same;
+ int ret;
+
+restart:
+ /*
+ * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
+ * However, get_quota_realm may drop it temporarily. By setting the
+ * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
+ * dropped and we can then restart the whole operation.
+ */
+ down_read(&mdsc->snap_rwsem);
+ get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true);
+ ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false);
+ if (ret == -EAGAIN) {
+ up_read(&mdsc->snap_rwsem);
+ if (old_realm)
+ ceph_put_snap_realm(mdsc, old_realm);
+ goto restart;
+ }
+ is_same = (old_realm == new_realm);
+ up_read(&mdsc->snap_rwsem);
+
+ if (old_realm)
+ ceph_put_snap_realm(mdsc, old_realm);
+ if (new_realm)
+ ceph_put_snap_realm(mdsc, new_realm);
+
+ return is_same;
+}
+
+enum quota_check_op {
+ QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_files limit */
+ QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_files
+ limit is approaching */
+};
+
+/*
+ * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each
+ * realm, it will execute quota check operation defined by the 'op' parameter.
+ * The snaprealm walk is interrupted if the quota check detects that the quota
+ * is exceeded or if the root inode is reached.
+ */
+static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
+ loff_t delta)
+{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm, *next;
+ struct inode *in;
+ u64 max, rvalue;
+ bool exceeded = false;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return false;
+
+ down_read(&mdsc->snap_rwsem);
+restart:
+ realm = ceph_inode(inode)->i_snap_realm;
+ if (realm)
+ ceph_get_snap_realm(mdsc, realm);
+ else
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
+ while (realm) {
+ bool has_inode;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ has_inode = realm->inode;
+ in = has_inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (has_inode && !in)
+ break;
+ if (!in) {
+ up_read(&mdsc->snap_rwsem);
+ in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+ down_read(&mdsc->snap_rwsem);
+ if (IS_ERR_OR_NULL(in))
+ break;
+ ceph_put_snap_realm(mdsc, realm);
+ goto restart;
+ }
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (op == QUOTA_CHECK_MAX_FILES_OP) {
+ max = ci->i_max_files;
+ rvalue = ci->i_rfiles + ci->i_rsubdirs;
+ } else {
+ max = ci->i_max_bytes;
+ rvalue = ci->i_rbytes;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ switch (op) {
+ case QUOTA_CHECK_MAX_FILES_OP:
+ case QUOTA_CHECK_MAX_BYTES_OP:
+ exceeded = (max && (rvalue + delta > max));
+ break;
+ case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP:
+ if (max) {
+ if (rvalue >= max)
+ exceeded = true;
+ else {
+ /*
+ * when we're writing more that 1/16th
+ * of the available space
+ */
+ exceeded =
+ (((max - rvalue) >> 4) < delta);
+ }
+ }
+ break;
+ default:
+ /* Shouldn't happen */
+ pr_warn_client(cl, "Invalid quota check op (%d)\n", op);
+ exceeded = true; /* Just break the loop */
+ }
+ iput(in);
+
+ next = realm->parent;
+ if (exceeded || !next)
+ break;
+ ceph_get_snap_realm(mdsc, next);
+ ceph_put_snap_realm(mdsc, realm);
+ realm = next;
+ }
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
+ up_read(&mdsc->snap_rwsem);
+
+ return exceeded;
+}
+
+/*
+ * ceph_quota_is_max_files_exceeded - check if we can create a new file
+ * @inode: directory where a new file is being created
+ *
+ * This functions returns true is max_files quota allows a new file to be
+ * created. It is necessary to walk through the snaprealm hierarchy (until the
+ * FS root) to check all realms with quotas set.
+ */
+bool ceph_quota_is_max_files_exceeded(struct inode *inode)
+{
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ WARN_ON(!S_ISDIR(inode->i_mode));
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1);
+}
+
+/*
+ * ceph_quota_is_max_bytes_exceeded - check if we can write to a file
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This functions returns true is max_bytes quota allows a file size to reach
+ * @newsize; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize)
+{
+ loff_t size = i_size_read(inode);
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size));
+}
+
+/*
+ * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes
+ * @inode: inode being written
+ * @newsize: new size if write succeeds
+ *
+ * This function returns true if the new file size @newsize will be consuming
+ * more than 1/16th of the available quota space; it returns false otherwise.
+ */
+bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize)
+{
+ loff_t size = ceph_inode(inode)->i_reported_size;
+
+ if (!ceph_has_realms_with_quotas(inode))
+ return false;
+
+ /* return immediately if we're decreasing file size */
+ if (newsize <= size)
+ return false;
+
+ return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP,
+ (newsize - size));
+}
+
+/*
+ * ceph_quota_update_statfs - if root has quota update statfs with quota status
+ * @fsc: filesystem client instance
+ * @buf: statfs to update
+ *
+ * If the mounted filesystem root has max_bytes quota set, update the filesystem
+ * statistics with the quota status.
+ *
+ * This function returns true if the stats have been updated, false otherwise.
+ */
+bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_realm *realm;
+ struct inode *in;
+ u64 total = 0, used, free;
+ bool is_updated = false;
+
+ down_read(&mdsc->snap_rwsem);
+ get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
+ &realm, true);
+ up_read(&mdsc->snap_rwsem);
+ if (!realm)
+ return false;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ in = realm->inode ? igrab(realm->inode) : NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (in) {
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_max_bytes) {
+ total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_BLOCK_SHIFT;
+ /* For quota size less than 4MB, use 4KB block size */
+ if (!total) {
+ total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT;
+ used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
+ /* It is possible for a quota to be exceeded.
+ * Report 'zero' in that case
+ */
+ free = total > used ? total - used : 0;
+ /* For quota size less than 4KB, report the
+ * total=used=4KB,free=0 when quota is full
+ * and total=free=4KB, used=0 otherwise */
+ if (!total) {
+ total = 1;
+ free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0;
+ buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ if (total) {
+ buf->f_blocks = total;
+ buf->f_bfree = free;
+ buf->f_bavail = free;
+ is_updated = true;
+ }
+ iput(in);
+ }
+ ceph_put_snap_realm(mdsc, realm);
+
+ return is_updated;
+}
+
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f01645a27752..c65f2b202b2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,13 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
#include <linux/sort.h>
#include <linux/slab.h>
-
+#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
-
#include <linux/ceph/decode.h>
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
/*
* Snapshots in ceph are driven in large part by cooperation from the
* client. In contrast to local file systems or file servers that
@@ -57,26 +61,26 @@
/*
* increase ref count for the realm
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("get_realm %p %d -> %d\n", realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
/*
- * since we _only_ increment realm refs or empty the empty
- * list with snap_rwsem held, adjusting the empty list here is
- * safe. we do need to protect against concurrent empty list
- * additions, however.
+ * The 0->1 and 1->0 transitions must take the snap_empty_lock
+ * atomically with the refcount change. Go ahead and bump the
+ * nref here, unless it's 0, in which case we take the spinlock
+ * and then do the increment and remove it from the list.
*/
- if (atomic_read(&realm->nref) == 0) {
- spin_lock(&mdsc->snap_empty_lock);
- list_del_init(&realm->empty_item);
- spin_unlock(&mdsc->snap_empty_lock);
- }
+ if (atomic_inc_not_zero(&realm->nref))
+ return;
- atomic_inc(&realm->nref);
+ spin_lock(&mdsc->snap_empty_lock);
+ if (atomic_inc_return(&realm->nref) == 1)
+ list_del_init(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -112,34 +116,46 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
realm = kzalloc(sizeof(*realm), GFP_NOFS);
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ /* Do not release the global dummy snaprealm until unmouting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ mdsc->num_snap_realms++;
+
+ doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm);
return realm;
}
/*
* lookup the realm rooted at @ino.
*
- * caller must hold snap_rwsem for write.
+ * caller must hold snap_rwsem.
*/
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino)
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
while (n) {
r = rb_entry(n, struct ceph_snap_realm, node);
if (ino < r->ino)
@@ -147,13 +163,23 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ doutc(cl, "%llx %p\n", r->ino, r);
return r;
}
}
return NULL;
}
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *r;
+ r = __lookup_snap_realm(mdsc, ino);
+ if (r)
+ ceph_get_snap_realm(mdsc, r);
+ return r;
+}
+
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
@@ -163,9 +189,13 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ struct ceph_client *cl = mdsc->fsc->client;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ doutc(cl, "%p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
+ mdsc->num_snap_realms--;
if (realm->parent) {
list_del_init(&realm->child_item);
@@ -184,28 +214,30 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ /*
+ * We do not require the snap_empty_lock here, as any caller that
+ * increments the value must hold the snap_rwsem.
+ */
if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/*
- * caller needn't hold any locks
+ * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/
void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
- dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
- atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
- if (!atomic_dec_and_test(&realm->nref))
+ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
return;
if (down_write_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem);
} else {
- spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock);
}
@@ -222,6 +254,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
struct ceph_snap_realm *realm;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
spin_lock(&mdsc->snap_empty_lock);
while (!list_empty(&mdsc->snap_empty)) {
realm = list_first_entry(&mdsc->snap_empty,
@@ -234,9 +268,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -253,8 +292,11 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm,
u64 parentino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent;
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
if (realm->parent_ino == parentino)
return 0;
@@ -264,16 +306,14 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm,
+ realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
}
realm->parent_ino = parentino;
realm->parent = parent;
- ceph_get_snap_realm(mdsc, parent);
list_add(&realm->child_item, &parent->children);
return 1;
}
@@ -288,11 +328,16 @@ static int cmpu64_rev(const void *a, const void *b)
return 0;
}
+
/*
* build the snap context for a given realm.
*/
-static int build_snap_context(struct ceph_snap_realm *realm)
+static int build_snap_context(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
@@ -305,9 +350,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -320,11 +365,10 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
- realm->cached_context->seq,
- (unsigned int) realm->cached_context->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ (unsigned int)realm->cached_context->num_snaps);
return 0;
}
@@ -361,13 +405,13 @@ static int build_snap_context(struct ceph_snap_realm *realm)
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm,
+ snapc, snapc->seq, (unsigned int) snapc->num_snaps);
- if (realm->cached_context)
- ceph_put_snap_context(realm->cached_context);
+ ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
+ /* queue realm for cap_snap creation */
+ list_add_tail(&realm->dirty_item, dirty_realms);
return 0;
fail:
@@ -379,23 +423,63 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err);
return err;
}
/*
* rebuild snap context for the given realm and all of its children.
*/
-static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
+ struct ceph_client *cl = mdsc->fsc->client;
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last building failed dues to memory
+ * issue, just empty the realm_queue and return
+ * to avoid infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm);
+ last = build_snap_context(mdsc, _realm, &realm_queue,
+ dirty_realms);
+ doutc(cl, "%llx %p, %s\n", realm->ino, realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child);
+ /* is any child in the list ? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
+
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
+
+ /* last == 1 means need to build parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
@@ -420,6 +504,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
return 0;
}
+static bool has_new_snaps(struct ceph_snap_context *o,
+ struct ceph_snap_context *n)
+{
+ if (n->num_snaps == 0)
+ return false;
+ /* snaps are in descending order */
+ return n->snaps[0] > o->seq;
+}
/*
* When a snapshot is applied, the size/mtime inode metadata is queued
@@ -435,22 +527,23 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
+ struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
+ old_snapc = ci->i_head_snapc;
+ new_snapc = ci->i_snap_realm->cached_context;
+
/*
* If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish
@@ -464,72 +557,98 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
- kfree(capsnap);
- } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
- CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
- struct ceph_snap_context *snapc = ci->i_head_snapc;
-
- /*
- * if we are a sync write, we may need to go to the snaprealm
- * to get the current snapc.
- */
- if (!snapc)
- snapc = ci->i_snap_realm->cached_context;
+ doutc(cl, "%p %llx.%llx already pending\n", inode,
+ ceph_vinop(inode));
+ goto update_snapc;
+ }
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+ doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode,
+ ceph_vinop(inode));
+ goto update_snapc;
+ }
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
- inode, capsnap, snapc, ceph_cap_string(dirty));
- ihold(inode);
+ BUG_ON(!old_snapc);
- atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
- INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
+ /*
+ * There is no need to send FLUSHSNAP message to MDS if there is
+ * no new snapshot. But when there is dirty pages or on-going
+ * writes, we still need to create cap_snap. cap_snap is needed
+ * by the write path and page writeback path.
+ *
+ * also see ceph_try_drop_cap_snap()
+ */
+ if (has_new_snaps(old_snapc, new_snapc)) {
+ if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+ capsnap->need_flush = true;
+ } else {
+ if (!(used & CEPH_CAP_FILE_WR) &&
+ ci->i_wrbuffer_ref_head == 0) {
+ doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n",
+ inode, ceph_vinop(inode));
+ goto update_snapc;
+ }
+ }
- capsnap->follows = snapc->seq;
- capsnap->issued = __ceph_caps_issued(ci, NULL);
- capsnap->dirty = dirty;
+ doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
+ ihold(inode);
- capsnap->mode = inode->i_mode;
- capsnap->uid = inode->i_uid;
- capsnap->gid = inode->i_gid;
+ capsnap->follows = old_snapc->seq;
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = dirty;
- if (dirty & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- capsnap->xattr_blob =
- ceph_buffer_get(ci->i_xattrs.blob);
- capsnap->xattr_version = ci->i_xattrs.version;
- } else {
- capsnap->xattr_blob = NULL;
- capsnap->xattr_version = 0;
- }
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
- /* dirty page count moved from _head to this cap_snap;
- all subsequent writes page dirties occur _after_ this
- snapshot. */
- capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
- ci->i_wrbuffer_ref_head = 0;
- capsnap->context = snapc;
- ci->i_head_snapc =
- ceph_get_snap_context(ci->i_snap_realm->cached_context);
- dout(" new snapc is %p\n", ci->i_head_snapc);
- list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
- if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
- capsnap, snapc, snapc->seq);
- capsnap->writing = 1;
- } else {
- /* note mtime, size NOW. */
- __ceph_finish_cap_snap(ci, capsnap);
- }
+ if (dirty & CEPH_CAP_XATTR_EXCL) {
+ old_blob = __ceph_build_xattrs_blob(ci);
+ capsnap->xattr_blob =
+ ceph_buffer_get(ci->i_xattrs.blob);
+ capsnap->xattr_version = ci->i_xattrs.version;
} else {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
- kfree(capsnap);
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_version = 0;
}
+ capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent writes page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;
+ capsnap->context = old_snapc;
+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+ if (used & CEPH_CAP_FILE_WR) {
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", inode, ceph_vinop(inode), capsnap,
+ old_snapc, old_snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ *pcapsnap = NULL;
+ old_snapc = NULL;
+
+update_snapc:
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
+ ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+ doutc(cl, " new snapc is %p\n", new_snapc);
+ }
spin_unlock(&ci->i_ceph_lock);
+
+ ceph_buffer_put(old_blob);
+ ceph_put_snap_context(old_snapc);
}
/*
@@ -543,30 +662,55 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
BUG_ON(capsnap->writing);
- capsnap->size = inode->i_size;
- capsnap->mtime = inode->i_mtime;
- capsnap->atime = inode->i_atime;
- capsnap->ctime = inode->i_ctime;
+ capsnap->size = i_size_read(inode);
+ capsnap->mtime = inode_get_mtime(inode);
+ capsnap->atime = inode_get_atime(inode);
+ capsnap->ctime = inode_get_ctime(inode);
+ capsnap->btime = ci->i_btime;
+ capsnap->change_attr = inode_peek_iversion_raw(inode);
capsnap->time_warp_seq = ci->i_time_warp_seq;
+ capsnap->truncate_size = ci->i_truncate_size;
+ capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu still has %d dirty pages\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
+ return 0;
+ }
+
+ /*
+ * Defer flushing the capsnap if the dirty buffer not flushed yet.
+ * And trigger to flush the buffer immediately.
+ */
+ if (ci->i_wrbuffer_ref) {
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu used WRBUFFER, delaying\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
+ ceph_queue_writeback(inode);
return 0;
}
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ if (list_empty(&ci->i_snap_flush_item)) {
+ ihold(inode);
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ }
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
@@ -575,40 +719,54 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* Queue cap_snaps for snap writeback for this realm and its children.
* Called under snap_rwsem, so realm topology won't change.
*/
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
- struct ceph_snap_realm *child;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ doutc(cl, "%p %llx inode\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
- list_for_each_entry(ci, &realm->inodes_with_caps,
- i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
+ iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to reduce very possible but unnecessary frequently memory
+ * allocate/free in this loop.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err_client(cl,
+ "ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
- if (lastinode)
- iput(lastinode);
-
- list_for_each_entry(child, &realm->children, child_item) {
- dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
- realm, realm->ino, child, child->ino);
- list_del_init(&child->dirty_item);
- list_add(&child->dirty_item, &realm->dirty_item);
- }
+ iput(lastinode);
- list_del_init(&realm->dirty_item);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ doutc(cl, "%p %llx done\n", realm, realm->ino);
}
/*
@@ -619,18 +777,28 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
* Caller must hold snap_rwsem for write.
*/
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
- void *p, void *e, bool deletion)
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm;
- int invalidate = 0;
+ struct ceph_snap_realm *first_realm = NULL;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ struct ceph_client *client = mdsc->fsc->client;
+ int rebuild_snapcs;
int err = -ENOMEM;
+ int ret;
LIST_HEAD(dirty_realms);
- dout("update_snap_trace deletion=%d\n", deletion);
+ lockdep_assert_held_write(&mdsc->snap_rwsem);
+
+ doutc(cl, "deletion=%d\n", deletion);
more:
+ realm = NULL;
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -654,11 +822,11 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
- realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino,
+ realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
realm->created = le64_to_cpu(ri->created);
@@ -676,29 +844,42 @@ more:
if (err < 0)
goto fail;
- /* queue realm for cap_snap creation */
- list_add(&realm->dirty_item, &dirty_realms);
+ if (realm->seq > mdsc->last_snap_seq)
+ mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
- realm->ino, realm, realm->seq);
- invalidate = 1;
+ doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm,
+ realm->seq);
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm,
+ realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppest parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
+
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
+
+ if (!first_realm)
+ first_realm = realm;
+ else
+ ceph_put_snap_realm(mdsc, realm);
if (p < e)
goto more;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate)
- rebuild_snap_realms(realm);
-
/*
* queue cap snaps _after_ we've built the new snap contexts,
* so that i_head_snapc can be set appropriately.
@@ -706,16 +887,48 @@ more:
while (!list_empty(&dirty_realms)) {
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
- queue_realm_cap_snaps(realm);
+ list_del_init(&realm->dirty_item);
+ queue_realm_cap_snaps(mdsc, realm);
}
+ if (realm_ret)
+ *realm_ret = first_realm;
+ else
+ ceph_put_snap_realm(mdsc, first_realm);
+
__cleanup_empty_realms(mdsc);
return 0;
bad:
- err = -EINVAL;
+ err = -EIO;
fail:
- pr_err("update_snap_trace error %d\n", err);
+ if (realm && !IS_ERR(realm))
+ ceph_put_snap_realm(mdsc, realm);
+ if (first_realm)
+ ceph_put_snap_realm(mdsc, first_realm);
+ pr_err_client(cl, "error %d\n", err);
+
+ /*
+ * When receiving a corrupted snap trace we don't know what
+ * exactly has happened in MDS side. And we shouldn't continue
+ * writing to OSD, which may corrupt the snapshot contents.
+ *
+ * Just try to blocklist this kclient and then this kclient
+ * must be remounted to continue after the corrupted metadata
+ * fixed in the MDS side.
+ */
+ WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
+ ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
+ if (ret)
+ pr_err_client(cl, "failed to blocklist %s: %d\n",
+ ceph_pr_addr(&client->msgr.inst.addr), ret);
+
+ WARN(1, "[client.%lld] %s %s%sdo remount to continue%s",
+ client->monc.auth->global_id, __func__,
+ ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+ ret ? "" : " was blocklisted, ",
+ err == -EIO ? " after corrupted snaptrace is fixed" : "");
+
return err;
}
@@ -728,33 +941,66 @@ fail:
*/
static void flush_snaps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_flush_snaps(ci, &session);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
spin_unlock(&mdsc->snap_flush_lock);
- if (session) {
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- }
- dout("flush_snaps done\n");
+ ceph_put_mds_session(session);
+ doutc(cl, "done\n");
}
+/**
+ * ceph_change_snap_realm - change the snap_realm for an inode
+ * @inode: inode to move to new snap realm
+ * @realm: new realm to move inode into (may be NULL)
+ *
+ * Detach an inode from its old snaprealm (if any) and attach it to
+ * the new snaprealm (if any). The old snap realm reference held by
+ * the inode is put. If realm is non-NULL, then the caller's reference
+ * to it is taken over by the inode.
+ */
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
+ if (oldrealm) {
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ if (oldrealm->ino == ci->i_vino.ino)
+ oldrealm->inode = NULL;
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, oldrealm);
+ }
+
+ ci->i_snap_realm = realm;
+
+ if (realm) {
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps);
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ }
+}
/*
* Handle a snap notification from the MDS.
@@ -771,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
@@ -784,6 +1031,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
__le64 *split_inos = NULL, *split_realms = NULL;
int i;
int locked_rwsem = 0;
+ bool close_sessions = false;
+
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
/* decode */
if (msg->front.iov_len < sizeof(*h))
@@ -797,12 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
-
- mutex_lock(&session->s_mutex);
- session->s_seq++;
- mutex_unlock(&session->s_mutex);
+ doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
@@ -832,9 +1079,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
if (IS_ERR(realm))
goto out;
}
- ceph_get_snap_realm(mdsc, realm);
- dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
struct ceph_vino vino = {
.ino = le64_to_cpu(split_inos[i]),
@@ -842,7 +1088,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
};
struct inode *inode = ceph_find_inode(sb, vino);
struct ceph_inode_info *ci;
- struct ceph_snap_realm *oldrealm;
if (!inode)
continue;
@@ -860,28 +1105,17 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
- ci->i_snap_realm);
+ doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
+ ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
- /*
- * Move the inode to the new realm
- */
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- list_add(&ci->i_snap_realm_item,
- &realm->inodes_with_caps);
- oldrealm = ci->i_snap_realm;
- ci->i_snap_realm = realm;
- spin_unlock(&realm->inodes_with_caps_lock);
- spin_unlock(&ci->i_ceph_lock);
+ doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
- ceph_put_snap_realm(mdsc, oldrealm);
-
+ ceph_change_snap_realm(inode, realm);
+ spin_unlock(&ci->i_ceph_lock);
iput(inode);
continue;
@@ -893,20 +1127,37 @@ skip_inode:
/* we may have taken some of the old realm's children. */
for (i = 0; i < num_split_realms; i++) {
struct ceph_snap_realm *child =
- ceph_lookup_snap_realm(mdsc,
+ __lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i]));
if (!child)
continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
+ } else {
+ /*
+ * In the non-split case both 'num_split_inos' and
+ * 'num_split_realms' should be 0, making this a no-op.
+ * However the MDS happens to populate 'split_realms' list
+ * in one of the UPDATE op cases by mistake.
+ *
+ * Skip both lists just in case to ensure that 'p' is
+ * positioned at the start of realm info, as expected by
+ * ceph_update_snap_trace().
+ */
+ p += sizeof(u64) * num_split_inos;
+ p += sizeof(u64) * num_split_realms;
}
/*
* update using the provided snap trace. if we are deleting a
* snap, we can avoid queueing cap_snaps.
*/
- ceph_update_snap_trace(mdsc, p, e,
- op == CEPH_SNAP_OP_DESTROY);
+ if (ceph_update_snap_trace(mdsc, p, e,
+ op == CEPH_SNAP_OP_DESTROY,
+ NULL)) {
+ close_sessions = true;
+ goto bad;
+ }
if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
@@ -917,16 +1168,176 @@ skip_inode:
up_write(&mdsc->snap_rwsem);
flush_snaps(mdsc);
+ ceph_dec_mds_stopping_blocker(mdsc);
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err_client(cl, "corrupt snap message from mds%d\n", mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
up_write(&mdsc->snap_rwsem);
+
+ ceph_dec_mds_stopping_blocker(mdsc);
+
+ if (close_sessions)
+ ceph_mdsc_close_sessions(mdsc);
return;
}
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_snapid_map *sm, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&mdsc->snapid_map_lock);
+ p = &mdsc->snapid_map_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap) {
+ p = &(*p)->rb_left;
+ } else if (snap < exist->snap) {
+ p = &(*p)->rb_right;
+ } else {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ break;
+ }
+ exist = NULL;
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
+ return exist;
+ }
+
+ sm = kmalloc(sizeof(*sm), GFP_NOFS);
+ if (!sm)
+ return NULL;
+
+ ret = get_anon_bdev(&sm->dev);
+ if (ret < 0) {
+ kfree(sm);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&sm->lru);
+ atomic_set(&sm->ref, 1);
+ sm->snap = snap;
+
+ exist = NULL;
+ parent = NULL;
+ p = &mdsc->snapid_map_tree.rb_node;
+ spin_lock(&mdsc->snapid_map_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_snapid_map, node);
+ if (snap > exist->snap)
+ p = &(*p)->rb_left;
+ else if (snap < exist->snap)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist) {
+ if (atomic_inc_return(&exist->ref) == 1)
+ list_del_init(&exist->lru);
+ } else {
+ rb_link_node(&sm->node, parent, p);
+ rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+ if (exist) {
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
+ return exist;
+ }
+
+ doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm)
+{
+ if (!sm)
+ return;
+ if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+ if (!RB_EMPTY_NODE(&sm->node)) {
+ sm->last_used = jiffies;
+ list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+ spin_unlock(&mdsc->snapid_map_lock);
+ } else {
+ /* already cleaned up by
+ * ceph_cleanup_snapid_map() */
+ spin_unlock(&mdsc->snapid_map_lock);
+ kfree(sm);
+ }
+ }
+}
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_snapid_map *sm;
+ unsigned long now;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ now = jiffies;
+
+ while (!list_empty(&mdsc->snapid_map_lru)) {
+ sm = list_first_entry(&mdsc->snapid_map_lru,
+ struct ceph_snapid_map, lru);
+ if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+ break;
+
+ rb_erase(&sm->node, &mdsc->snapid_map_tree);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ free_anon_bdev(sm->dev);
+ kfree(sm);
+ }
+}
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_snapid_map *sm;
+ struct rb_node *p;
+ LIST_HEAD(to_free);
+
+ spin_lock(&mdsc->snapid_map_lock);
+ while ((p = rb_first(&mdsc->snapid_map_tree))) {
+ sm = rb_entry(p, struct ceph_snapid_map, node);
+ rb_erase(p, &mdsc->snapid_map_tree);
+ RB_CLEAR_NODE(p);
+ list_move(&sm->lru, &to_free);
+ }
+ spin_unlock(&mdsc->snapid_map_lock);
+
+ while (!list_empty(&to_free)) {
+ sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+ list_del(&sm->lru);
+ free_anon_bdev(sm->dev);
+ if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+ pr_err_client(cl, "snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
+ }
+ kfree(sm);
+ }
+}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89fa4a940a0f..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Ceph fs string constants
*/
@@ -41,6 +42,11 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
case CEPH_SESSION_STALE: return "stale";
case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+ case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+ case CEPH_SESSION_FORCE_RO: return "force_ro";
+ case CEPH_SESSION_REJECT: return "reject";
+ case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
}
return "???";
}
@@ -52,7 +58,9 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
@@ -72,6 +80,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LSSNAP: return "lssnap";
case CEPH_MDS_OP_MKSNAP: return "mksnap";
case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6627b26a800c..f6bf24b5c683 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ceph/ceph_debug.h>
@@ -8,7 +9,8 @@
#include <linux/in6.h>
#include <linux/module.h>
#include <linux/mount.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -17,6 +19,8 @@
#include "super.h"
#include "mds_client.h"
+#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
@@ -24,6 +28,11 @@
#include <linux/ceph/auth.h>
#include <linux/ceph/debugfs.h>
+#include <uapi/linux/magic.h>
+
+static DEFINE_SPINLOCK(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
/*
* Ceph superblock operations
*
@@ -35,33 +44,30 @@
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
- dout("put_super\n");
+ doutc(fsc->client, "begin\n");
+ ceph_fscrypt_free_dummy_policy(fsc);
ceph_mdsc_close_sessions(fsc->mdsc);
-
- /*
- * ensure we release the bdi before put_anon_super releases
- * the device name.
- */
- if (s->s_bdi == &fsc->backing_dev_info) {
- bdi_unregister(&fsc->backing_dev_info);
- s->s_bdi = NULL;
- }
-
- return;
+ doutc(fsc->client, "done\n");
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
- struct ceph_monmap *monmap = fsc->client->monc.monmap;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry));
+ struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
- u64 fsid;
- int err;
+ int i, err;
+ u64 data_pool;
+
+ doutc(fsc->client, "begin\n");
+ if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
+ data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
+ } else {
+ data_pool = CEPH_NOPOOL;
+ }
- dout("statfs\n");
- err = ceph_monc_do_statfs(&fsc->client->monc, &st);
+ err = ceph_monc_do_statfs(monc, data_pool, &st);
if (err < 0)
return err;
@@ -69,48 +75,66 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
/*
- * express utilization in terms of large blocks to avoid
+ * Express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
- *
+ */
+ buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
+
+ /*
+ * By default use root quota for stats; fallback to overall filesystem
+ * usage if using 'noquotadf' mount option or if the root dir doesn't
+ * have max_bytes quota set.
+ */
+ if (ceph_test_mount_opt(fsc, NOQUOTADF) ||
+ !ceph_quota_update_statfs(fsc, buf)) {
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ }
+
+ /*
* NOTE: for the time being, we make bsize == frsize to humor
* not-yet-ancient versions of glibc that are broken.
* Someday, we will probably want to report a real block
* size... whatever that may mean for a network file system!
*/
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
- buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
- buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
- buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bsize = buf->f_frsize;
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
- /* leave fsid little-endian, regardless of host endianness */
- fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
- buf->f_fsid.val[0] = fsid & 0xffffffff;
- buf->f_fsid.val[1] = fsid >> 32;
+ /* Must convert the fsid, for consistent values across arches */
+ buf->f_fsid.val[0] = 0;
+ mutex_lock(&monc->mutex);
+ for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i)
+ buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]);
+ mutex_unlock(&monc->mutex);
+
+ /* fold the fs_cluster_id into the upper bits */
+ buf->f_fsid.val[1] = monc->fs_cluster_id;
+ doutc(fsc->client, "done\n");
return 0;
}
-
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
if (!wait) {
- dout("sync_fs (non-blocking)\n");
+ doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
- dout("sync_fs (non-blocking) done\n");
+ ceph_flush_cap_releases(fsc->mdsc);
+ doutc(cl, "(non-blocking) done\n");
return 0;
}
- dout("sync_fs (blocking)\n");
+ doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
ceph_mdsc_sync(fsc->mdsc);
- dout("sync_fs (blocking) done\n");
+ doutc(cl, "(blocking) done\n");
return 0;
}
@@ -123,153 +147,486 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
- Opt_cap_release_safety,
+ Opt_caps_max,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
- Opt_last_int,
/* int args above */
Opt_snapdirname,
- Opt_last_string,
+ Opt_mds_namespace,
+ Opt_recover_session,
+ Opt_source,
+ Opt_mon_addr,
+ Opt_test_dummy_encryption,
/* string args above */
Opt_dirstat,
- Opt_nodirstat,
Opt_rbytes,
- Opt_norbytes,
Opt_asyncreaddir,
- Opt_noasyncreaddir,
Opt_dcache,
- Opt_nodcache,
Opt_ino32,
- Opt_noino32,
+ Opt_fscache,
+ Opt_poolperm,
+ Opt_require_active_mds,
+ Opt_acl,
+ Opt_quotadf,
+ Opt_copyfrom,
+ Opt_wsync,
+ Opt_pagecache,
+ Opt_sparseread,
};
-static match_table_t fsopt_tokens = {
- {Opt_wsize, "wsize=%d"},
- {Opt_rsize, "rsize=%d"},
- {Opt_rasize, "rasize=%d"},
- {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
- {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
- {Opt_cap_release_safety, "cap_release_safety=%d"},
- {Opt_readdir_max_entries, "readdir_max_entries=%d"},
- {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
- {Opt_congestion_kb, "write_congestion_kb=%d"},
- /* int args above */
- {Opt_snapdirname, "snapdirname=%s"},
- /* string args above */
- {Opt_dirstat, "dirstat"},
- {Opt_nodirstat, "nodirstat"},
- {Opt_rbytes, "rbytes"},
- {Opt_norbytes, "norbytes"},
- {Opt_asyncreaddir, "asyncreaddir"},
- {Opt_noasyncreaddir, "noasyncreaddir"},
- {Opt_dcache, "dcache"},
- {Opt_nodcache, "nodcache"},
- {Opt_ino32, "ino32"},
- {Opt_noino32, "noino32"},
- {-1, NULL}
+enum ceph_recover_session_mode {
+ ceph_recover_session_no,
+ ceph_recover_session_clean
};
-static int parse_fsopt_token(char *c, void *private)
+static const struct constant_table ceph_param_recover[] = {
+ { "no", ceph_recover_session_no },
+ { "clean", ceph_recover_session_clean },
+ {}
+};
+
+static const struct fs_parameter_spec ceph_mount_parameters[] = {
+ fsparam_flag_no ("acl", Opt_acl),
+ fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir),
+ fsparam_s32 ("caps_max", Opt_caps_max),
+ fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max),
+ fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min),
+ fsparam_u32 ("write_congestion_kb", Opt_congestion_kb),
+ fsparam_flag_no ("copyfrom", Opt_copyfrom),
+ fsparam_flag_no ("dcache", Opt_dcache),
+ fsparam_flag_no ("dirstat", Opt_dirstat),
+ fsparam_flag_no ("fsc", Opt_fscache), // fsc|nofsc
+ fsparam_string ("fsc", Opt_fscache), // fsc=...
+ fsparam_flag_no ("ino32", Opt_ino32),
+ fsparam_string ("mds_namespace", Opt_mds_namespace),
+ fsparam_string ("mon_addr", Opt_mon_addr),
+ fsparam_flag_no ("poolperm", Opt_poolperm),
+ fsparam_flag_no ("quotadf", Opt_quotadf),
+ fsparam_u32 ("rasize", Opt_rasize),
+ fsparam_flag_no ("rbytes", Opt_rbytes),
+ fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes),
+ fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries),
+ fsparam_enum ("recover_session", Opt_recover_session, ceph_param_recover),
+ fsparam_flag_no ("require_active_mds", Opt_require_active_mds),
+ fsparam_u32 ("rsize", Opt_rsize),
+ fsparam_string ("snapdirname", Opt_snapdirname),
+ fsparam_string ("source", Opt_source),
+ fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_u32 ("wsize", Opt_wsize),
+ fsparam_flag_no ("wsync", Opt_wsync),
+ fsparam_flag_no ("pagecache", Opt_pagecache),
+ fsparam_flag_no ("sparseread", Opt_sparseread),
+ {}
+};
+
+struct ceph_parse_opts_ctx {
+ struct ceph_options *copts;
+ struct ceph_mount_options *opts;
+};
+
+/*
+ * Remove adjacent slashes and then the trailing slash, unless it is
+ * the only remaining character.
+ *
+ * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
+ */
+static void canonicalize_path(char *path)
{
- struct ceph_mount_options *fsopt = private;
- substring_t argstr[MAX_OPT_ARGS];
- int token, intval, ret;
+ int i, j = 0;
- token = match_token((char *)c, fsopt_tokens, argstr);
- if (token < 0)
+ for (i = 0; path[i] != '\0'; i++) {
+ if (path[i] != '/' || j < 1 || path[j - 1] != '/')
+ path[j++] = path[i];
+ }
+
+ if (j > 1 && path[j - 1] == '/')
+ j--;
+ path[j] = '\0';
+}
+
+static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ int r;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ if (*dev_name_end != ':')
+ return invalfc(fc, "separator ':' missing in source");
+
+ r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name,
+ pctx->copts, fc->log.log, ',');
+ if (r)
+ return r;
+
+ fsopt->new_dev_syntax = false;
+ return 0;
+}
+
+static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end,
+ struct fs_context *fc)
+{
+ size_t len;
+ struct ceph_fsid fsid;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_options *opts = pctx->copts;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ const char *name_start = dev_name;
+ const char *fsid_start, *fs_name_start;
+
+ if (*dev_name_end != '=') {
+ dout("separator '=' missing in source");
return -EINVAL;
+ }
- if (token < Opt_last_int) {
- ret = match_int(&argstr[0], &intval);
- if (ret < 0) {
- pr_err("bad mount option arg (not int) "
- "at '%s'\n", c);
- return ret;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
+ fsid_start = strchr(dev_name, '@');
+ if (!fsid_start)
+ return invalfc(fc, "missing cluster fsid");
+ len = fsid_start - name_start;
+ kfree(opts->name);
+ opts->name = kstrndup(name_start, len, GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+ dout("using %s entity name", opts->name);
+
+ ++fsid_start; /* start of cluster fsid */
+ fs_name_start = strchr(fsid_start, '.');
+ if (!fs_name_start)
+ return invalfc(fc, "missing file system name");
+
+ if (ceph_parse_fsid(fsid_start, &fsid))
+ return invalfc(fc, "Invalid FSID");
+
+ ++fs_name_start; /* start of file system name */
+ len = dev_name_end - fs_name_start;
+
+ if (!namespace_equals(fsopt, fs_name_start, len))
+ return invalfc(fc, "Mismatching mds_namespace");
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
+ dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace);
+
+ fsopt->new_dev_syntax = true;
+ return 0;
+}
+
+/*
+ * Parse the source parameter for new device format. Distinguish the device
+ * spec from the path. Try parsing new device format and fallback to old
+ * format if needed.
+ *
+ * New device syntax will looks like:
+ * <device_spec>=/<path>
+ * where
+ * <device_spec> is name@fsid.fsname
+ * <path> is optional, but if present must begin with '/'
+ * (monitor addresses are passed via mount option)
+ *
+ * Old device syntax is:
+ * <server_spec>[,<server_spec>...]:[<path>]
+ * where
+ * <server_spec> is <ip>[:<port>]
+ * <path> is optional, but if present must begin with '/'
+ */
+static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ char *dev_name = param->string, *dev_name_end;
+ int ret;
+
+ dout("'%s'\n", dev_name);
+ if (!dev_name || !*dev_name)
+ return invalfc(fc, "Empty source");
+
+ dev_name_end = strchr(dev_name, '/');
+ if (dev_name_end) {
+ /*
+ * The server_path will include the whole chars from userland
+ * including the leading '/'.
+ */
+ kfree(fsopt->server_path);
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path)
+ return -ENOMEM;
+
+ canonicalize_path(fsopt->server_path);
} else {
- dout("got token %d\n", token);
+ dev_name_end = dev_name + strlen(dev_name);
}
+ dev_name_end--; /* back up to separator */
+ if (dev_name_end < dev_name)
+ return invalfc(fc, "Path missing in source");
+
+ dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
+ if (fsopt->server_path)
+ dout("server path '%s'\n", fsopt->server_path);
+
+ dout("trying new device syntax");
+ ret = ceph_parse_new_source(dev_name, dev_name_end, fc);
+ if (ret) {
+ if (ret != -EINVAL)
+ return ret;
+ dout("trying old device syntax");
+ ret = ceph_parse_old_source(dev_name, dev_name_end, fc);
+ if (ret)
+ return ret;
+ }
+
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
+}
+
+static int ceph_parse_mon_addr(struct fs_parameter *param,
+ struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+
+ kfree(fsopt->mon_addr);
+ fsopt->mon_addr = param->string;
+ param->string = NULL;
+
+ return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr),
+ pctx->copts, fc->log.log, '/');
+}
+
+static int ceph_parse_mount_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct fs_parse_result result;
+ unsigned int mode;
+ int token, ret;
+
+ ret = ceph_parse_param(param, pctx->copts, fc->log.log);
+ if (ret != -ENOPARAM)
+ return ret;
+
+ token = fs_parse(fc, ceph_mount_parameters, param, &result);
+ dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token);
+ if (token < 0)
+ return token;
+
switch (token) {
case Opt_snapdirname:
+ if (strlen(param->string) > NAME_MAX)
+ return invalfc(fc, "snapdirname too long");
kfree(fsopt->snapdir_name);
- fsopt->snapdir_name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- if (!fsopt->snapdir_name)
- return -ENOMEM;
+ fsopt->snapdir_name = param->string;
+ param->string = NULL;
break;
-
- /* misc */
+ case Opt_mds_namespace:
+ if (!namespace_equals(fsopt, param->string, strlen(param->string)))
+ return invalfc(fc, "Mismatching mds_namespace");
+ kfree(fsopt->mds_namespace);
+ fsopt->mds_namespace = param->string;
+ param->string = NULL;
+ break;
+ case Opt_recover_session:
+ mode = result.uint_32;
+ if (mode == ceph_recover_session_no)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
+ else if (mode == ceph_recover_session_clean)
+ fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
+ else
+ BUG();
+ break;
+ case Opt_source:
+ if (fc->source)
+ return invalfc(fc, "Multiple sources specified");
+ return ceph_parse_source(param, fc);
+ case Opt_mon_addr:
+ return ceph_parse_mon_addr(param, fc);
case Opt_wsize:
- fsopt->wsize = intval;
+ if (result.uint_32 < PAGE_SIZE ||
+ result.uint_32 > CEPH_MAX_WRITE_SIZE)
+ goto out_of_range;
+ fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE);
break;
case Opt_rsize:
- fsopt->rsize = intval;
+ if (result.uint_32 < PAGE_SIZE ||
+ result.uint_32 > CEPH_MAX_READ_SIZE)
+ goto out_of_range;
+ fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE);
break;
case Opt_rasize:
- fsopt->rasize = intval;
+ fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE);
break;
case Opt_caps_wanted_delay_min:
- fsopt->caps_wanted_delay_min = intval;
+ if (result.uint_32 < 1)
+ goto out_of_range;
+ fsopt->caps_wanted_delay_min = result.uint_32;
break;
case Opt_caps_wanted_delay_max:
- fsopt->caps_wanted_delay_max = intval;
+ if (result.uint_32 < 1)
+ goto out_of_range;
+ fsopt->caps_wanted_delay_max = result.uint_32;
+ break;
+ case Opt_caps_max:
+ if (result.int_32 < 0)
+ goto out_of_range;
+ fsopt->caps_max = result.int_32;
break;
case Opt_readdir_max_entries:
- fsopt->max_readdir = intval;
+ if (result.uint_32 < 1)
+ goto out_of_range;
+ fsopt->max_readdir = result.uint_32;
break;
case Opt_readdir_max_bytes:
- fsopt->max_readdir_bytes = intval;
+ if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0)
+ goto out_of_range;
+ fsopt->max_readdir_bytes = result.uint_32;
break;
case Opt_congestion_kb:
- fsopt->congestion_kb = intval;
+ if (result.uint_32 < 1024) /* at least 1M */
+ goto out_of_range;
+ fsopt->congestion_kb = result.uint_32;
break;
case Opt_dirstat:
- fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
- break;
- case Opt_nodirstat:
- fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+ if (!result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
break;
case Opt_rbytes:
- fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
- break;
- case Opt_norbytes:
- fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+ if (!result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
break;
case Opt_asyncreaddir:
- fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
- break;
- case Opt_noasyncreaddir:
- fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
break;
case Opt_dcache:
- fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
- break;
- case Opt_nodcache:
- fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+ if (!result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
break;
case Opt_ino32:
- fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+ if (!result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+ break;
+
+ case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
+ kfree(fsopt->fscache_uniq);
+ fsopt->fscache_uniq = NULL;
+ if (result.negated) {
+ fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
+ } else {
+ fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+ fsopt->fscache_uniq = param->string;
+ param->string = NULL;
+ }
+ break;
+#else
+ return invalfc(fc, "fscache support is disabled");
+#endif
+ case Opt_poolperm:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
+ break;
+ case Opt_require_active_mds:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT;
+ break;
+ case Opt_quotadf:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
+ break;
+ case Opt_copyfrom:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
+ case Opt_acl:
+ if (!result.negated) {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#else
+ return invalfc(fc, "POSIX ACL support is disabled");
+#endif
+ } else {
+ fc->sb_flags &= ~SB_POSIXACL;
+ }
+ break;
+ case Opt_wsync:
+ if (!result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
break;
- case Opt_noino32:
- fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
+ case Opt_pagecache:
+ if (result.negated)
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE;
+ else
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
+ break;
+ case Opt_sparseread:
+ if (result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD;
+ break;
+ case Opt_test_dummy_encryption:
+#ifdef CONFIG_FS_ENCRYPTION
+ fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy);
+ ret = fscrypt_parse_test_dummy_encryption(param,
+ &fsopt->dummy_enc_policy);
+ if (ret == -EINVAL) {
+ warnfc(fc, "Value of option \"%s\" is unrecognized",
+ param->key);
+ } else if (ret == -EEXIST) {
+ warnfc(fc, "Conflicting test_dummy_encryption options");
+ ret = -EINVAL;
+ }
+#else
+ warnfc(fc,
+ "FS encryption not supported: test_dummy_encryption mount option ignored");
+#endif
break;
default:
- BUG_ON(token);
+ BUG();
}
return 0;
+
+out_of_range:
+ return invalfc(fc, "%s out of range", param->key);
}
static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
+ if (!args)
+ return;
+
kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
+ kfree(args->server_path);
+ kfree(args->fscache_uniq);
+ kfree(args->mon_addr);
+ fscrypt_free_dummy_policy(&args->dummy_enc_policy);
kfree(args);
}
@@ -301,84 +658,23 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
if (ret)
return ret;
- return ceph_compare_options(new_opt, fsc->client);
-}
-
-static int parse_mount_options(struct ceph_mount_options **pfsopt,
- struct ceph_options **popt,
- int flags, char *options,
- const char *dev_name,
- const char **path)
-{
- struct ceph_mount_options *fsopt;
- const char *dev_name_end;
- int err;
-
- if (!dev_name || !*dev_name)
- return -EINVAL;
-
- fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
- if (!fsopt)
- return -ENOMEM;
-
- dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
-
- fsopt->sb_flags = flags;
- fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
-
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
- fsopt->rasize = CEPH_RASIZE_DEFAULT;
- fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
- fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
- fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- fsopt->congestion_kb = default_congestion_kb();
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
- /*
- * Distinguish the server list from the path in "dev_name".
- * Internally we do not include the leading '/' in the path.
- *
- * "dev_name" will look like:
- * <server_spec>[,<server_spec>...]:[<path>]
- * where
- * <server_spec> is <ip>[:<port>]
- * <path> is optional, but if present must begin with '/'
- */
- dev_name_end = strchr(dev_name, '/');
- if (dev_name_end) {
- /* skip over leading '/' for path */
- *path = dev_name_end + 1;
- } else {
- /* path is empty */
- dev_name_end = dev_name + strlen(dev_name);
- *path = dev_name_end;
- }
- err = -EINVAL;
- dev_name_end--; /* back up to ':' separator */
- if (dev_name_end < dev_name || *dev_name_end != ':') {
- pr_err("device name is missing path (no : separator in %s)\n",
- dev_name);
- goto out;
- }
- dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
- dout("server path '%s'\n", *path);
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+ if (ret)
+ return ret;
- *popt = ceph_parse_options(options, dev_name, dev_name_end,
- parse_fsopt_token, (void *)fsopt);
- if (IS_ERR(*popt)) {
- err = PTR_ERR(*popt);
- goto out;
- }
+ ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
+ if (ret)
+ return ret;
- /* success */
- *pfsopt = fsopt;
- return 0;
+ ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr);
+ if (ret)
+ return ret;
-out:
- destroy_mount_options(fsopt);
- return err;
+ return ceph_compare_options(new_opt, fsc->client);
}
/**
@@ -388,64 +684,93 @@ out:
*/
static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
- struct ceph_options *opt = fsc->client->options;
-
- if (opt->flags & CEPH_OPT_FSID)
- seq_printf(m, ",fsid=%pU", &opt->fsid);
- if (opt->flags & CEPH_OPT_NOSHARE)
- seq_puts(m, ",noshare");
- if (opt->flags & CEPH_OPT_NOCRC)
- seq_puts(m, ",nocrc");
-
- if (opt->name)
- seq_printf(m, ",name=%s", opt->name);
- if (opt->key)
- seq_puts(m, ",secret=<hidden>");
-
- if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
- if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
- seq_printf(m, ",osdkeepalivetimeout=%d",
- opt->osd_keepalive_timeout);
+ size_t pos;
+ int ret;
+
+ /* a comma between MNT/MS and client options */
+ seq_putc(m, ',');
+ pos = m->count;
+
+ ret = ceph_print_client_options(m, fsc->client, false);
+ if (ret)
+ return ret;
+
+ /* retract our comma if no client options */
+ if (m->count == pos)
+ m->count--;
if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
seq_puts(m, ",dirstat");
- if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
+ seq_puts(m, ",rbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
- if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
- seq_puts(m, ",dcache");
- else
+ if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
seq_puts(m, ",nodcache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_INO32)
+ seq_puts(m, ",ino32");
+ if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ seq_show_option(m, "fsc", fsopt->fscache_uniq);
+ }
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
+ seq_puts(m, ",nopoolperm");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF)
+ seq_puts(m, ",noquotadf");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ if (root->d_sb->s_flags & SB_POSIXACL)
+ seq_puts(m, ",acl");
+ else
+ seq_puts(m, ",noacl");
+#endif
+
+ if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0)
+ seq_puts(m, ",copyfrom");
- if (fsopt->wsize)
- seq_printf(m, ",wsize=%d", fsopt->wsize);
- if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
- seq_printf(m, ",rsize=%d", fsopt->rsize);
+ /* dump mds_namespace when old device syntax is in use */
+ if (fsopt->mds_namespace && !fsopt->new_dev_syntax)
+ seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+
+ if (fsopt->mon_addr)
+ seq_printf(m, ",mon_addr=%s", fsopt->mon_addr);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
+ seq_show_option(m, "recover_session", "clean");
+
+ if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
+ seq_puts(m, ",wsync");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
+ seq_puts(m, ",nopagecache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
+ seq_puts(m, ",sparseread");
+
+ fscrypt_show_test_dummy_encryption(m, ',', root->d_sb);
+
+ if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
+ seq_printf(m, ",wsize=%u", fsopt->wsize);
+ if (fsopt->rsize != CEPH_MAX_READ_SIZE)
+ seq_printf(m, ",rsize=%u", fsopt->rsize);
if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
- seq_printf(m, ",rasize=%d", fsopt->rasize);
+ seq_printf(m, ",rasize=%u", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
- seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb);
+ if (fsopt->caps_max)
+ seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_min=%d",
+ seq_printf(m, ",caps_wanted_delay_min=%u",
fsopt->caps_wanted_delay_min);
if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_max=%d",
+ seq_printf(m, ",caps_wanted_delay_max=%u",
fsopt->caps_wanted_delay_max);
- if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
- seq_printf(m, ",cap_release_safety=%d",
- fsopt->cap_release_safety);
if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
- seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+ seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir);
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
- seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+ seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ seq_show_option(m, "snapdirname", fsopt->snapdir_name);
+
return 0;
}
@@ -460,9 +785,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
return 0;
-
default:
return -1;
}
@@ -470,106 +797,106 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
/*
* create a new fs client
+ *
+ * Success or not, this function consumes @fsopt and @opt.
*/
static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const unsigned supported_features =
- CEPH_FEATURE_FLOCK |
- CEPH_FEATURE_DIRLAYOUTHASH;
- const unsigned required_features = 0;
- int page_count;
- size_t size;
- int err = -ENOMEM;
+ int err;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
- if (!fsc)
- return ERR_PTR(-ENOMEM);
+ if (!fsc) {
+ err = -ENOMEM;
+ goto fail;
+ }
- fsc->client = ceph_create_client(opt, fsc, supported_features,
- required_features);
+ fsc->client = ceph_create_client(opt, fsc);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
}
+ opt = NULL; /* fsc->client now owns this */
+
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.want_mdsmap = 1;
+ ceph_set_opt(fsc->client, ABORT_ON_FULL);
+
+ if (!fsopt->mds_namespace) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
fsc->mount_options = fsopt;
fsc->sb = NULL;
fsc->mount_state = CEPH_MOUNT_MOUNTING;
+ fsc->filp_gen = 1;
+ fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
-
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
+ fsc->write_congested = false;
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
* to be processed in parallel, limit concurrency.
*/
- fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
- if (fsc->wb_wq == NULL)
- goto fail_bdi;
- fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
- if (fsc->pg_inv_wq == NULL)
- goto fail_wb_wq;
- fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
- if (fsc->trunc_wq == NULL)
- goto fail_pg_inv_wq;
-
- /* set up mempools */
- err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
- size = sizeof (struct page *) * (page_count ? page_count : 1);
- fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
- if (!fsc->wb_pagevec_pool)
- goto fail_trunc_wq;
+ fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0);
+ if (!fsc->inode_wq)
+ goto fail_client;
+ fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1);
+ if (!fsc->cap_wq)
+ goto fail_inode_wq;
+
+ hash_init(fsc->async_unlink_conflict);
+ spin_lock_init(&fsc->async_unlink_conflict_lock);
- /* caps */
- fsc->min_caps = fsopt->max_readdir;
+ spin_lock(&ceph_fsc_lock);
+ list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+ spin_unlock(&ceph_fsc_lock);
return fsc;
-fail_trunc_wq:
- destroy_workqueue(fsc->trunc_wq);
-fail_pg_inv_wq:
- destroy_workqueue(fsc->pg_inv_wq);
-fail_wb_wq:
- destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
+fail_inode_wq:
+ destroy_workqueue(fsc->inode_wq);
fail_client:
ceph_destroy_client(fsc->client);
fail:
kfree(fsc);
+ if (opt)
+ ceph_destroy_options(opt);
+ destroy_mount_options(fsopt);
return ERR_PTR(err);
}
-static void destroy_fs_client(struct ceph_fs_client *fsc)
+static void flush_fs_workqueues(struct ceph_fs_client *fsc)
{
- dout("destroy_fs_client %p\n", fsc);
+ flush_workqueue(fsc->inode_wq);
+ flush_workqueue(fsc->cap_wq);
+}
- destroy_workqueue(fsc->wb_wq);
- destroy_workqueue(fsc->pg_inv_wq);
- destroy_workqueue(fsc->trunc_wq);
+static void destroy_fs_client(struct ceph_fs_client *fsc)
+{
+ doutc(fsc->client, "%p\n", fsc);
- bdi_destroy(&fsc->backing_dev_info);
+ spin_lock(&ceph_fsc_lock);
+ list_del(&fsc->metric_wakeup);
+ spin_unlock(&ceph_fsc_lock);
- mempool_destroy(fsc->wb_pagevec_pool);
+ ceph_mdsc_destroy(fsc);
+ destroy_workqueue(fsc->inode_wq);
+ destroy_workqueue(fsc->cap_wq);
destroy_mount_options(fsc->mount_options);
- ceph_fs_debugfs_cleanup(fsc);
-
ceph_destroy_client(fsc->client);
kfree(fsc);
- dout("destroy_fs_client %p done\n", fsc);
+ dout("%s: %p done\n", __func__, fsc);
}
/*
@@ -577,49 +904,84 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
+struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
+mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
{
+ int error = -ENOMEM;
+
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
ceph_inode_init_once);
- if (ceph_inode_cachep == NULL)
+ if (!ceph_inode_cachep)
return -ENOMEM;
- ceph_cap_cachep = KMEM_CACHE(ceph_cap,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_cachep == NULL)
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0);
+ if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
+ ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ceph_cap_flush_cachep)
+ goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_dentry_cachep == NULL)
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ceph_dentry_cachep)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_file_cachep == NULL)
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0);
+ if (!ceph_file_cachep)
goto bad_file;
+ ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0);
+ if (!ceph_dir_file_cachep)
+ goto bad_dir_file;
+
+ ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0);
+ if (!ceph_mds_request_cachep)
+ goto bad_mds_req;
+
+ ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *));
+ if (!ceph_wb_pagevec_pool)
+ goto bad_pagevec_pool;
+
return 0;
+bad_pagevec_pool:
+ kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
+ kmem_cache_destroy(ceph_dir_file_cachep);
+bad_dir_file:
+ kmem_cache_destroy(ceph_file_cachep);
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
+ kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
+ return error;
}
static void destroy_caches(void)
@@ -629,32 +991,45 @@ static void destroy_caches(void)
* destroy cache.
*/
rcu_barrier();
+
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+ kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
+ kmem_cache_destroy(ceph_dir_file_cachep);
+ kmem_cache_destroy(ceph_mds_request_cachep);
+ mempool_destroy(ceph_wb_pagevec_pool);
}
+static void __ceph_umount_begin(struct ceph_fs_client *fsc)
+{
+ ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
+ ceph_mdsc_force_umount(fsc->mdsc);
+ fsc->filp_gen++; // invalidate open files
+}
/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
+ * ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
*/
-static void ceph_umount_begin(struct super_block *sb)
+void ceph_umount_begin(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+
+ doutc(fsc->client, "starting forced umount\n");
- dout("ceph_umount_begin - starting forced umount\n");
- if (!fsc)
- return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
- return;
+ __ceph_umount_begin(fsc);
}
static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
+ .free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
+ .drop_inode = inode_just_drop,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
@@ -670,40 +1045,40 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
/* open dir */
- dout("open_root_inode opening '%s'\n", path);
+ doutc(cl, "opening '%s'\n", path);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
req->r_path1 = kstrdup(path, GFP_NOFS);
+ if (!req->r_path1) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started;
- req->r_timeout = fsc->client->options->mount_timeout * HZ;
+ req->r_timeout = fsc->client->options->mount_timeout;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0) {
struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
- dout("open_root_inode success\n");
- if (ceph_ino(inode) == CEPH_INO_ROOT &&
- fsc->sb->s_root == NULL) {
- root = d_make_root(inode);
- if (!root) {
- root = ERR_PTR(-ENOMEM);
- goto out;
- }
- } else {
- root = d_obtain_alias(inode);
+ doutc(cl, "success\n");
+ root = d_make_root(inode);
+ if (!root) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
}
- ceph_init_dentry(root);
- dout("open_root_inode success, root dentry is %p\n", root);
+ doutc(cl, "success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
@@ -712,127 +1087,178 @@ out:
return root;
}
+#ifdef CONFIG_FS_ENCRYPTION
+static int ceph_apply_test_dummy_encryption(struct super_block *sb,
+ struct fs_context *fc,
+ struct ceph_mount_options *fsopt)
+{
+ struct ceph_fs_client *fsc = sb->s_fs_info;
+ if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy))
+ return 0;
+ /* No changing encryption context on remount. */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy,
+ &fsc->fsc_dummy_enc_policy))
+ return 0;
+ errorfc(fc, "Can't set test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+
+ /* Also make sure fsopt doesn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy,
+ &fsc->fsc_dummy_enc_policy))
+ return 0;
+ errorfc(fc, "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+
+ fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy;
+ memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy));
+
+ warnfc(fc, "test_dummy_encryption mode enabled");
+ return 0;
+}
+#else
+static int ceph_apply_test_dummy_encryption(struct super_block *sb,
+ struct fs_context *fc,
+ struct ceph_mount_options *fsopt)
+{
+ return 0;
+}
+#endif
/*
* mount: join the ceph cluster, and open root directory.
*/
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
- const char *path)
+ struct fs_context *fc)
{
+ struct ceph_client *cl = fsc->client;
int err;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
- int first = 0; /* first vfsmount for this super_block */
- dout("mount start\n");
+ doutc(cl, "mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
- err = __ceph_open_session(fsc->client, started);
- if (err < 0)
- goto out;
+ if (!fsc->sb->s_root) {
+ const char *path = fsc->mount_options->server_path ?
+ fsc->mount_options->server_path + 1 : "";
- dout("mount opening root\n");
- root = open_root_dentry(fsc, "", started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
- if (fsc->sb->s_root) {
- dput(root);
- } else {
- fsc->sb->s_root = root;
- first = 1;
-
- err = ceph_fs_debugfs_init(fsc);
+ err = __ceph_open_session(fsc->client);
if (err < 0)
- goto fail;
- }
+ goto out;
+
+ /* setup fscache */
+ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
+ err = ceph_fscache_register_fs(fsc, fc);
+ if (err < 0)
+ goto out;
+ }
+
+ err = ceph_apply_test_dummy_encryption(fsc->sb, fc,
+ fsc->mount_options);
+ if (err)
+ goto out;
+
+ doutc(cl, "mount opening path '%s'\n", path);
+
+ ceph_fs_debugfs_init(fsc);
- if (path[0] == 0) {
- dget(root);
- } else {
- dout("mount opening base mountpoint\n");
root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
- goto fail;
+ goto out;
}
+ fsc->sb->s_root = dget(root);
+ } else {
+ root = dget(fsc->sb->s_root);
}
fsc->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
+ doutc(cl, "mount success\n");
mutex_unlock(&fsc->client->mount_mutex);
return root;
out:
mutex_unlock(&fsc->client->mount_mutex);
+ ceph_fscrypt_free_dummy_policy(fsc);
return ERR_PTR(err);
-
-fail:
- if (first) {
- dput(fsc->sb->s_root);
- fsc->sb->s_root = NULL;
- }
- goto out;
}
-static int ceph_set_super(struct super_block *s, void *data)
+static int ceph_set_super(struct super_block *s, struct fs_context *fc)
{
- struct ceph_fs_client *fsc = data;
+ struct ceph_fs_client *fsc = s->s_fs_info;
+ struct ceph_client *cl = fsc->client;
int ret;
- dout("set_super %p data %p\n", s, data);
+ doutc(cl, "%p\n", s);
- s->s_flags = fsc->mount_options->sb_flags;
- s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+ s->s_maxbytes = MAX_LFS_FILESIZE;
- s->s_fs_info = fsc;
+ s->s_xattr = ceph_xattr_handlers;
fsc->sb = s;
+ fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
s->s_op = &ceph_super_ops;
+ set_default_d_op(s, &ceph_dentry_ops);
s->s_export_op = &ceph_export_ops;
- s->s_time_gran = 1000; /* 1000 ns == 1 us */
+ s->s_time_gran = 1;
+ s->s_time_min = 0;
+ s->s_time_max = U32_MAX;
+ s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ s->s_magic = CEPH_SUPER_MAGIC;
- ret = set_anon_super(s, NULL); /* what is that second arg for? */
- if (ret != 0)
- goto fail;
-
- return ret;
+ ceph_fscrypt_set_ops(s);
-fail:
- s->s_fs_info = NULL;
- fsc->sb = NULL;
+ ret = set_anon_super_fc(s, fc);
+ if (ret != 0)
+ fsc->sb = NULL;
return ret;
}
/*
* share superblock if same fs AND options
*/
-static int ceph_compare_super(struct super_block *sb, void *data)
+static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
{
- struct ceph_fs_client *new = data;
+ struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *other = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
- dout("ceph_compare_super %p\n", sb);
+ doutc(cl, "%p\n", sb);
- if (compare_mount_options(fsopt, opt, other)) {
- dout("monitor(s)/mount options don't match\n");
+ if (compare_mount_options(fsopt, opt, fsc)) {
+ doutc(cl, "monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
- ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
- dout("fsid doesn't match\n");
+ ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
+ doutc(cl, "fsid doesn't match\n");
+ return 0;
+ }
+ if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) {
+ doutc(cl, "flags differ\n");
+ return 0;
+ }
+
+ if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
+ doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n");
return 0;
}
- if (fsopt->sb_flags != other->mount_options->sb_flags) {
- dout("flags differ\n");
+
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ doutc(cl, "client has been forcibly unmounted\n");
return 0;
}
+
return 1;
}
@@ -841,124 +1267,361 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
{
int err;
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
/* set ra_pages based on rasize mount option? */
- if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
- fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
- >> PAGE_SHIFT;
- else
- fsc->backing_dev_info.ra_pages =
- default_backing_dev_info.ra_pages;
+ sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
+ /* set io_pages based on max osd read size */
+ sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT;
+
+ return 0;
}
-static struct dentry *ceph_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int ceph_get_tree(struct fs_context *fc)
{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb;
struct ceph_fs_client *fsc;
struct dentry *res;
+ int (*compare_super)(struct super_block *, struct fs_context *) =
+ ceph_compare_super;
int err;
- int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
- const char *path = NULL;
- struct ceph_mount_options *fsopt = NULL;
- struct ceph_options *opt = NULL;
-
- dout("ceph_mount\n");
- err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
- if (err < 0) {
- res = ERR_PTR(err);
- goto out_final;
- }
+
+ dout("ceph_get_tree\n");
+
+ if (!fc->source)
+ return invalfc(fc, "No source");
+ if (fsopt->new_dev_syntax && !fsopt->mon_addr)
+ return invalfc(fc, "No monitor address");
/* create client (which we may/may not use) */
- fsc = create_fs_client(fsopt, opt);
+ fsc = create_fs_client(pctx->opts, pctx->copts);
+ pctx->opts = NULL;
+ pctx->copts = NULL;
if (IS_ERR(fsc)) {
- res = ERR_CAST(fsc);
- destroy_mount_options(fsopt);
- ceph_destroy_options(opt);
+ err = PTR_ERR(fsc);
goto out_final;
}
err = ceph_mdsc_init(fsc);
- if (err < 0) {
- res = ERR_PTR(err);
+ if (err < 0)
goto out;
- }
if (ceph_test_opt(fsc->client, NOSHARE))
compare_super = NULL;
- sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
+
+ fc->s_fs_info = fsc;
+ sb = sget_fc(fc, compare_super, ceph_set_super);
+ fc->s_fs_info = NULL;
if (IS_ERR(sb)) {
- res = ERR_CAST(sb);
+ err = PTR_ERR(sb);
goto out;
}
- if (ceph_sb_to_client(sb) != fsc) {
- ceph_mdsc_destroy(fsc);
+ if (ceph_sb_to_fs_client(sb) != fsc) {
destroy_fs_client(fsc);
- fsc = ceph_sb_to_client(sb);
+ fsc = ceph_sb_to_fs_client(sb);
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
- if (err < 0) {
- res = ERR_PTR(err);
+ err = ceph_setup_bdi(sb, fsc);
+ if (err < 0)
goto out_splat;
- }
}
- res = ceph_real_mount(fsc, path);
- if (IS_ERR(res))
+ res = ceph_real_mount(fsc, fc);
+ if (IS_ERR(res)) {
+ err = PTR_ERR(res);
goto out_splat;
- dout("root %p inode %p ino %llx.%llx\n", res,
- res->d_inode, ceph_vinop(res->d_inode));
- return res;
+ }
+
+ doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res,
+ d_inode(res), ceph_vinop(d_inode(res)));
+ fc->root = fsc->sb->s_root;
+ return 0;
out_splat:
+ if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+ pr_info("No mds server is up or the cluster is laggy\n");
+ err = -EHOSTUNREACH;
+ }
+
ceph_mdsc_close_sessions(fsc->mdsc);
deactivate_locked_super(sb);
goto out_final;
out:
- ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc);
out_final:
- dout("ceph_mount fail %ld\n", PTR_ERR(res));
- return res;
+ dout("ceph_get_tree fail %d\n", err);
+ return err;
+}
+
+static void ceph_free_fc(struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+
+ if (pctx) {
+ destroy_mount_options(pctx->opts);
+ ceph_destroy_options(pctx->copts);
+ kfree(pctx);
+ }
+}
+
+static int ceph_reconfigure_fc(struct fs_context *fc)
+{
+ int err;
+ struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+ struct ceph_mount_options *fsopt = pctx->opts;
+ struct super_block *sb = fc->root->d_sb;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+
+ err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
+ if (err)
+ return err;
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+ ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+ else
+ ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
+ ceph_set_mount_opt(fsc, SPARSEREAD);
+ else
+ ceph_clear_mount_opt(fsc, SPARSEREAD);
+
+ if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
+ kfree(fsc->mount_options->mon_addr);
+ fsc->mount_options->mon_addr = fsopt->mon_addr;
+ fsopt->mon_addr = NULL;
+ pr_notice_client(fsc->client,
+ "monitor addresses recorded, but not used for reconnection");
+ }
+
+ sync_filesystem(sb);
+ return 0;
+}
+
+static const struct fs_context_operations ceph_context_ops = {
+ .free = ceph_free_fc,
+ .parse_param = ceph_parse_mount_param,
+ .get_tree = ceph_get_tree,
+ .reconfigure = ceph_reconfigure_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int ceph_init_fs_context(struct fs_context *fc)
+{
+ struct ceph_parse_opts_ctx *pctx;
+ struct ceph_mount_options *fsopt;
+
+ pctx = kzalloc(sizeof(*pctx), GFP_KERNEL);
+ if (!pctx)
+ return -ENOMEM;
+
+ pctx->copts = ceph_alloc_options();
+ if (!pctx->copts)
+ goto nomem;
+
+ pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL);
+ if (!pctx->opts)
+ goto nomem;
+
+ fsopt = pctx->opts;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+ fsopt->wsize = CEPH_MAX_WRITE_SIZE;
+ fsopt->rsize = CEPH_MAX_READ_SIZE;
+ fsopt->rasize = CEPH_RASIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ if (!fsopt->snapdir_name)
+ goto nomem;
+
+ fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+
+ fc->fs_private = pctx;
+ fc->ops = &ceph_context_ops;
+ return 0;
+
+nomem:
+ destroy_mount_options(pctx->opts);
+ ceph_destroy_options(pctx->copts);
+ kfree(pctx);
+ return -ENOMEM;
+}
+
+/*
+ * Return true if it successfully increases the blocker counter,
+ * or false if the mdsc is in stopping and flushed state.
+ */
+static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ spin_lock(&mdsc->stopping_lock);
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) {
+ spin_unlock(&mdsc->stopping_lock);
+ return false;
+ }
+ atomic_inc(&mdsc->stopping_blockers);
+ spin_unlock(&mdsc->stopping_lock);
+ return true;
+}
+
+static void __dec_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ spin_lock(&mdsc->stopping_lock);
+ if (!atomic_dec_return(&mdsc->stopping_blockers) &&
+ mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING)
+ complete_all(&mdsc->stopping_waiter);
+ spin_unlock(&mdsc->stopping_lock);
+}
+
+/* For metadata IO requests */
+bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ mutex_lock(&session->s_mutex);
+ inc_session_sequence(session);
+ mutex_unlock(&session->s_mutex);
+
+ return __inc_stopping_blocker(mdsc);
+}
+
+void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ __dec_stopping_blocker(mdsc);
+}
+
+/* For data IO requests */
+bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ return __inc_stopping_blocker(mdsc);
+}
+
+void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ __dec_stopping_blocker(mdsc);
}
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(fsc->mdsc);
- kill_anon_super(s); /* will call put_super after sb is r/o */
- ceph_mdsc_destroy(fsc);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ bool wait;
+
+ doutc(cl, "%p\n", s);
+
+ ceph_mdsc_pre_umount(mdsc);
+ flush_fs_workqueues(fsc);
+
+ /*
+ * Though the kill_anon_super() will finally trigger the
+ * sync_filesystem() anyway, we still need to do it here and
+ * then bump the stage of shutdown. This will allow us to
+ * drop any further message, which will increase the inodes'
+ * i_count reference counters but makes no sense any more,
+ * from MDSs.
+ *
+ * Without this when evicting the inodes it may fail in the
+ * kill_anon_super(), which will trigger a warning when
+ * destroying the fscrypt keyring and then possibly trigger
+ * a further crash in ceph module when the iput() tries to
+ * evict the inodes later.
+ */
+ sync_filesystem(s);
+
+ if (atomic64_read(&mdsc->dirty_folios) > 0) {
+ wait_queue_head_t *wq = &mdsc->flush_end_wq;
+ long timeleft = wait_event_killable_timeout(*wq,
+ atomic64_read(&mdsc->dirty_folios) <= 0,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+ }
+
+ spin_lock(&mdsc->stopping_lock);
+ mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
+ wait = !!atomic_read(&mdsc->stopping_blockers);
+ spin_unlock(&mdsc->stopping_lock);
+
+ if (wait && atomic_read(&mdsc->stopping_blockers)) {
+ long timeleft = wait_for_completion_killable_timeout(
+ &mdsc->stopping_waiter,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+ }
+
+ mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+ kill_anon_super(s);
+
+ fsc->client->extra_mon_dispatch = NULL;
+ ceph_fs_debugfs_cleanup(fsc);
+
+ ceph_fscache_unregister_fs(fsc);
+
destroy_fs_client(fsc);
}
static struct file_system_type ceph_fs_type = {
.owner = THIS_MODULE,
.name = "ceph",
- .mount = ceph_mount,
+ .init_fs_context = ceph_init_fs_context,
.kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
+ .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ceph");
-#define _STRINGIFY(x) #x
-#define STRINGIFY(x) _STRINGIFY(x)
+int ceph_force_reconnect(struct super_block *sb)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ int err = 0;
+
+ fsc->mount_state = CEPH_MOUNT_RECOVER;
+ __ceph_umount_begin(fsc);
+
+ /* Make sure all page caches get invalidated.
+ * see remove_session_caps_cb() */
+ flush_workqueue(fsc->inode_wq);
+
+ /* In case that we were blocklisted. This also reset
+ * all mon/osd connections */
+ ceph_reset_client_addr(fsc->client);
+
+ ceph_osdc_clear_abort_err(&fsc->client->osdc);
+
+ fsc->blocklisted = false;
+ fsc->mount_state = CEPH_MOUNT_MOUNTED;
+
+ if (sb->s_root) {
+ err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
+ CEPH_STAT_CAP_INODE, true);
+ }
+ return err;
+}
static int __init init_ceph(void)
{
@@ -966,17 +1629,16 @@ static int __init init_ceph(void)
if (ret)
goto out;
- ceph_xattr_init();
+ ceph_flock_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
- goto out_icache;
+ goto out_caches;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0;
-out_icache:
- ceph_xattr_exit();
+out_caches:
destroy_caches();
out:
return ret;
@@ -986,10 +1648,53 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
- ceph_xattr_exit();
destroy_caches();
}
+static int param_set_metrics(const char *val, const struct kernel_param *kp)
+{
+ struct ceph_fs_client *fsc;
+ int ret;
+
+ ret = param_set_bool(val, kp);
+ if (ret) {
+ pr_err("Failed to parse sending metrics switch value '%s'\n",
+ val);
+ return ret;
+ } else if (!disable_send_metrics) {
+ // wake up all the mds clients
+ spin_lock(&ceph_fsc_lock);
+ list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) {
+ metric_schedule_delayed(&fsc->mdsc->metric);
+ }
+ spin_unlock(&ceph_fsc_lock);
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_metrics = {
+ .set = param_set_metrics,
+ .get = param_get_bool,
+};
+
+bool disable_send_metrics = false;
+module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644);
+MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+
+/* for both v1 and v2 syntax */
+static bool mount_support = true;
+static const struct kernel_param_ops param_ops_mount_syntax = {
+ .get = param_get_bool,
+};
+module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
+module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+
+bool enable_unsafe_idmap = false;
+module_param(enable_unsafe_idmap, bool, 0644);
+MODULE_PARM_DESC(enable_unsafe_idmap,
+ "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID");
+
module_init(init_ceph);
module_exit(exit_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cbded572345e..a1f781c46b41 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,9 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_SUPER_H
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/backing-dev.h>
#include <linux/completion.h>
#include <linux/exportfs.h>
@@ -13,48 +15,81 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/posix_acl.h>
+#include <linux/refcount.h>
+#include <linux/security.h>
+#include <linux/netfs.h>
+#include <linux/fscache.h>
+#include <linux/hashtable.h>
#include <linux/ceph/libceph.h>
-
-/* f_type in struct statfs */
-#define CEPH_SUPER_MAGIC 0x00c36400
+#include "crypto.h"
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */
+#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
-
-#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
+#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
+#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
+#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
+#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
+#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
+#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */
+
+#define CEPH_MOUNT_OPT_DEFAULT \
+ (CEPH_MOUNT_OPT_DCACHE | \
+ CEPH_MOUNT_OPT_NOCOPYFROM | \
+ CEPH_MOUNT_OPT_ASYNC_DIROPS)
#define ceph_set_mount_opt(fsc, opt) \
- (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT 0 /* max read size */
-#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */
+/* max size of osd read request, limited by libceph */
+#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN
+/* osd has a configurable limitation of max write size.
+ * CEPH_MSG_MAX_DATA_LEN should be small enough. */
+#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN
+#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
struct ceph_mount_options {
- int flags;
- int sb_flags;
-
- int wsize; /* max write size */
- int rsize; /* max read size */
- int rasize; /* max readahead */
- int congestion_kb; /* max writeback in flight */
- int caps_wanted_delay_min, caps_wanted_delay_max;
- int cap_release_safety;
- int max_readdir; /* max readdir result (entires) */
- int max_readdir_bytes; /* max readdir result (bytes) */
+ unsigned int flags;
+
+ unsigned int wsize; /* max write size */
+ unsigned int rsize; /* max read size */
+ unsigned int rasize; /* max readahead */
+ unsigned int congestion_kb; /* max writeback in flight */
+ unsigned int caps_wanted_delay_min, caps_wanted_delay_max;
+ int caps_max;
+ unsigned int max_readdir; /* max readdir result (entries) */
+ unsigned int max_readdir_bytes; /* max readdir result (bytes) */
+
+ bool new_dev_syntax;
/*
* everything above this point can be memcmp'd; everything below
@@ -62,36 +97,85 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
+ char *server_path; /* default NULL (means "/") */
+ char *fscache_uniq; /* default NULL */
+ char *mon_addr;
+ struct fscrypt_dummy_policy dummy_enc_policy;
};
+/*
+ * Check if the mds namespace in ceph_mount_options matches
+ * the passed in namespace string. First time match (when
+ * ->mds_namespace is NULL) is treated specially, since
+ * ->mds_namespace needs to be initialized by the caller.
+ */
+static inline int namespace_equals(struct ceph_mount_options *fsopt,
+ const char *namespace, size_t len)
+{
+ return !(fsopt->mds_namespace &&
+ (strlen(fsopt->mds_namespace) != len ||
+ strncmp(fsopt->mds_namespace, namespace, len)));
+}
+
+/* mount state */
+enum {
+ CEPH_MOUNT_MOUNTING,
+ CEPH_MOUNT_MOUNTED,
+ CEPH_MOUNT_UNMOUNTING,
+ CEPH_MOUNT_UNMOUNTED,
+ CEPH_MOUNT_SHUTDOWN,
+ CEPH_MOUNT_RECOVER,
+ CEPH_MOUNT_FENCE_IO,
+};
+
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
struct super_block *sb;
+ struct list_head metric_wakeup;
+
struct ceph_mount_options *mount_options;
struct ceph_client *client;
- unsigned long mount_state;
- int min_caps; /* min caps i added */
+ int mount_state;
+
+ bool blocklisted;
+
+ bool have_copy_from2;
+
+ u32 filp_gen;
+ loff_t max_file_size;
struct ceph_mds_client *mdsc;
- /* writeback */
- mempool_t *wb_pagevec_pool;
- struct workqueue_struct *wb_wq;
- struct workqueue_struct *pg_inv_wq;
- struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
+ bool write_congested;
- struct backing_dev_info backing_dev_info;
+ struct workqueue_struct *inode_wq;
+ struct workqueue_struct *cap_wq;
+
+ DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+ spinlock_t async_unlink_conflict_lock;
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+ struct dentry *debugfs_status;
+ struct dentry *debugfs_mds_sessions;
+ struct dentry *debugfs_metrics_dir;
#endif
-};
+#ifdef CONFIG_CEPH_FSCACHE
+ struct fscache_volume *fscache;
+#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_dummy_policy fsc_dummy_enc_policy;
+#endif
+};
/*
* File i/o capability. This tracks shared state with the metadata
@@ -108,20 +192,41 @@ struct ceph_cap {
struct rb_node ci_node; /* per-ci cap tree */
struct ceph_mds_session *session;
struct list_head session_caps; /* per-session caplist */
- int mds;
u64 cap_id; /* unique cap id (mds provided) */
- int issued; /* latest, from the mds */
- int implemented; /* implemented superset of issued (for revocation) */
- int mds_wanted;
+ union {
+ /* in-use caps */
+ struct {
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of
+ issued (for revocation) */
+ int mds; /* mds index for this cap */
+ int mds_wanted; /* caps wanted from this mds */
+ };
+ /* caps to release */
+ struct {
+ u64 cap_ino;
+ int queue_release;
+ };
+ };
u32 seq, issue_seq, mseq;
u32 cap_gen; /* active/stale cycle */
unsigned long last_used;
struct list_head caps_item;
};
-#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
-#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
+#define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */
+
+struct ceph_cap_flush {
+ u64 tid;
+ int caps;
+ bool wake; /* wake up flush waiters when finish ? */
+ bool is_capsnap; /* true means capsnap */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
@@ -129,11 +234,12 @@ struct ceph_cap {
* data before flushing the snapped state (tracked here) back to the MDS.
*/
struct ceph_cap_snap {
- atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
+ refcount_t nref;
+ struct list_head ci_item;
- u64 follows, flush_tid;
+ struct ceph_cap_flush cap_flush;
+
+ u64 follows;
int issued, dirty;
struct ceph_snap_context *context;
@@ -145,18 +251,23 @@ struct ceph_cap_snap {
u64 xattr_version;
u64 size;
- struct timespec mtime, atime, ctime;
+ u64 change_attr;
+ struct timespec64 mtime, atime, ctime, btime;
u64 time_warp_seq;
+ u64 truncate_size;
+ u32 truncate_seq;
int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */
+ bool inline_data;
+ bool need_flush;
};
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
- if (atomic_dec_and_test(&capsnap->nref)) {
+ if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -204,16 +315,28 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ struct dentry *dentry;
struct ceph_mds_session *lease_session;
- u32 lease_gen, lease_shared_gen;
+ struct list_head lease_list;
+ struct hlist_node hnode;
+ unsigned long flags;
+ int lease_shared_gen;
+ u32 lease_gen;
u32 lease_seq;
unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
- u64 time;
+ unsigned long time;
u64 offset;
};
+#define CEPH_DENTRY_REFERENCED (1 << 0)
+#define CEPH_DENTRY_LEASE_LIST (1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST (1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK (1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4)
+#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT (5)
+#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT)
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -236,28 +359,37 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
u64 i_version;
+ u64 i_inline_version;
u32 i_time_warp_seq;
- unsigned i_ceph_flags;
- atomic_t i_release_count;
- atomic_t i_complete_count;
+ unsigned long i_ceph_flags;
+ atomic64_t i_release_count;
+ atomic64_t i_ordered_count;
+ atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
- struct timespec i_rctime;
- u64 i_rbytes, i_rfiles, i_rsubdirs;
+ struct timespec64 i_rctime;
+ u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with complete dir */
+
+ /* quotas */
+ u64 i_max_bytes, i_max_files;
+
+ s32 i_dir_pin;
struct rb_root i_fragtree;
+ int i_fragtree_nsplits;
struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
@@ -267,30 +399,52 @@ struct ceph_inode_info {
struct rb_root i_caps; /* cap list */
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
- struct list_head i_dirty_item, i_flushing_item;
- u64 i_cap_flush_seq;
+
+ /*
+ * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
+ * is protected by the mdsc->cap_dirty_lock, but each individual item
+ * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty
+ * requires the mdsc->cap_dirty_lock. List presence for an item can
+ * be tested under the i_ceph_lock. Changing anything requires both.
+ */
+ struct list_head i_dirty_item;
+
+ /*
+ * Link to session's s_cap_flushing list. Protected in a similar
+ * fashion to i_dirty_item, but also by the s_mutex for changes. The
+ * s_cap_flushing list can be walked while holding either the s_mutex
+ * or msdc->cap_dirty_lock. List presence can also be checked while
+ * holding the i_ceph_lock for this inode.
+ */
+ struct list_head i_flushing_item;
+
/* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
- u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+ struct ceph_cap_flush *i_prealloc_cap_flush;
+ struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
- unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ unsigned long i_last_rd;
+ unsigned long i_last_wr;
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
+ struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */
+ /*
+ * For none fscrypt case it equals to i_truncate_size or it will
+ * equals to fscrypt_file_size
+ */
+ u64 i_truncate_pagecache_size;
u64 i_max_size; /* max file size authorized by mds */
u64 i_reported_size; /* (max_)size reported to or requested of mds */
@@ -299,58 +453,88 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
- int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
- u32 i_shared_gen; /* increment each time we get FILE_SHARED */
+ atomic_t i_filelock_ref;
+ atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
- struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
- struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
- int i_snap_realm_counter; /* snap realm (if caps) */
+ union {
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+ };
struct list_head i_snap_realm_item;
struct list_head i_snap_flush_item;
+ struct timespec64 i_btime;
+ struct timespec64 i_snap_btime;
+
+ struct work_struct i_work;
+ unsigned long i_work_mask;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_inode_info *i_crypt_info;
+ u32 fscrypt_auth_len;
+ u32 fscrypt_file_len;
+ u8 *fscrypt_auth;
+ u8 *fscrypt_file;
+#endif
+};
- struct work_struct i_wb_work; /* writeback work */
- struct work_struct i_pg_inv_work; /* page invalidation work */
+struct ceph_netfs_request_data {
+ int caps;
- struct work_struct i_vmtruncate_work;
+ /*
+ * Maximum size of a file readahead request.
+ * The fadvise could update the bdi's default ra_pages.
+ */
+ unsigned int file_ra_pages;
- struct inode vfs_inode; /* at end */
+ /* Set it if fadvise disables file readahead entirely */
+ bool file_ra_disabled;
};
-static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+static inline struct ceph_inode_info *
+ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
-static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *
+ceph_inode_to_fs_client(const struct inode *inode)
{
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
-static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *
+ceph_sb_to_fs_client(const struct super_block *sb)
{
return (struct ceph_fs_client *)sb->s_fs_info;
}
-static inline struct ceph_vino ceph_vino(struct inode *inode)
+static inline struct ceph_mds_client *
+ceph_sb_to_mdsc(const struct super_block *sb)
+{
+ return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc;
+}
+
+static inline struct ceph_client *
+ceph_inode_to_client(const struct inode *inode)
+{
+ return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client;
+}
+
+static inline struct ceph_vino
+ceph_vino(const struct inode *inode)
{
return ceph_inode(inode)->i_vino;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * i_ino (kernel inode) st_ino (userspace)
- * i386 32 32
- * x86_64+ino32 64 32
- * x86_64 64 64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
{
u32 ino = vino & 0xffffffff;
ino ^= vino >> 32;
@@ -360,35 +544,18 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
}
/*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
*/
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{
-#if BITS_PER_LONG == 32
- return ceph_ino_to_ino32(vino.ino);
-#else
+ if (sizeof(ino_t) == sizeof(u32))
+ return ceph_ino_to_ino32(vino.ino);
return (ino_t)vino.ino;
-#endif
}
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
- ino = ceph_ino_to_ino32(ino);
- return ino;
-}
-#endif
-
-
/* for printf-style formatting */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
@@ -396,11 +563,34 @@ static inline u64 ceph_ino(struct inode *inode)
{
return ceph_inode(inode)->i_vino.ino;
}
+
static inline u64 ceph_snap(struct inode *inode)
{
return ceph_inode(inode)->i_vino.snap;
}
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32)))
+ return ceph_ino_to_ino32(ino);
+ return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+ return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -409,36 +599,138 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
ci->i_vino.snap == pvino->snap;
}
+/*
+ * The MDS reserves a set of inodes for its own usage. These should never
+ * be accessible by clients, and so the MDS has no reason to ever hand these
+ * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE.
+ *
+ * These come from src/mds/mdstypes.h in the ceph sources.
+ */
+#define CEPH_MAX_MDS 0x100
+#define CEPH_NUM_STRAY 10
+#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS)
+#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS)
+#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
+
+static inline bool ceph_vino_is_reserved(const struct ceph_vino vino)
+{
+ if (vino.ino >= CEPH_INO_SYSTEM_BASE ||
+ vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET)
+ return false;
+
+ /* Don't warn on mdsdirs */
+ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET,
+ "Attempt to access reserved inode number 0x%llx",
+ vino.ino);
+ return true;
+}
+
static inline struct inode *ceph_find_inode(struct super_block *sb,
struct ceph_vino vino)
{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
+ if (ceph_vino_is_reserved(vino))
+ return NULL;
+
+ /*
+ * NB: The hashval will be run through the fs/inode.c hash function
+ * anyway, so there is no need to squash the inode number down to
+ * 32-bits first. Just use low-order bits on arches with 32-bit long.
+ */
+ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
}
/*
* Ceph inode.
*/
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
+#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
+#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
+#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
+#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
+#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */
+#define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT)
+#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
+#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
+#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async
+ creating finishes */
+
+/*
+ * Masks of ceph inode work.
+ */
+#define CEPH_I_WORK_WRITEBACK 0
+#define CEPH_I_WORK_INVALIDATE_PAGES 1
+#define CEPH_I_WORK_VMTRUNCATE 2
+#define CEPH_I_WORK_CHECK_CAPS 3
+#define CEPH_I_WORK_FLUSH_SNAPS 4
+
+/*
+ * We set the ERROR_WRITE bit when we start seeing write errors on an inode
+ * and then clear it when they start succeeding. Note that we do a lockless
+ * check first, and only take the lock if it looks like it needs to be changed.
+ * The write submission code just takes this as a hint, so we're not too
+ * worried if a few slip through in either direction.
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+ if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+ if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
- int release_count)
+ long long release_count,
+ long long ordered_count)
{
- atomic_set(&ci->i_complete_count, release_count);
+ /*
+ * Makes sure operations that setup readdir cache (update page
+ * cache and i_size) are strongly ordered w.r.t. the following
+ * atomic64_set() operations.
+ */
+ smp_mb();
+ atomic64_set(&ci->i_complete_seq[0], release_count);
+ atomic64_set(&ci->i_complete_seq[1], ordered_count);
}
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
- atomic_inc(&ci->i_release_count);
+ atomic64_inc(&ci->i_release_count);
+}
+
+static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
+{
+ atomic64_inc(&ci->i_ordered_count);
}
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{
- return atomic_read(&ci->i_complete_count) ==
- atomic_read(&ci->i_release_count);
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count);
+}
+
+static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
+{
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count) &&
+ atomic64_read(&ci->i_complete_seq[1]) ==
+ atomic64_read(&ci->i_ordered_count);
}
static inline void ceph_dir_clear_complete(struct inode *inode)
@@ -446,11 +738,17 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
__ceph_dir_clear_complete(ceph_inode(inode));
}
-static inline bool ceph_dir_is_complete(struct inode *inode)
+static inline void ceph_dir_clear_ordered(struct inode *inode)
{
- return __ceph_dir_is_complete(ceph_inode(inode));
+ __ceph_dir_clear_ordered(ceph_inode(inode));
}
+static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
+{
+ bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
+ smp_rmb();
+ return ret;
+}
/* find a specific frag @f */
extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
@@ -464,16 +762,11 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
}
-static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
-{
- return ((loff_t)frag << 32) | (loff_t)off;
-}
-
/*
* caps helpers
*/
@@ -484,6 +777,8 @@ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
+ int t);
extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
struct ceph_cap *cap);
@@ -496,12 +791,12 @@ static inline int ceph_caps_issued(struct ceph_inode_info *ci)
return issued;
}
-static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
- int touch)
+static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci,
+ int mask, int touch)
{
int r;
spin_lock(&ci->i_ceph_lock);
- r = __ceph_caps_issued_mask(ci, mask, touch);
+ r = __ceph_caps_issued_mask_metric(ci, mask, touch);
spin_unlock(&ci->i_ceph_lock);
return r;
}
@@ -510,37 +805,38 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
return ci->i_dirty_caps | ci->i_flushing_caps;
}
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf);
-extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
- int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (w & CEPH_CAP_FILE_BUFFER)
- w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
- return w;
+ return ci->i_nr_by_mode[0];
}
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
-extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
extern void ceph_caps_init(struct ceph_mds_client *mdsc);
extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
+ struct ceph_mount_options *fsopt);
+extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
-extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
+extern void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session);
@@ -554,23 +850,79 @@ struct ceph_file_info {
short fmode; /* initialized on open */
short flags; /* CEPH_F_* */
+ spinlock_t rw_contexts_lock;
+ struct list_head rw_contexts;
+
+ u32 filp_gen;
+};
+
+struct ceph_dir_file_info {
+ struct ceph_file_info file_info;
+
/* readdir: position within the dir */
u32 frag;
struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */
- unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
- struct dentry *dentry; /* next dentry (for dcache readdir) */
- int dir_release_count;
+ long long dir_release_count;
+ long long dir_ordered_count;
+ int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */
char *dir_info;
int dir_info_len;
};
+struct ceph_rw_context {
+ struct list_head list;
+ struct task_struct *thread;
+ int caps;
+};
+
+#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \
+ struct ceph_rw_context _name = { \
+ .thread = current, \
+ .caps = _caps, \
+ }
+
+static inline void ceph_add_rw_context(struct ceph_file_info *cf,
+ struct ceph_rw_context *ctx)
+{
+ spin_lock(&cf->rw_contexts_lock);
+ list_add(&ctx->list, &cf->rw_contexts);
+ spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline void ceph_del_rw_context(struct ceph_file_info *cf,
+ struct ceph_rw_context *ctx)
+{
+ spin_lock(&cf->rw_contexts_lock);
+ list_del(&ctx->list);
+ spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline struct ceph_rw_context*
+ceph_find_rw_context(struct ceph_file_info *cf)
+{
+ struct ceph_rw_context *ctx, *found = NULL;
+ spin_lock(&cf->rw_contexts_lock);
+ list_for_each_entry(ctx, &cf->rw_contexts, list) {
+ if (ctx->thread == current) {
+ found = ctx;
+ break;
+ }
+ }
+ spin_unlock(&cf->rw_contexts_lock);
+ return found;
+}
+struct ceph_readdir_cache_control {
+ struct folio *folio;
+ struct dentry **dentries;
+ int index;
+};
/*
* A "snap realm" describes a subset of the file hierarchy sharing
@@ -583,6 +935,7 @@ struct ceph_file_info {
*/
struct ceph_snap_realm {
u64 ino;
+ struct inode *inode;
atomic_t nref;
struct rb_node node;
@@ -603,6 +956,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -632,7 +987,7 @@ static inline int default_congestion_kb(void)
* This allows larger machines to have larger/more transfers.
* Limit the default to 256M
*/
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10);
if (congestion_kb > 256*1024)
congestion_kb = 256*1024;
@@ -640,7 +995,8 @@ static inline int default_congestion_kb(void)
}
-
+/* super.c */
+extern int ceph_force_reconnect(struct super_block *sb);
/* snap.c */
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino);
@@ -649,14 +1005,24 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret);
+void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
+
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+ u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+ struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+void ceph_umount_begin(struct super_block *sb);
+
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
@@ -665,160 +1031,335 @@ extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{
return !list_empty(&ci->i_cap_snaps) &&
- list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
- ci_item)->writing;
+ list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
+ ci_item)->writing;
}
/* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+struct ceph_acl_sec_ctx;
+
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
-extern void ceph_destroy_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
+extern void ceph_free_inode(struct inode *inode);
+
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+ umode_t *mode, struct ceph_acl_sec_ctx *as_ctx);
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx);
extern struct inode *ceph_get_inode(struct super_block *sb,
- struct ceph_vino vino);
+ struct ceph_vino vino,
+ struct inode *newino);
extern struct inode *ceph_get_snapdir(struct inode *parent);
extern int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size);
extern void ceph_fill_file_time(struct inode *inode, int issued,
- u64 time_warp_seq, struct timespec *ctime,
- struct timespec *mtime, struct timespec *atime);
+ u64 time_warp_seq, struct timespec64 *ctime,
+ struct timespec64 *mtime,
+ struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
- struct ceph_mds_request *req,
- struct ceph_mds_session *session);
+ struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_queue_vmtruncate(struct inode *inode);
-extern void ceph_queue_invalidate(struct inode *inode);
-extern void ceph_queue_writeback(struct inode *inode);
+void ceph_queue_inode_work(struct inode *inode, int work_bit);
-extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
-extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
-extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
+static inline void ceph_queue_vmtruncate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
+}
+
+static inline void ceph_queue_invalidate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
+}
+
+static inline void ceph_queue_writeback(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
+}
+
+static inline void ceph_queue_check_caps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS);
+}
+
+static inline void ceph_queue_flush_snaps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
+}
+
+extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
+extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
+ int mask, bool force);
+static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
+{
+ return __ceph_do_getattr(inode, NULL, mask, force);
+}
+extern int ceph_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
+
+struct ceph_iattr {
+ struct ceph_fscrypt_auth *fscrypt_auth;
+};
+
+extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia);
+extern int ceph_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
+void ceph_inode_shutdown(struct inode *inode);
+
+static inline bool ceph_inode_is_shutdown(struct inode *inode)
+{
+ unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ int state = READ_ONCE(fsc->mount_state);
+
+ return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
+}
/* xattr.c */
-extern int ceph_setxattr(struct dentry *, const char *, const void *,
- size_t, int);
-extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern int ceph_removexattr(struct dentry *, const char *);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern void __init ceph_xattr_init(void);
-extern void ceph_xattr_exit(void);
+extern const struct xattr_handler * const ceph_xattr_handlers[];
+
+struct ceph_acl_sec_ctx {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ void *default_acl;
+ void *acl;
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+ struct lsm_context lsmctx;
+#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ struct ceph_fscrypt_auth *fscrypt_auth;
+#endif
+ struct ceph_pagelist *pagelist;
+};
+
+#ifdef CONFIG_SECURITY
+extern bool ceph_security_xattr_deadlock(struct inode *in);
+extern bool ceph_security_xattr_wanted(struct inode *in);
+#else
+static inline bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ return false;
+}
+static inline bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return false;
+}
+#endif
+
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *ctx);
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+ security_inode_invalidate_secctx(inode);
+}
+#else
+static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *ctx)
+{
+ return 0;
+}
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+}
+#endif
+
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
+
+/* acl.c */
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int, bool);
+int ceph_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct posix_acl *acl, int type);
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acl_sec_ctx *as_ctx);
+void ceph_init_inode_acls(struct inode *inode,
+ struct ceph_acl_sec_ctx *as_ctx);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+ forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ return 0;
+}
+static inline void ceph_init_inode_acls(struct inode *inode,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
/* caps.c */
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- spin_lock(&cap->ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- spin_unlock(&cap->ci->i_ceph_lock);
-}
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release);
+extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
-extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
- u64 cap_id, u32 migrate_seq, u32 issue_seq);
-extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
+extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci);
+extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci,
+ int mds);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
-extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+ bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void __ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
+extern void ceph_remove_capsnap(struct inode *inode,
+ struct ceph_cap_snap *capsnap,
+ bool *wake_ci, bool *wake_mdsc);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
+extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
-
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc);
+extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ struct inode *dir,
int mds, int drop, int unless);
-extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff);
+extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi,
+ int need, int want, loff_t endoff, int *got);
+extern int ceph_get_caps(struct file *filp, int need, int want,
+ loff_t endoff, int *got);
+extern int ceph_try_get_caps(struct inode *inode,
+ int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+ struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+extern const struct netfs_request_ops ceph_netfs_ops;
+int ceph_mmap_prepare(struct vm_area_desc *desc);
+extern int ceph_uninline_data(struct file *file);
+extern int ceph_pool_perm_check(struct inode *inode, int need);
+extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+
+static inline bool ceph_has_inline_data(struct ceph_inode_info *ci)
+{
+ if (ci->i_inline_version == CEPH_INLINE_NONE ||
+ ci->i_inline_version == 1) /* initial version, no data */
+ return false;
+ return true;
+}
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern const struct address_space_operations ceph_aops;
+extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags, umode_t mode,
- int *opened);
+ struct file *file, unsigned flags, umode_t mode);
+extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
+ struct iov_iter *to, int *retry_op,
+ u64 *last_objver);
extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
+ char *data, size_t len);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
extern const struct inode_operations ceph_dir_iops;
-extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
- ceph_snapdir_dentry_ops;
+extern const struct inode_operations ceph_snapdir_iops;
+extern const struct dentry_operations ceph_dentry_ops;
+extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
-extern int ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
+extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
+ struct dentry *dentry);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
-extern void ceph_dentry_lru_add(struct dentry *dn);
-extern void ceph_dentry_lru_touch(struct dentry *dn);
-extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di);
+extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern int ceph_trim_dentries(struct ceph_mds_client *mdsc);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
-extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
-
-/*
- * our d_ops vary depending on whether the inode is live,
- * snapshotted (read-only), or a virtual ".snap" directory.
- */
-int ceph_init_dentry(struct dentry *dentry);
-
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/* ioctl.c */
extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* export.c */
extern const struct export_operations ceph_export_ops;
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);
/* locks.c */
+extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
@@ -829,10 +1370,81 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode,
extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks);
-extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
/* debugfs.c */
-extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+/* quota.c */
+
+enum quota_get_realm {
+ QUOTA_GET_MAX_FILES,
+ QUOTA_GET_MAX_BYTES,
+ QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+ enum quota_get_realm which)
+{
+ bool has_quota = false;
+
+ switch (which) {
+ case QUOTA_GET_MAX_BYTES:
+ has_quota = !!ci->i_max_bytes;
+ break;
+ case QUOTA_GET_MAX_FILES:
+ has_quota = !!ci->i_max_files;
+ break;
+ default:
+ has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+ }
+ return has_quota;
+}
+
+extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
+
+static inline void __ceph_update_quota(struct ceph_inode_info *ci,
+ u64 max_bytes, u64 max_files)
+{
+ bool had_quota, has_quota;
+ had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
+ ci->i_max_bytes = max_bytes;
+ ci->i_max_files = max_files;
+ has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
+
+ if (had_quota != has_quota)
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
+}
+
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+ int cnt = 0;
+
+ if (IS_ENCRYPTED(inode)) {
+ cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+ cnt = 0;
+ }
+
+ return cnt;
+}
+
+extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
+extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
+extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
+ loff_t newlen);
+extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
+ struct kstatfs *buf);
+extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
+
+bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc);
+bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc);
+void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc);
#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/util.c b/fs/ceph/util.c
new file mode 100644
index 000000000000..2c34875675bf
--- /dev/null
+++ b/fs/ceph/util.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
+ fl->stripe_count == 0 && fl->object_size == 0)
+ fl->pool_id = -1;
+}
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+
+int ceph_flags_to_mode(int flags)
+{
+ int mode;
+
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+
+ switch (flags & O_ACCMODE) {
+ case O_WRONLY:
+ mode = CEPH_FILE_MODE_WR;
+ break;
+ case O_RDONLY:
+ mode = CEPH_FILE_MODE_RD;
+ break;
+ case O_RDWR:
+ case O_ACCMODE: /* this is what the VFS does */
+ mode = CEPH_FILE_MODE_RDWR;
+ break;
+ }
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+ return mode;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+ int caps = CEPH_CAP_PIN;
+
+ if (mode & CEPH_FILE_MODE_RD)
+ caps |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ if (mode & CEPH_FILE_MODE_WR)
+ caps |= CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ if (mode & CEPH_FILE_MODE_LAZY)
+ caps |= CEPH_CAP_FILE_LAZYIO;
+
+ return caps;
+}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532a..ad1f30bea175 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,4 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/pagelist.h>
#include "super.h"
#include "mds_client.h"
@@ -6,16 +8,20 @@
#include <linux/ceph/decode.h>
#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
#define XATTR_CEPH_PREFIX "ceph."
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr);
+
static bool ceph_is_valid_xattr(const char *name)
{
- return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
- !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
+ return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -27,169 +33,379 @@ static bool ceph_is_valid_xattr(const char *name)
struct ceph_vxattr {
char *name;
size_t name_size; /* strlen(name) + 1 (for '\0') */
- size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
- size_t size);
- bool readonly, hidden;
+ ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+ size_t size);
bool (*exists_cb)(struct ceph_inode_info *ci);
+ unsigned int flags;
};
+#define VXATTR_FLAG_READONLY (1<<0)
+#define VXATTR_FLAG_HIDDEN (1<<1)
+#define VXATTR_FLAG_RSTAT (1<<2)
+#define VXATTR_FLAG_DIRSTAT (1<<3)
+
/* layouts */
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
- size_t s;
- char *p = (char *)&ci->i_layout;
-
- for (s = 0; s < sizeof(ci->i_layout); s++, p++)
- if (*p)
- return true;
- return false;
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
}
-static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- int ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
+ const char *ns_field = " pool_namespace=";
+ char buf[128];
+ size_t len, total_len = 0;
+ ssize_t ret;
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
- down_read(&osdc->map_sem);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+
+ doutc(cl, "%p\n", &ci->netfs.inode);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size,
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- pool_name);
- else
- ret = snprintf(val, size,
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- (unsigned long long)pool);
-
- up_read(&osdc->map_sem);
+ if (pool_name) {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ ret = total_len;
+ if (size >= total_len) {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
+ }
+ }
+ up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
return ret;
}
-static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
- char *val, size_t size)
+/*
+ * The convention with strings in xattrs is that they should not be NULL
+ * terminated, since we're returning the length with them. snprintf always
+ * NULL terminates however, so call it on a temporary buffer and then memcpy
+ * the result into place.
+ */
+static __printf(3, 4)
+int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout));
+ int ret;
+ va_list args;
+ char buf[96]; /* NB: reevaluate size if new vxattrs are added */
+
+ va_start(args, fmt);
+ ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args);
+ va_end(args);
+
+ /* Sanity check */
+ if (size && ret + 1 > sizeof(buf)) {
+ WARN_ONCE(true, "Returned length too big (%d)", ret);
+ return -E2BIG;
+ }
+
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ return ret;
}
-static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit);
}
-static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count);
}
-static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
- char *val, size_t size)
+static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+ char *val, size_t size)
{
- int ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size);
+}
+
+static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ ssize_t ret;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
- down_read(&osdc->map_sem);
+ down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size, "%s", pool_name);
- else
- ret = snprintf(val, size, "%lld", (unsigned long long)pool);
- up_read(&osdc->map_sem);
+ if (pool_name) {
+ ret = strlen(pool_name);
+ if (ret <= size)
+ memcpy(val, pool_name, ret);
+ } else {
+ ret = ceph_fmt_xattr(val, size, "%lld", pool);
+ }
+ up_read(&osdc->lock);
+ return ret;
+}
+
+static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ ssize_t ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+
+ if (ns) {
+ ret = ns->len;
+ if (ret <= size)
+ memcpy(val, ns->str, ret);
+ ceph_put_string(ns);
+ }
return ret;
}
/* directories */
-static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_files);
+}
+
+static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs);
+}
+
+static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld",
+ ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles);
}
-static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_files);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs);
}
-static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,
+static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps);
+}
+
+static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
size_t size)
{
- return snprintf(val, size, "%lld", ci->i_subdirs);
+ return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes);
}
-static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+ return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_rctime);
}
-static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,
- size_t size)
+/* dir pin */
+static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci)
{
- return snprintf(val, size, "%lld", ci->i_rfiles);
+ return ci->i_dir_pin != -ENODATA;
}
-static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%lld", ci->i_rsubdirs);
+ return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin);
}
-static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,
- size_t size)
+/* quotas */
+static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci)
{
- return snprintf(val, size, "%lld", ci->i_rbytes);
+ bool ret = false;
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ ci->i_vino.snap == CEPH_NOSNAP &&
+ ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino)
+ ret = true;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
}
-static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
- size_t size)
+static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val,
+ size_t size)
{
- return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,
- (long)ci->i_rctime.tv_nsec);
+ return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu",
+ ci->i_max_bytes, ci->i_max_files);
}
+static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes);
+}
+
+static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files);
+}
+
+/* snapshots */
+static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci)
+{
+ return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0);
+}
+
+static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_snap_btime);
+}
+
+static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+
+ return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
+}
+
+static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+
+ return ceph_fmt_xattr(val, size, "client%lld",
+ ceph_client_gid(fsc->client));
+}
+
+static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int issued;
+
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ceph_fmt_xattr(val, size, "%s/0x%x",
+ ceph_cap_string(issued), issued);
+}
+
+static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = ceph_fmt_xattr(val, size, "%d",
+ ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1);
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci)
+{
+ return ci->fscrypt_auth_len;
+}
+
+static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ if (size) {
+ if (size < ci->fscrypt_auth_len)
+ return -ERANGE;
+ memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len);
+ }
+ return ci->fscrypt_auth_len;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
-#define XATTR_NAME_CEPH(_type, _name) \
+#define XATTR_NAME_CEPH(_type, _name, _flags) \
{ \
.name = CEPH_XATTR_NAME(_type, _name), \
.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
- .readonly = true, \
- .hidden = false, \
- .exists_cb = NULL, \
+ .exists_cb = NULL, \
+ .flags = (VXATTR_FLAG_READONLY | _flags), \
+ }
+#define XATTR_RSTAT_FIELD(_type, _name) \
+ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
- .readonly = false, \
- .hidden = true, \
.exists_cb = ceph_vxattrcb_layout_exists, \
+ .flags = VXATTR_FLAG_HIDDEN, \
+ }
+#define XATTR_QUOTA_FIELD(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = ceph_vxattrcb_quota_exists, \
+ .flags = VXATTR_FLAG_HIDDEN, \
}
static struct ceph_vxattr ceph_dir_vxattrs[] = {
@@ -197,25 +413,55 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
.name = "ceph.dir.layout",
.name_size = sizeof("ceph.dir.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
- .readonly = false,
- .hidden = false,
.exists_cb = ceph_vxattrcb_layout_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
- XATTR_NAME_CEPH(dir, entries),
- XATTR_NAME_CEPH(dir, files),
- XATTR_NAME_CEPH(dir, subdirs),
- XATTR_NAME_CEPH(dir, rentries),
- XATTR_NAME_CEPH(dir, rfiles),
- XATTR_NAME_CEPH(dir, rsubdirs),
- XATTR_NAME_CEPH(dir, rbytes),
- XATTR_NAME_CEPH(dir, rctime),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
+ XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT),
+ XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT),
+ XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT),
+ XATTR_RSTAT_FIELD(dir, rentries),
+ XATTR_RSTAT_FIELD(dir, rfiles),
+ XATTR_RSTAT_FIELD(dir, rsubdirs),
+ XATTR_RSTAT_FIELD(dir, rsnaps),
+ XATTR_RSTAT_FIELD(dir, rbytes),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
+ {
+ .name = "ceph.dir.pin",
+ .name_size = sizeof("ceph.dir.pin"),
+ .getxattr_cb = ceph_vxattrcb_dir_pin,
+ .exists_cb = ceph_vxattrcb_dir_pin_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ {
+ .name = "ceph.quota",
+ .name_size = sizeof("ceph.quota"),
+ .getxattr_cb = ceph_vxattrcb_quota,
+ .exists_cb = ceph_vxattrcb_quota_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ XATTR_QUOTA_FIELD(quota, max_bytes),
+ XATTR_QUOTA_FIELD(quota, max_files),
+ {
+ .name = "ceph.snap.btime",
+ .name_size = sizeof("ceph.snap.btime"),
+ .getxattr_cb = ceph_vxattrcb_snap_btime,
+ .exists_cb = ceph_vxattrcb_snap_btime_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.caps",
+ .name_size = sizeof("ceph.caps"),
+ .getxattr_cb = ceph_vxattrcb_caps,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
{ .name = NULL, 0 } /* Required table terminator */
};
-static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */
@@ -224,17 +470,64 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
.name = "ceph.file.layout",
.name_size = sizeof("ceph.file.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
- .readonly = false,
- .hidden = false,
.exists_cb = ceph_vxattrcb_layout_exists,
+ .flags = VXATTR_FLAG_HIDDEN,
},
XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
+ {
+ .name = "ceph.snap.btime",
+ .name_size = sizeof("ceph.snap.btime"),
+ .getxattr_cb = ceph_vxattrcb_snap_btime,
+ .exists_cb = ceph_vxattrcb_snap_btime_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.caps",
+ .name_size = sizeof("ceph.caps"),
+ .getxattr_cb = ceph_vxattrcb_caps,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_HIDDEN,
+ },
+ { .name = NULL, 0 } /* Required table terminator */
+};
+
+static struct ceph_vxattr ceph_common_vxattrs[] = {
+ {
+ .name = "ceph.cluster_fsid",
+ .name_size = sizeof("ceph.cluster_fsid"),
+ .getxattr_cb = ceph_vxattrcb_cluster_fsid,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.client_id",
+ .name_size = sizeof("ceph.client_id"),
+ .getxattr_cb = ceph_vxattrcb_client_id,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+ {
+ .name = "ceph.auth_mds",
+ .name_size = sizeof("ceph.auth_mds"),
+ .getxattr_cb = ceph_vxattrcb_auth_mds,
+ .exists_cb = NULL,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ {
+ .name = "ceph.fscrypt.auth",
+ .name_size = sizeof("ceph.fscrypt.auth"),
+ .getxattr_cb = ceph_vxattrcb_fscrypt_auth,
+ .exists_cb = ceph_vxattrcb_fscrypt_auth_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+#endif /* CONFIG_FS_ENCRYPTION */
{ .name = NULL, 0 } /* Required table terminator */
};
-static size_t ceph_file_vxattrs_name_size; /* total size of all names */
static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
{
@@ -245,47 +538,6 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
return NULL;
}
-static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- if (vxattrs == ceph_dir_vxattrs)
- return ceph_dir_vxattrs_name_size;
- if (vxattrs == ceph_file_vxattrs)
- return ceph_file_vxattrs_name_size;
- BUG();
-
- return 0;
-}
-
-/*
- * Compute the aggregate size (including terminating '\0') of all
- * virtual extended attribute names in the given vxattr table.
- */
-static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
-{
- struct ceph_vxattr *vxattr;
- size_t size = 0;
-
- for (vxattr = vxattrs; vxattr->name; vxattr++)
- if (!vxattr->hidden)
- size += vxattr->name_size;
-
- return size;
-}
-
-/* Routines called at initialization and exit time */
-
-void __init ceph_xattr_init(void)
-{
- ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs);
- ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs);
-}
-
-void ceph_xattr_exit(void)
-{
- ceph_dir_vxattrs_name_size = 0;
- ceph_file_vxattrs_name_size = 0;
-}
-
static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
const char *name)
{
@@ -299,16 +551,26 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
}
}
+ vxattr = ceph_common_vxattrs;
+ while (vxattr->name) {
+ if (!strcmp(vxattr->name, name))
+ return vxattr;
+ vxattr++;
+ }
+
return NULL;
}
+#define MAX_XATTR_VAL_PRINT_LEN 256
+
static int __set_xattr(struct ceph_inode_info *ci,
const char *name, int name_len,
const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
+ int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -335,23 +597,45 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr = NULL;
}
+ if (update_xattr) {
+ int err = 0;
+
+ if (xattr && (flags & XATTR_CREATE))
+ err = -EEXIST;
+ else if (!xattr && (flags & XATTR_REPLACE))
+ err = -ENODATA;
+ if (err) {
+ kfree(name);
+ kfree(val);
+ kfree(*newxattr);
+ return err;
+ }
+ if (update_xattr < 0) {
+ if (xattr)
+ __remove_xattr(ci, xattr);
+ kfree(name);
+ kfree(*newxattr);
+ return 0;
+ }
+ }
+
if (!xattr) {
new = 1;
xattr = *newxattr;
xattr->name = name;
xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
+ xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
- dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
} else {
kfree(*newxattr);
*newxattr = NULL;
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
- if (should_free_name) {
- kfree((void *)name);
+ if (update_xattr) {
+ kfree(name);
name = xattr->name;
}
ci->i_xattrs.names_size -= xattr->name_len;
@@ -365,17 +649,19 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->val = "";
xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
+ xattr->dirty = update_xattr;
+ xattr->should_free_val = (val && update_xattr);
if (new) {
rb_link_node(&xattr->node, parent, p);
rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("__set_xattr_val p=%p\n", p);
+ doutc(cl, "p=%p\n", p);
}
- dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+ doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode,
+ ceph_vinop(inode), xattr, name_len, name, min(val_len,
+ MAX_XATTR_VAL_PRINT_LEN), val,
+ val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
return 0;
}
@@ -383,6 +669,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
const char *name)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -401,13 +688,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
else if (c > 0)
p = &(*p)->rb_right;
else {
- dout("__get_xattr %s: found %.*s\n", name,
- xattr->val_len, xattr->val);
+ int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
+
+ doutc(cl, "%s found %.*s%s\n", name, len, xattr->val,
+ xattr->val_len > len ? "..." : "");
return xattr;
}
}
- dout("__get_xattr %s: not found\n", name);
+ doutc(cl, "%s not found\n", name);
return NULL;
}
@@ -417,9 +706,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr)
BUG_ON(!xattr);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
kfree(xattr);
}
@@ -428,14 +717,14 @@ static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr)
{
if (!xattr)
- return -EOPNOTSUPP;
+ return -ENODATA;
rb_erase(&xattr->node, &ci->i_xattrs.index);
if (xattr->should_free_name)
- kfree((void *)xattr->name);
+ kfree(xattr->name);
if (xattr->should_free_val)
- kfree((void *)xattr->val);
+ kfree(xattr->val);
ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len;
@@ -445,35 +734,23 @@ static int __remove_xattr(struct ceph_inode_info *ci,
return 0;
}
-static int __remove_xattr_by_name(struct ceph_inode_info *ci,
- const char *name)
-{
- struct rb_node **p;
- struct ceph_inode_xattr *xattr;
- int err;
-
- p = &ci->i_xattrs.index.rb_node;
- xattr = __get_xattr(ci, name);
- err = __remove_xattr(ci, xattr);
- return err;
-}
-
static char *__copy_xattr_names(struct ceph_inode_info *ci,
char *dest)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
memcpy(dest, xattr->name, xattr->name_len);
dest[xattr->name_len] = '\0';
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
+ doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
dest += xattr->name_len + 1;
p = rb_next(p);
@@ -484,19 +761,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci,
void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p, *tmp;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__ceph_destroy_xattrs p=%p\n", p);
+ doutc(cl, "p=%p\n", p);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
tmp = p;
p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
+ doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name);
rb_erase(tmp, &ci->i_xattrs.index);
__free_xattr(xattr);
@@ -513,19 +790,20 @@ static int __build_xattrs(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u32 namelen;
u32 numattr = 0;
void *p, *end;
u32 len;
const char *name, *val;
struct ceph_inode_info *ci = ceph_inode(inode);
- int xattr_version;
+ u64 xattr_version;
struct ceph_inode_xattr **xattrs = NULL;
int err = 0;
int i;
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+ doutc(cl, "len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
return 0; /* already built */
@@ -541,12 +819,12 @@ start:
xattr_version = ci->i_xattrs.version;
spin_unlock(&ci->i_ceph_lock);
- xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+ xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
GFP_NOFS);
err = -ENOMEM;
if (!xattrs)
goto bad_lock;
- memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+
for (i = 0; i < numattr; i++) {
xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
GFP_NOFS);
@@ -574,7 +852,7 @@ start:
p += len;
err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
+ 0, 0, &xattrs[numattr]);
if (err < 0)
goto bad;
@@ -600,6 +878,8 @@ bad:
static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int val_size)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
+
/*
* 4 bytes for the length, and additional 4 bytes per each xattr name,
* 4 bytes per each value
@@ -607,9 +887,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int size = 4 + ci->i_xattrs.count*(4 + 4) +
ci->i_xattrs.names_size +
ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
+ doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count,
+ ci->i_xattrs.names_size, ci->i_xattrs.vals_size);
if (name_size)
size += 4 + 4 + name_size + val_size;
@@ -618,16 +897,21 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
}
/*
- * If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
+ * If there are dirty xattrs, re-encode xattrs into the prealloc_blob
+ * and swap into place. It returns the old i_xattrs.blob (or NULL) so
+ * that it can be freed by the caller as the i_ceph_lock is likely to be
+ * held.
*/
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
+ struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -655,56 +939,103 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = NULL;
ci->i_xattrs.dirty = false;
ci->i_xattrs.version++;
}
+
+ return old_blob;
+}
+
+static inline int __get_request_mask(struct inode *in) {
+ struct ceph_mds_request *req = current->journal_info;
+ int mask = 0;
+ if (req && req->r_target_inode == in) {
+ if (req->r_op == CEPH_MDS_OP_LOOKUP ||
+ req->r_op == CEPH_MDS_OP_LOOKUPINO ||
+ req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
+ req->r_op == CEPH_MDS_OP_GETATTR) {
+ mask = le32_to_cpu(req->r_args.getattr.mask);
+ } else if (req->r_op == CEPH_MDS_OP_OPEN ||
+ req->r_op == CEPH_MDS_OP_CREATE) {
+ mask = le32_to_cpu(req->r_args.open.mask);
+ }
+ }
+ return mask;
}
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
-
- if (!ceph_is_valid_xattr(name))
- return -ENODATA;
+ struct ceph_vxattr *vxattr;
+ int req_mask;
+ ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
- err = vxattr->getxattr_cb(ci, value, size);
+ if (vxattr) {
+ int mask = 0;
+ if (vxattr->flags & VXATTR_FLAG_RSTAT)
+ mask |= CEPH_STAT_RSTAT;
+ if (vxattr->flags & VXATTR_FLAG_DIRSTAT)
+ mask |= CEPH_CAP_FILE_SHARED;
+ err = ceph_do_getattr(inode, mask, true);
+ if (err)
+ return err;
+ err = -ENODATA;
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ if (size && size < err)
+ err = -ERANGE;
+ }
+ return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
return err;
}
+handle_non_vxattrs:
+ req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), name, ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto get_xattr;
- } else {
+ if (ci->i_xattrs.version == 0 ||
+ !((req_mask & CEPH_CAP_XATTR_SHARED) ||
+ __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
+
+ /* security module gets xattr while filling trace */
+ if (current->journal_info) {
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
+ return -EBUSY;
+ }
+
/* get xattrs from mds (if we don't already have them) */
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
return err;
+ spin_lock(&ci->i_ceph_lock);
}
- spin_lock(&ci->i_ceph_lock);
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
-get_xattr:
err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name);
if (!xattr)
@@ -720,6 +1051,10 @@ get_xattr:
memcpy(value, xattr->val, xattr->val_len);
+ if (current->journal_info &&
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+ security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
+ ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out:
spin_unlock(&ci->i_ceph_lock);
return err;
@@ -727,167 +1062,147 @@ out:
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
- u32 vir_namelen = 0;
+ bool len_only = (size == 0);
u32 namelen;
int err;
- u32 len;
- int i;
spin_lock(&ci->i_ceph_lock);
- dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
- goto list_xattr;
- } else {
+ if (ci->i_xattrs.version == 0 ||
+ !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) {
spin_unlock(&ci->i_ceph_lock);
- err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
return err;
+ spin_lock(&ci->i_ceph_lock);
}
- spin_lock(&ci->i_ceph_lock);
-
err = __build_xattrs(inode);
if (err < 0)
goto out;
-list_xattr:
- /*
- * Start with virtual dir xattr names (if any) (including
- * terminating '\0' characters for each).
- */
- vir_namelen = ceph_vxattrs_name_size(vxattrs);
-
- /* adding 1 byte per each variable due to the null termination */
+ /* add 1 byte for each xattr due to the null termination */
namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
- err = -ERANGE;
- if (size && vir_namelen + namelen > size)
- goto out;
-
- err = namelen + vir_namelen;
- if (size == 0)
- goto out;
-
- names = __copy_xattr_names(ci, names);
-
- /* virtual xattr names, too */
- err = namelen;
- if (vxattrs) {
- for (i = 0; vxattrs[i].name; i++) {
- if (!vxattrs[i].hidden &&
- !(vxattrs[i].exists_cb &&
- !vxattrs[i].exists_cb(ci))) {
- len = sprintf(names, "%s", vxattrs[i].name);
- names += len + 1;
- err += len + 1;
- }
+ if (!len_only) {
+ if (namelen > size) {
+ err = -ERANGE;
+ goto out;
}
+ names = __copy_xattr_names(ci, names);
+ size -= namelen;
}
-
+ err = namelen;
out:
spin_unlock(&ci->i_ceph_lock);
return err;
}
-static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+static int ceph_sync_setxattr(struct inode *inode, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *inode = dentry->d_inode;
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
+ struct ceph_pagelist *pagelist = NULL;
+ int op = CEPH_MDS_OP_SETXATTR;
int err;
- int i, nr_pages;
- struct page **pages = NULL;
- void *kaddr;
-
- /* copy value into some pages */
- nr_pages = calc_pages_for(0, size);
- if (nr_pages) {
- pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
- if (!pages)
+
+ if (size > 0) {
+ /* copy value into pagelist */
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
+ if (!pagelist)
return -ENOMEM;
- err = -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = __page_cache_alloc(GFP_NOFS);
- if (!pages[i]) {
- nr_pages = i;
- goto out;
- }
- kaddr = kmap(pages[i]);
- memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
- min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
- }
+
+ err = ceph_pagelist_append(pagelist, value, size);
+ if (err)
+ goto out;
+ } else if (!value) {
+ if (flags & CEPH_XATTR_REPLACE)
+ op = CEPH_MDS_OP_RMXATTR;
+ else
+ flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value=%.*s\n", (int)size, value);
+ doutc(cl, "name %s value size %zu\n", name, size);
/* do request */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
- USE_AUTH_MDS);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
}
+
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (op == CEPH_MDS_OP_SETXATTR) {
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_args.setxattr.osdmap_epoch =
+ cpu_to_le32(osdc->osdmap->epoch);
+ req->r_pagelist = pagelist;
+ pagelist = NULL;
+ }
+
req->r_inode = inode;
ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
req->r_num_caps = 1;
- req->r_args.setxattr.flags = cpu_to_le32(flags);
- req->r_path2 = kstrdup(name, GFP_NOFS);
-
- req->r_pages = pages;
- req->r_num_pages = nr_pages;
- req->r_data_len = size;
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
- dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version);
out:
- if (pages) {
- for (i = 0; i < nr_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- }
+ if (pagelist)
+ ceph_pagelist_release(pagelist);
return err;
}
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int __ceph_setxattr(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf = NULL;
+ struct ceph_buffer *old_blob = NULL;
int issued;
int err;
- int dirty;
+ int dirty = 0;
int name_len = strlen(name);
int val_len = size;
char *newname = NULL;
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
+ bool check_realm = false;
+ bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
- if (!ceph_is_valid_xattr(name))
- return -EOPNOTSUPP;
-
vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
+ if (vxattr) {
+ if (vxattr->flags & VXATTR_FLAG_READONLY)
+ return -EOPNOTSUPP;
+ if (value && !strncmp(vxattr->name, "ceph.quota", 10))
+ check_realm = true;
+ }
/* pass any unhandled ceph.* xattrs through to the MDS */
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
@@ -909,147 +1224,245 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr)
goto out;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ goto out;
+
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
- if (!(issued & CEPH_CAP_XATTR_EXCL))
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+ (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+ doutc(cl, "sync version: %llu size: %d max: %llu\n",
+ ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
- __build_xattrs(inode);
+ }
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if (!lock_snap_rwsem && !ci->i_head_snapc) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ goto retry;
+ }
+ }
+
+ doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode,
+ ceph_vinop(inode), name, ceph_cap_string(issued));
+ __build_xattrs(inode);
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;
spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
+ ceph_buffer_put(old_blob); /* Shouldn't be required */
+ doutc(cl, " pre-allocating new blob size=%d\n",
+ required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
- goto out;
+ goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
+ /* prealloc_blob can't be released while holding i_ceph_lock */
if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ old_blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = blob;
goto retry;
}
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
+ err = __set_xattr(ci, newname, name_len, newval, val_len,
+ flags, value ? 1 : -1, &xattr);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ if (!err) {
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+ &prealloc_cf);
+ ci->i_xattrs.dirty = true;
+ inode_set_ctime_current(inode);
+ }
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ ceph_free_cap_flush(prealloc_cf);
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+
+ /* security module set xattr while filling trace */
+ if (current->journal_info) {
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
+ err = -EBUSY;
+ } else {
+ err = ceph_sync_setxattr(inode, name, value, size, flags);
+ if (err >= 0 && check_realm) {
+ /* check if snaprealm was created for quota inode */
+ spin_lock(&ci->i_ceph_lock);
+ if ((ci->i_max_files || ci->i_max_bytes) &&
+ !(ci->i_snap_realm &&
+ ci->i_snap_realm->ino == ci->i_vino.ino))
+ err = -EOPNOTSUPP;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ }
out:
+ ceph_free_cap_flush(prealloc_cf);
kfree(newname);
kfree(newval);
kfree(xattr);
return err;
}
-static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+static int ceph_get_xattr_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
- struct inode *parent_inode;
- struct ceph_mds_request *req;
- int err;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
- USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- req->r_num_caps = 1;
- req->r_path2 = kstrdup(name, GFP_NOFS);
-
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
- ceph_mdsc_put_request(req);
- return err;
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+ return __ceph_getxattr(inode, name, value, size);
}
-int ceph_removexattr(struct dentry *dentry, const char *name)
+static int ceph_set_xattr_handler(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
- struct ceph_vxattr *vxattr;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int issued;
- int err;
- int required_blob_size;
- int dirty;
-
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
+ return __ceph_setxattr(inode, name, value, size, flags);
+}
- vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && vxattr->readonly)
- return -EOPNOTSUPP;
+static const struct xattr_handler ceph_other_xattr_handler = {
+ .prefix = "", /* match any name => handlers called with full name */
+ .get = ceph_get_xattr_handler,
+ .set = ceph_set_xattr_handler,
+};
- /* pass any unhandled ceph.* xattrs through to the MDS */
- if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
- goto do_sync_unlocked;
+#ifdef CONFIG_SECURITY
+bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return in->i_security != NULL;
+}
- err = -ENOMEM;
+bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ struct ceph_inode_info *ci;
+ bool ret;
+ if (!in->i_security)
+ return false;
+ ci = ceph_inode(in);
spin_lock(&ci->i_ceph_lock);
-retry:
- issued = __ceph_caps_issued(ci, NULL);
- dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
- if (!(issued & CEPH_CAP_XATTR_EXCL))
- goto do_sync;
- __build_xattrs(inode);
+ ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
+ !(ci->i_xattrs.version > 0 &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
- required_blob_size = __get_required_blob_size(ci, 0, 0);
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ struct ceph_pagelist *pagelist = as_ctx->pagelist;
+ const char *name;
+ size_t name_len;
+ int err;
- if (!ci->i_xattrs.prealloc_blob ||
- required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
- struct ceph_buffer *blob;
+ err = security_dentry_init_security(dentry, mode, &dentry->d_name,
+ &name, &as_ctx->lsmctx);
+ if (err < 0) {
+ WARN_ON_ONCE(err != -EOPNOTSUPP);
+ err = 0; /* do nothing */
+ goto out;
+ }
- spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
- blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
- if (!blob)
+ err = -ENOMEM;
+ if (!pagelist) {
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist)
goto out;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- ci->i_xattrs.prealloc_blob = blob;
- goto retry;
+ err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+ if (err)
+ goto out;
+ ceph_pagelist_encode_32(pagelist, 1);
}
- err = __remove_xattr_by_name(ceph_inode(inode), name);
+ /*
+ * FIXME: Make security_dentry_init_security() generic. Currently
+ * It only supports single security module and only selinux has
+ * dentry_init_security hook.
+ */
+ name_len = strlen(name);
+ err = ceph_pagelist_reserve(pagelist,
+ 4 * 2 + name_len + as_ctx->lsmctx.len);
+ if (err)
+ goto out;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
- spin_unlock(&ci->i_ceph_lock);
- if (dirty)
- __mark_inode_dirty(inode, dirty);
- return err;
-do_sync:
- spin_unlock(&ci->i_ceph_lock);
-do_sync_unlocked:
- err = ceph_send_removexattr(dentry, name);
+ if (as_ctx->pagelist) {
+ /* update count of KV pairs */
+ BUG_ON(pagelist->length <= sizeof(__le32));
+ if (list_is_singular(&pagelist->head)) {
+ le32_add_cpu((__le32*)pagelist->mapped_tail, 1);
+ } else {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ void *addr = kmap_atomic(page);
+ le32_add_cpu((__le32*)addr, 1);
+ kunmap_atomic(addr);
+ }
+ } else {
+ as_ctx->pagelist = pagelist;
+ }
+
+ ceph_pagelist_encode_32(pagelist, name_len);
+ ceph_pagelist_append(pagelist, name, name_len);
+
+ ceph_pagelist_encode_32(pagelist, as_ctx->lsmctx.len);
+ ceph_pagelist_append(pagelist, as_ctx->lsmctx.context,
+ as_ctx->lsmctx.len);
+
+ err = 0;
out:
+ if (pagelist && !as_ctx->pagelist)
+ ceph_pagelist_release(pagelist);
return err;
}
+#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
+#endif /* CONFIG_SECURITY */
+
+void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
+{
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ posix_acl_release(as_ctx->acl);
+ posix_acl_release(as_ctx->default_acl);
+#endif
+#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
+ security_release_secctx(&as_ctx->lsmctx);
+#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(as_ctx->fscrypt_auth);
+#endif
+ if (as_ctx->pagelist)
+ ceph_pagelist_release(as_ctx->pagelist);
+}
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler * const ceph_xattr_handlers[] = {
+ &ceph_other_xattr_handler,
+ NULL,
+};