Diffstat (limited to 'fs/ceph')
-rw-r--r--   fs/ceph/Kconfig         38
-rw-r--r--   fs/ceph/Makefile         8
-rw-r--r--   fs/ceph/acl.c          265
-rw-r--r--   fs/ceph/addr.c        3125
-rw-r--r--   fs/ceph/cache.c        112
-rw-r--r--   fs/ceph/cache.h        117
-rw-r--r--   fs/ceph/caps.c        4760
-rw-r--r--   fs/ceph/ceph_frag.c      1
-rw-r--r--   fs/ceph/crypto.c       604
-rw-r--r--   fs/ceph/crypto.h       272
-rw-r--r--   fs/ceph/debugfs.c      391
-rw-r--r--   fs/ceph/dir.c         2254
-rw-r--r--   fs/ceph/export.c       667
-rw-r--r--   fs/ceph/file.c        3217
-rw-r--r--   fs/ceph/inode.c       3071
-rw-r--r--   fs/ceph/io.c           213
-rw-r--r--   fs/ceph/io.h            14
-rw-r--r--   fs/ceph/ioctl.c        234
-rw-r--r--   fs/ceph/ioctl.h          1
-rw-r--r--   fs/ceph/locks.c        507
-rw-r--r--   fs/ceph/mds_client.c  5592
-rw-r--r--   fs/ceph/mds_client.h   424
-rw-r--r--   fs/ceph/mdsmap.c       395
-rw-r--r--   fs/ceph/mdsmap.h        79
-rw-r--r--   fs/ceph/metric.c       362
-rw-r--r--   fs/ceph/metric.h       244
-rw-r--r--   fs/ceph/quota.c        547
-rw-r--r--   fs/ceph/snap.c         853
-rw-r--r--   fs/ceph/strings.c        9
-rw-r--r--   fs/ceph/super.c       1605
-rw-r--r--   fs/ceph/super.h       1074
-rw-r--r--   fs/ceph/util.c         100
-rw-r--r--   fs/ceph/xattr.c       1209
33 files changed, 24909 insertions(+), 7455 deletions(-)
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 49bc78243db9..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,10 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
 config CEPH_FS
 	tristate "Ceph distributed file system"
 	depends on INET
 	select CEPH_LIB
-	select LIBCRC32C
+	select CRC32
 	select CRYPTO_AES
 	select CRYPTO
+	select NETFS_SUPPORT
+	select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
 	default n
 	help
 	  Choose Y or M here to include support for mounting the
@@ -12,7 +15,38 @@ config CEPH_FS
 	  scalable file system designed to provide high performance,
 	  reliable access to petabytes of storage.
 
-	  More information at http://ceph.newdream.net/.
+	  More information at https://ceph.io/.
 
 	  If unsure, say N.
 
+if CEPH_FS
+config CEPH_FSCACHE
+	bool "Enable Ceph client caching support"
+	depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for Ceph clients using FS-Cache
+
+endif
+
+config CEPH_FS_POSIX_ACL
+	bool "Ceph POSIX Access Control Lists"
+	depends on CEPH_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  If you don't know what Access Control Lists are, say N
+
+config CEPH_FS_SECURITY_LABEL
+	bool "CephFS Security Labels"
+	depends on CEPH_FS && SECURITY
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the Ceph filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
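The new Kconfig symbols gate whole objects in the Makefile below (acl.o, cache.o, crypto.o), so the rest of the filesystem needs compile-time fallbacks when an option is off. A minimal sketch of the usual kernel stub pattern, assuming hypothetical declarations (the real fallbacks live in fs/ceph/super.h, not shown in this diff):

	/* Sketch only: how CONFIG_CEPH_FS_POSIX_ACL-gated code is typically
	 * stubbed out; identifiers here mirror the pattern, not exact headers. */
	#ifdef CONFIG_CEPH_FS_POSIX_ACL
	struct posix_acl *ceph_get_acl(struct inode *, int, bool);
	int ceph_set_acl(struct mnt_idmap *, struct dentry *,
			 struct posix_acl *, int);
	#else
	#define ceph_get_acl NULL	/* VFS skips ACL lookups entirely */
	#define ceph_set_acl NULL
	#endif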
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bd352125e829..1f77ca04c426 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Makefile for CEPH filesystem.
 #
@@ -5,7 +6,10 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
-	export.o caps.o snap.o xattr.o \
+	export.o caps.o snap.o xattr.o quota.o io.o \
 	mds_client.o mdsmap.o strings.o ceph_frag.o \
-	debugfs.o
+	debugfs.o util.o metric.o
+ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 000000000000..1564eacc253d
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+					int type, struct posix_acl *acl)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&ci->i_ceph_lock);
+	if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0))
+		set_cached_acl(inode, type, acl);
+	else
+		forget_cached_acl(inode, type);
+	spin_unlock(&ci->i_ceph_lock);
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
+{
+	struct ceph_client *cl = ceph_inode_to_client(inode);
+	int size;
+	unsigned int retry_cnt = 0;
+	const char *name;
+	char *value = NULL;
+	struct posix_acl *acl;
+
+	if (rcu)
+		return ERR_PTR(-ECHILD);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+
+retry:
+	size = __ceph_getxattr(inode, name, "", 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = __ceph_getxattr(inode, name, value, size);
+	}
+
+	if (size == -ERANGE && retry_cnt < 10) {
+		retry_cnt++;
+		kfree(value);
+		value = NULL;
+		goto retry;
+	}
+
+	if (size > 0) {
+		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+	} else if (size == -ENODATA || size == 0) {
+		acl = NULL;
+	} else {
+		pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
+					  ceph_vinop(inode), size);
+		acl = ERR_PTR(-EIO);
+	}
+
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		ceph_set_cached_acl(inode, type, acl);
+
+	return acl;
+}
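ceph_get_acl() above uses the common two-step xattr read: probe the size with a zero-length buffer, allocate, then fetch, retrying on -ERANGE in case the attribute grew between the two calls. A minimal sketch of that pattern in isolation; generic_getxattr() is a hypothetical stand-in for __ceph_getxattr():

	/* Sketch, not kernel code: size-probe-then-fetch with ERANGE retry. */
	static char *read_xattr_value(struct inode *inode, const char *name,
				      int *out_len)
	{
		char *buf = NULL;
		int len, tries = 0;

		do {
			kfree(buf);			/* kfree(NULL) is a no-op */
			buf = NULL;
			len = generic_getxattr(inode, name, NULL, 0); /* probe size */
			if (len <= 0)
				break;
			buf = kzalloc(len, GFP_NOFS);
			if (!buf)
				return ERR_PTR(-ENOMEM);
			len = generic_getxattr(inode, name, buf, len); /* fetch */
		} while (len == -ERANGE && ++tries < 10);	/* value grew; retry */

		*out_len = len;
		return buf;
	}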
+
+int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+		 struct posix_acl *acl, int type)
+{
+	int ret = 0, size = 0;
+	const char *name = NULL;
+	char *value = NULL;
+	struct iattr newattrs;
+	struct inode *inode = d_inode(dentry);
+	struct timespec64 old_ctime = inode_get_ctime(inode);
+	umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
+		if (acl) {
+			ret = posix_acl_update_mode(idmap, inode,
+						    &new_mode, &acl);
+			if (ret)
+				goto out;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode)) {
+			ret = acl ? -EINVAL : 0;
+			goto out;
+		}
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+		if (ret < 0)
+			goto out_free;
+	}
+
+	if (new_mode != old_mode) {
+		newattrs.ia_ctime = current_time(inode);
+		newattrs.ia_mode = new_mode;
+		newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+		ret = __ceph_setattr(idmap, inode, &newattrs, NULL);
+		if (ret)
+			goto out_free;
+	}
+
+	ret = __ceph_setxattr(inode, name, value, size, 0);
+	if (ret) {
+		if (new_mode != old_mode) {
+			newattrs.ia_ctime = old_ctime;
+			newattrs.ia_mode = old_mode;
+			newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+			__ceph_setattr(idmap, inode, &newattrs, NULL);
+		}
+		goto out_free;
+	}
+
+	ceph_set_cached_acl(inode, type, acl);
+
+out_free:
+	kfree(value);
+out:
+	return ret;
+}
+
+int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
+		       struct ceph_acl_sec_ctx *as_ctx)
+{
+	struct posix_acl *acl, *default_acl;
+	size_t val_size1 = 0, val_size2 = 0;
+	struct ceph_pagelist *pagelist = NULL;
+	void *tmp_buf = NULL;
+	int err;
+
+	err = posix_acl_create(dir, mode, &default_acl, &acl);
+	if (err)
+		return err;
+
+	if (acl) {
+		err = posix_acl_equiv_mode(acl, mode);
+		if (err < 0)
+			goto out_err;
+		if (err == 0) {
+			posix_acl_release(acl);
+			acl = NULL;
+		}
+	}
+
+	if (!default_acl && !acl)
+		return 0;
+
+	if (acl)
+		val_size1 = posix_acl_xattr_size(acl->a_count);
+	if (default_acl)
+		val_size2 = posix_acl_xattr_size(default_acl->a_count);
+
+	err = -ENOMEM;
+	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
+	if (!tmp_buf)
+		goto out_err;
+	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+	if (!pagelist)
+		goto out_err;
+
+	err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
+	if (err)
+		goto out_err;
+
+	ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
+
+	if (acl) {
+		size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
+		err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
+		if (err)
+			goto out_err;
+		ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
+					    len);
+		err = posix_acl_to_xattr(&init_user_ns, acl,
+					 tmp_buf, val_size1);
+		if (err < 0)
+			goto out_err;
+		ceph_pagelist_encode_32(pagelist, val_size1);
+		ceph_pagelist_append(pagelist, tmp_buf, val_size1);
+	}
+	if (default_acl) {
+		size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
+		err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
+		if (err)
+			goto out_err;
+		ceph_pagelist_encode_string(pagelist,
+					    XATTR_NAME_POSIX_ACL_DEFAULT, len);
+		err = posix_acl_to_xattr(&init_user_ns, default_acl,
+					 tmp_buf, val_size2);
+		if (err < 0)
+			goto out_err;
+		ceph_pagelist_encode_32(pagelist, val_size2);
+		ceph_pagelist_append(pagelist, tmp_buf, val_size2);
+	}
+
+	kfree(tmp_buf);
+
+	as_ctx->acl = acl;
+	as_ctx->default_acl = default_acl;
+	as_ctx->pagelist = pagelist;
+	return 0;
+
+out_err:
+	posix_acl_release(acl);
+	posix_acl_release(default_acl);
+	kfree(tmp_buf);
+	if (pagelist)
+		ceph_pagelist_release(pagelist);
+	return err;
+}
+
+void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx)
+{
+	if (!inode)
+		return;
+	ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, as_ctx->acl);
+	ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, as_ctx->default_acl);
+}
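ceph_pre_init_acls() packs the new inode's ACL xattrs into a pagelist that rides along on the MDS create request: a 32-bit count followed by (name, value-length, value) tuples, all little-endian. A hedged sketch of the same framing over a plain flat buffer, assuming the pagelist helpers encode exactly this layout:

	/* Sketch: one xattr entry, as ceph_pagelist_encode_string() +
	 * encode_32() + append() would lay it out in a flat buffer. */
	static size_t encode_xattr(u8 *p, const char *name,
				   const void *val, u32 vlen)
	{
		u32 nlen = strlen(name);
		u8 *start = p;

		put_unaligned_le32(nlen, p);	p += 4;		/* name length */
		memcpy(p, name, nlen);		p += nlen;	/* name bytes */
		put_unaligned_le32(vlen, p);	p += 4;		/* value length */
		memcpy(p, val, vlen);		p += vlen;	/* value bytes */
		return p - start;
	}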
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5318a3b704f6..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,17 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
 
 #include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/pagemap.h>
-#include <linux/writeback.h>	/* generic_writepages */
 #include <linux/slab.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/signal.h>
+#include <linux/iversion.h>
+#include <linux/ktime.h>
+#include <linux/netfs.h>
+#include <trace/events/netfs.h>
 
 #include "super.h"
 #include "mds_client.h"
+#include "cache.h"
+#include "metric.h"
+#include "crypto.h"
 #include <linux/ceph/osd_client.h>
+#include <linux/ceph/striper.h>
 
 /*
  * Ceph address space ops.
@@ -54,6 +64,9 @@
 	(CONGESTION_ON_THRESH(congestion_kb) -				\
 	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
+static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
+					struct folio **foliop, void **_fsdata);
+
 static inline struct ceph_snap_context *page_snap_context(struct page *page)
 {
 	if (PagePrivate(page))
@@ -65,1198 +78,2524 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
 /*
  * Dirty a page.  Optimistically adjust accounting, on the assumption
  * that we won't race with invalidate.  If we do, readjust.
  */
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode;
+	struct inode *inode = mapping->host;
+	struct ceph_client *cl = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_inode_info *ci;
-	int undo = 0;
 	struct ceph_snap_context *snapc;
 
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	if (TestSetPageDirty(page)) {
-		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
-		     mapping->host, page, page->index);
-		return 0;
+	if (folio_test_dirty(folio)) {
+		doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
+		      ceph_vinop(inode), folio, folio->index);
+		VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
+		return false;
 	}
 
-	inode = mapping->host;
-	ci = ceph_inode(inode);
+	atomic64_inc(&mdsc->dirty_folios);
 
-	/*
-	 * Note that we're grabbing a snapc ref here without holding
-	 * any locks!
-	 */
-	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+	ci = ceph_inode(inode);
 
 	/* dirty the head */
 	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_head_snapc == NULL)
-		ci->i_head_snapc = ceph_get_snap_context(snapc);
-	++ci->i_wrbuffer_ref_head;
+	if (__ceph_have_pending_cap_snap(ci)) {
+		struct ceph_cap_snap *capsnap =
+				list_last_entry(&ci->i_cap_snaps,
+						struct ceph_cap_snap,
+						ci_item);
+		snapc = ceph_get_snap_context(capsnap->context);
+		capsnap->dirty_pages++;
+	} else {
+		BUG_ON(!ci->i_head_snapc);
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
+		++ci->i_wrbuffer_ref_head;
+	}
 	if (ci->i_wrbuffer_ref == 0)
 		ihold(inode);
 	++ci->i_wrbuffer_ref;
-	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
-	     "snapc %p seq %lld (%d snaps)\n",
-	     mapping->host, page, page->index,
-	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
-	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
-	     snapc, snapc->seq, snapc->num_snaps);
+	doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
+	      "snapc %p seq %lld (%d snaps)\n",
+	      ceph_vinop(inode), folio, folio->index,
+	      ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+	      ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+	      snapc, snapc->seq, snapc->num_snaps);
 	spin_unlock(&ci->i_ceph_lock);
 
-	/* now adjust page */
-	spin_lock_irq(&mapping->tree_lock);
-	if (page->mapping) {	/* Race with truncate? */
-		WARN_ON_ONCE(!PageUptodate(page));
-		account_page_dirtied(page, page->mapping);
-		radix_tree_tag_set(&mapping->page_tree,
-				page_index(page), PAGECACHE_TAG_DIRTY);
+	/*
+	 * Reference snap context in folio->private.  Also set
+	 * PagePrivate so that we get invalidate_folio callback.
+	 */
+	VM_WARN_ON_FOLIO(folio->private, folio);
+	folio_attach_private(folio, snapc);
+
+	return ceph_fscache_dirty_folio(mapping, folio);
+}
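ceph_dirty_folio() pins the snap context that was current when the folio went dirty by stashing a counted reference in folio->private; invalidate_folio and writeback completion later detach and drop it exactly once. A hedged sketch of that ownership handoff with generic names (my_ctx is hypothetical, not a ceph type):

	/* Sketch: attach a refcounted context when a folio goes dirty,
	 * detach and drop it exactly once when the folio is cleaned. */
	struct my_ctx { refcount_t ref; };

	static void mark_dirty(struct folio *folio, struct my_ctx *ctx)
	{
		refcount_inc(&ctx->ref);		/* folio now owns a ref */
		folio_attach_private(folio, ctx);	/* sets the private flag */
	}

	static void clean_up(struct folio *folio)
	{
		struct my_ctx *ctx = folio_detach_private(folio); /* clears flag */

		if (ctx && refcount_dec_and_test(&ctx->ref))
			kfree(ctx);
	}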
 
-		/*
-		 * Reference snap context in page->private.  Also set
-		 * PagePrivate so that we get invalidatepage callback.
-		 */
-		page->private = (unsigned long)snapc;
-		SetPagePrivate(page);
-	} else {
-		dout("ANON set_page_dirty %p (raced truncate?)\n", page);
-		undo = 1;
+/*
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately.  Only called if there is private
+ * data on the folio.
+ */
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+				size_t length)
+{
+	struct inode *inode = folio->mapping->host;
+	struct ceph_client *cl = ceph_inode_to_client(inode);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc;
+
+
+	if (offset != 0 || length != folio_size(folio)) {
+		doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
+		      ceph_vinop(inode), folio->index, offset, length);
+		return;
 	}
-	spin_unlock_irq(&mapping->tree_lock);
 
+	WARN_ON(!folio_test_locked(folio));
+	if (folio_test_private(folio)) {
+		doutc(cl, "%llx.%llx idx %lu full dirty page\n",
+		      ceph_vinop(inode), folio->index);
 
-	if (undo)
-		/* whoops, we failed to dirty the page */
+		snapc = folio_detach_private(folio);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+		ceph_put_snap_context(snapc);
+	}
 
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	BUG_ON(!PageDirty(page));
-	return 1;
+	netfs_invalidate_folio(folio, offset, length);
 }
 
-/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately.  Only called if there is private
- * data on the page.
- */
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
-			unsigned int length)
+static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
 {
-	struct inode *inode;
-	struct ceph_inode_info *ci;
-	struct ceph_snap_context *snapc = page_snap_context(page);
+	struct inode *inode = rreq->inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_layout *lo = &ci->i_layout;
+	unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
+	loff_t end = rreq->start + rreq->len, new_end;
+	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
+	unsigned long max_len;
+	u32 blockoff;
+
+	if (priv) {
+		/* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
+		if (priv->file_ra_disabled)
+			max_pages = 0;
+		else
+			max_pages = priv->file_ra_pages;
 
-	BUG_ON(!PageLocked(page));
-	BUG_ON(!PagePrivate(page));
-	BUG_ON(!page->mapping);
+	}
+
+	/* Readahead is disabled */
+	if (!max_pages)
+		return;
 
-	inode = page->mapping->host;
+	max_len = max_pages << PAGE_SHIFT;
 
 	/*
-	 * We can get non-dirty pages here due to races between
-	 * set_page_dirty and truncate_complete_page; just spit out a
-	 * warning, in case we end up with accounting problems later.
+	 * Try to expand the length forward by rounding up it to the next
+	 * block, but do not exceed the file size, unless the original
+	 * request already exceeds it.
 	 */
-	if (!PageDirty(page))
-		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
-
-	if (offset == 0 && length == PAGE_CACHE_SIZE)
-		ClearPageChecked(page);
-
-	ci = ceph_inode(inode);
-	if (offset == 0 && length == PAGE_CACHE_SIZE) {
-		dout("%p invalidatepage %p idx %lu full dirty page\n",
-		     inode, page, page->index);
-		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-		ceph_put_snap_context(snapc);
-		page->private = 0;
-		ClearPagePrivate(page);
-	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
-		     inode, page, page->index, offset, length);
+	new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
+	if (new_end > end && new_end <= rreq->start + max_len)
+		rreq->len = new_end - rreq->start;
+
+	/* Try to expand the start downward */
+	div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+	if (rreq->len + blockoff <= max_len) {
+		rreq->start -= blockoff;
+		rreq->len += blockoff;
 	}
 }
 
-/* just a sanity check */
-static int ceph_releasepage(struct page *page, gfp_t g)
+static void finish_netfs_read(struct ceph_osd_request *req)
 {
-	struct inode *inode = page->mapping ? page->mapping->host : NULL;
-	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
-	WARN_ON(PageDirty(page));
-	WARN_ON(PagePrivate(page));
-	return 0;
+	struct inode *inode = req->r_inode;
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = fsc->client;
+	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+	struct netfs_io_subrequest *subreq = req->r_priv;
+	struct ceph_osd_req_op *op = &req->r_ops[0];
+	int err = req->r_result;
+	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+
+	ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
+				 req->r_end_latency, osd_data->length, err);
+
+	doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
+	      subreq->len, i_size_read(req->r_inode));
+
+	/* no object means success but no data */
+	if (err == -ENOENT) {
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
+		err = 0;
+	} else if (err == -EBLOCKLISTED) {
+		fsc->blocklisted = true;
+	}
+
+	if (err >= 0) {
+		if (sparse && err > 0)
+			err = ceph_sparse_ext_map_end(op);
+		if (err < subreq->len &&
+		    subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+		    subreq->rreq->origin != NETFS_DIO_READ)
+			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+		if (IS_ENCRYPTED(inode) && err > 0) {
+			err = ceph_fscrypt_decrypt_extents(inode,
+					osd_data->pages, subreq->start,
+					op->extent.sparse_ext,
+					op->extent.sparse_ext_cnt);
+			if (err > subreq->len)
+				err = subreq->len;
+		}
+		if (err > 0)
+			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	}
+
+	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+		ceph_put_page_vector(osd_data->pages,
+				     calc_pages_for(osd_data->alignment,
+					osd_data->length), false);
+	}
+	if (err > 0) {
+		subreq->transferred = err;
+		err = 0;
+	}
+	subreq->error = err;
+	trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
+	netfs_read_subreq_terminated(subreq);
+	iput(req->r_inode);
+	ceph_dec_osd_stopping_blocker(fsc->mdsc);
 }
 
-/*
- * read a single page, without unlocking it.
- */
-static int readpage_nounlock(struct file *filp, struct page *page)
+static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
 {
-	struct inode *inode = file_inode(filp);
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct inode *inode = rreq->inode;
+	struct ceph_mds_reply_info_parsed *rinfo;
+	struct ceph_mds_reply_info_in *iinfo;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc =
-		&ceph_inode_to_client(inode)->client->osdc;
-	int err = 0;
-	u64 len = PAGE_CACHE_SIZE;
-
-	dout("readpage inode %p file %p page %p index %lu\n",
-	     inode, filp, page, page->index);
-	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				  (u64) page_offset(page), &len,
-				  ci->i_truncate_seq, ci->i_truncate_size,
-				  &page, 1, 0);
-	if (err == -ENOENT)
-		err = 0;
-	if (err < 0) {
-		SetPageError(page);
+	ssize_t err = 0;
+	size_t len;
+	int mode;
+
+	if (rreq->origin != NETFS_UNBUFFERED_READ &&
+	    rreq->origin != NETFS_DIO_READ)
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+
+	if (subreq->start >= inode->i_size)
+		goto out;
+
+	/* We need to fetch the inline data. */
+	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_ino1 = ci->i_vino;
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+	req->r_num_caps = 2;
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (err < 0)
 		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
-		/* zero fill remainder of page */
-		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+
+	rinfo = &req->r_reply_info;
+	iinfo = &rinfo->targeti;
+	if (iinfo->inline_version == CEPH_INLINE_NONE) {
+		/* The data got uninlined */
+		ceph_mdsc_put_request(req);
+		return false;
+	}
+
+	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
+	if (err == 0) {
+		err = -EFAULT;
+	} else {
+		subreq->transferred += err;
+		err = 0;
 	}
-	SetPageUptodate(page);
 
+	ceph_mdsc_put_request(req);
 out:
-	return err < 0 ? err : 0;
+	subreq->error = err;
+	trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
+	netfs_read_subreq_terminated(subreq);
+	return true;
 }
 
-static int ceph_readpage(struct file *filp, struct page *page)
+static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
 {
-	int r = readpage_nounlock(filp, page);
-	unlock_page(page);
-	return r;
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct inode *inode = rreq->inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	u64 objno, objoff;
+	u32 xlen;
+
+	/* Truncate the extent at the end of the current block */
+	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+				      &objno, &objoff, &xlen);
+	rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
+	return 0;
 }
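ceph_calc_file_object_mapping() turns a file offset into (object number, offset within the object, usable length) from the layout's stripe geometry; prepare_read uses the length to clip each subrequest at an object boundary. For the common default layout (stripe_count == 1, stripe_unit == object_size) the math degenerates to plain division. A hedged sketch of that simple case only:

	/* Sketch: file offset -> object mapping when the layout is not
	 * actually striped (stripe_count == 1, stripe_unit == object_size). */
	static void simple_object_mapping(u64 off, u64 len, u64 object_size,
					  u64 *objno, u64 *objoff, u32 *xlen)
	{
		*objno = div64_u64(off, object_size);	/* which object */
		*objoff = off - *objno * object_size;	/* offset inside it */
		/* usable length is clipped at the object boundary */
		*xlen = (u32)min_t(u64, len, object_size - *objoff);
	}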
 
-/*
- * Finish an async read(ahead) op.
- */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 {
-	struct inode *inode = req->r_inode;
-	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
-	int num_pages;
-	int i;
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct inode *inode = rreq->inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = fsc->client;
+	struct ceph_osd_request *req = NULL;
+	struct ceph_vino vino = ceph_vino(inode);
+	int err;
+	u64 len;
+	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+	u64 off = subreq->start;
+	int extent_cnt;
 
-	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+	if (ceph_inode_is_shutdown(inode)) {
+		err = -EIO;
+		goto out;
+	}
 
-	/* unlock all pages, zeroing any data we didn't read */
-	osd_data = osd_req_op_extent_osd_data(req, 0);
-	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-	num_pages = calc_pages_for((u64)osd_data->alignment,
-					(u64)osd_data->length);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = osd_data->pages[i];
-
-		if (bytes < (int)PAGE_CACHE_SIZE) {
-			/* zero (remainder of) page */
-			int s = bytes < 0 ? 0 : bytes;
-			zero_user_segment(page, s, PAGE_CACHE_SIZE);
+	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
+		return;
+
+	// TODO: This rounding here is slightly dodgy.  It *should* work, for
+	// now, as the cache only deals in blocks that are a multiple of
+	// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE.  What needs to
+	// happen is for the fscrypt driving to be moved into netfslib and the
+	// data in the cache also to be stored encrypted.
+	len = subreq->len;
+	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
+			off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
+			CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+			ci->i_truncate_size, false);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		req = NULL;
+		goto out;
+	}
+
+	if (sparse) {
+		extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+		err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
+		if (err)
+			goto out;
+	}
+
+	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+	      ceph_vinop(inode), subreq->start, subreq->len, len);
+
+	/*
+	 * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
+	 * encrypted inodes. We'd need infrastructure that handles an iov_iter
+	 * instead of page arrays, and we don't have that as of yet. Once the
+	 * dust settles on the write helpers and encrypt/decrypt routines for
+	 * netfs, we should be able to rework this.
+	 */
+	if (IS_ENCRYPTED(inode)) {
+		struct page **pages;
+		size_t page_off;
+
+		/*
+		 * FIXME: io_iter.count needs to be corrected to aligned
+		 * length. Otherwise, iov_iter_get_pages_alloc2() operates
+		 * with the initial unaligned length value. As a result,
+		 * ceph_msg_data_cursor_init() triggers BUG_ON() in the case
+		 * if msg->sparse_read_total > msg->data_length.
+		 */
+		subreq->io_iter.count = len;
+
+		err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
+		if (err < 0) {
+			doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
+			      ceph_vinop(inode), err);
+			goto out;
 		}
-		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
-		     page->index);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-		unlock_page(page);
-		page_cache_release(page);
-		bytes -= PAGE_CACHE_SIZE;
+
+		/* should always give us a page-aligned read */
+		WARN_ON_ONCE(page_off);
+		len = err;
+		err = 0;
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
+						 false);
+	} else {
+		osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
+	}
+	if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+		err = -EIO;
+		goto out;
+	}
+	req->r_callback = finish_netfs_read;
+	req->r_priv = subreq;
+	req->r_inode = inode;
+	ihold(inode);
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+	ceph_osdc_start_request(req->r_osdc, req);
+out:
+	ceph_osdc_put_request(req);
+	if (err) {
+		subreq->error = err;
+		netfs_read_subreq_terminated(subreq);
 	}
-	kfree(osd_data->pages);
+	doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
 }
 
-static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
 {
-	int i;
+	struct inode *inode = rreq->inode;
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = ceph_inode_to_client(inode);
+	int got = 0, want = CEPH_CAP_FILE_CACHE;
+	struct ceph_netfs_request_data *priv;
+	int ret = 0;
 
-	for (i = 0; i < num_pages; i++)
-		unlock_page(pages[i]);
-}
+	/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+	__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
 
-/*
- * start an async read(ahead) operation.  return nr_pages we submitted
- * a read for on success, or negative error code.
- */
-static int start_read(struct inode *inode, struct list_head *page_list, int max)
-{
-	struct ceph_osd_client *osdc =
-		&ceph_inode_to_client(inode)->client->osdc;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct page *page = list_entry(page_list->prev, struct page, lru);
-	struct ceph_vino vino;
-	struct ceph_osd_request *req;
-	u64 off;
-	u64 len;
-	int i;
-	struct page **pages;
-	pgoff_t next_index;
-	int nr_pages = 0;
-	int ret;
+	if (rreq->origin != NETFS_READAHEAD)
+		return 0;
 
-	off = (u64) page_offset(page);
+	priv = kzalloc(sizeof(*priv), GFP_NOFS);
+	if (!priv)
+		return -ENOMEM;
 
-	/* count pages */
-	next_index = page->index;
-	list_for_each_entry_reverse(page, page_list, lru) {
-		if (page->index != next_index)
-			break;
-		nr_pages++;
-		next_index++;
-		if (max && nr_pages == max)
-			break;
+	if (file) {
+		struct ceph_rw_context *rw_ctx;
+		struct ceph_file_info *fi = file->private_data;
+
+		priv->file_ra_pages = file->f_ra.ra_pages;
+		priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;
+
+		rw_ctx = ceph_find_rw_context(fi);
+		if (rw_ctx) {
+			rreq->netfs_priv = priv;
+			return 0;
+		}
 	}
-	len = nr_pages << PAGE_CACHE_SHIFT;
-	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
-	     off, len);
-	vino = ceph_vino(inode);
-	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
-				    1, CEPH_OSD_OP_READ,
-				    CEPH_OSD_FLAG_READ, NULL,
-				    ci->i_truncate_seq, ci->i_truncate_size,
-				    false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
 
-	/* build page vector */
-	nr_pages = calc_pages_for(0, len);
-	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
-	ret = -ENOMEM;
-	if (!pages)
+	/*
+	 * readahead callers do not necessarily hold Fcb caps
+	 * (e.g. fadvise, madvise).
+	 */
+	ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
+	if (ret < 0) {
+		doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
 		goto out;
-	for (i = 0; i < nr_pages; ++i) {
-		page = list_entry(page_list->prev, struct page, lru);
-		BUG_ON(PageLocked(page));
-		list_del(&page->lru);
-
-		dout("start_read %p adding %p idx %lu\n", inode, page,
-		     page->index);
-		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-					  GFP_NOFS)) {
-			page_cache_release(page);
-			dout("start_read %p add_to_page_cache failed %p\n",
-			     inode, page);
-			nr_pages = i;
-			goto out_pages;
-		}
-		pages[i] = page;
 	}
-	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
-	req->r_callback = finish_read;
-	req->r_inode = inode;
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+	if (!(got & want)) {
+		doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
+		ret = -EACCES;
+		goto out;
+	}
+	if (ret == 0) {
+		ret = -EACCES;
+		goto out;
+	}
 
-	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
-	ret = ceph_osdc_start_request(osdc, req, false);
-	if (ret < 0)
-		goto out_pages;
-	ceph_osdc_put_request(req);
-	return nr_pages;
+	priv->caps = got;
+	rreq->netfs_priv = priv;
+	rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;
 
-out_pages:
-	ceph_unlock_page_vector(pages, nr_pages);
-	ceph_release_page_vector(pages, nr_pages);
 out:
-	ceph_osdc_put_request(req);
+	if (ret < 0) {
+		if (got)
+			ceph_put_cap_refs(ceph_inode(inode), got);
+		kfree(priv);
+	}
 	return ret;
 }
 
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
+{
+	struct ceph_netfs_request_data *priv = rreq->netfs_priv;
 
-/*
- * Read multiple pages.  Leave pages we don't read + unlock in page_list;
- * the caller (VM) cleans them up.
- */
-static int ceph_readpages(struct file *file, struct address_space *mapping,
-			  struct list_head *page_list, unsigned nr_pages)
+	if (!priv)
+		return;
+
+	if (priv->caps)
+		ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps);
+	kfree(priv);
+	rreq->netfs_priv = NULL;
+}
+
+const struct netfs_request_ops ceph_netfs_ops = {
+	.init_request		= ceph_init_request,
+	.free_request		= ceph_netfs_free_request,
+	.prepare_read		= ceph_netfs_prepare_read,
+	.issue_read		= ceph_netfs_issue_read,
+	.expand_readahead	= ceph_netfs_expand_readahead,
+	.check_write_begin	= ceph_netfs_check_write_begin,
+};
+
+#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
 {
-	struct inode *inode = file_inode(file);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	int rc = 0;
-	int max = 0;
-
-	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
-		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
-			>> PAGE_SHIFT;
-
-	dout("readpages %p file %p nr_pages %d max %d\n", inode,
-	     file, nr_pages,
-	     max);
-	while (!list_empty(page_list)) {
-		rc = start_read(inode, page_list, max);
-		if (rc < 0)
-			goto out;
-		BUG_ON(rc == 0);
-	}
-out:
-	dout("readpages %p file %p ret %d\n", inode, file, rc);
-	return rc;
+	folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
 }
 
+static void ceph_fscache_write_terminated(void *priv, ssize_t error)
+{
+	struct inode *inode = priv;
+
+	if (IS_ERR_VALUE(error) && error != -ENOBUFS)
+		ceph_fscache_invalidate(inode, false);
+}
+
+static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
+
+	fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
+			       ceph_fscache_write_terminated, inode, true, caching);
+}
+#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
+
+static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
+{
+}
+#endif /* CONFIG_CEPH_FSCACHE */
+
+struct ceph_writeback_ctl
+{
+	loff_t i_size;
+	u64 truncate_size;
+	u32 truncate_seq;
+	bool size_stable;
+
+	bool head_snapc;
+	struct ceph_snap_context *snapc;
+	struct ceph_snap_context *last_snapc;
+
+	bool done;
+	bool should_loop;
+	bool range_whole;
+	pgoff_t start_index;
+	pgoff_t index;
+	pgoff_t end;
+	xa_mark_t tag;
+
+	pgoff_t strip_unit_end;
+	unsigned int wsize;
+	unsigned int nr_folios;
+	unsigned int max_pages;
+	unsigned int locked_pages;
+
+	int op_idx;
+	int num_ops;
+	u64 offset;
+	u64 len;
+
+	struct folio_batch fbatch;
+	unsigned int processed_in_fbatch;
+
+	bool from_pool;
+	struct page **pages;
+	struct page **data_pages;
+};
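The writeback paths below throttle dirty-page production with the CONGESTION_ON_THRESH/CONGESTION_OFF_THRESH macros defined near the top of addr.c: "on" is congestion_kb expressed in pages and "off" is 3/4 of that, giving hysteresis so fsc->write_congested does not flap around a single threshold. A hedged sketch of the same latch with a hypothetical wb_state in place of ceph_fs_client:

	struct wb_state {			/* stand-in for ceph_fs_client */
		atomic_long_t writeback_count;
		bool write_congested;
	};

	static void wb_page_started(struct wb_state *s, long congestion_kb)
	{
		long on = congestion_kb >> (PAGE_SHIFT - 10);	/* kb -> pages */

		if (atomic_long_inc_return(&s->writeback_count) > on)
			s->write_congested = true;
	}

	static void wb_page_done(struct wb_state *s, long congestion_kb)
	{
		long on = congestion_kb >> (PAGE_SHIFT - 10);
		long off = on - (on >> 2);		/* 75%: hysteresis gap */

		if (atomic_long_dec_return(&s->writeback_count) < off)
			s->write_congested = false;
	}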
 
 /*
  * Get ref for the oldest snapc for an inode with dirty data... that is, the
  * only snap context we are allowed to write back.
  */
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						    u64 *snap_size)
+static struct ceph_snap_context *
+get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
+		   struct ceph_snap_context *page_snapc)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *cl = ceph_inode_to_client(inode);
 	struct ceph_snap_context *snapc = NULL;
 	struct ceph_cap_snap *capsnap = NULL;
 
 	spin_lock(&ci->i_ceph_lock);
 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
-		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
-		     capsnap->context, capsnap->dirty_pages);
-		if (capsnap->dirty_pages) {
-			snapc = ceph_get_snap_context(capsnap->context);
-			if (snap_size)
-				*snap_size = capsnap->size;
-			break;
+		doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
+		      capsnap, capsnap->context, capsnap->dirty_pages);
+		if (!capsnap->dirty_pages)
+			continue;
+
+		/* get i_size, truncate_{seq,size} for page_snapc? */
+		if (snapc && capsnap->context != page_snapc)
+			continue;
+
+		if (ctl) {
+			if (capsnap->writing) {
+				ctl->i_size = i_size_read(inode);
+				ctl->size_stable = false;
+			} else {
+				ctl->i_size = capsnap->size;
+				ctl->size_stable = true;
+			}
+			ctl->truncate_size = capsnap->truncate_size;
+			ctl->truncate_seq = capsnap->truncate_seq;
+			ctl->head_snapc = false;
 		}
+
+		if (snapc)
+			break;
+
+		snapc = ceph_get_snap_context(capsnap->context);
+		if (!page_snapc ||
+		    page_snapc == snapc ||
+		    page_snapc->seq > snapc->seq)
+			break;
 	}
 	if (!snapc && ci->i_wrbuffer_ref_head) {
 		snapc = ceph_get_snap_context(ci->i_head_snapc);
-		dout(" head snapc %p has %d dirty pages\n",
-		     snapc, ci->i_wrbuffer_ref_head);
+		doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
+		      ci->i_wrbuffer_ref_head);
+		if (ctl) {
+			ctl->i_size = i_size_read(inode);
+			ctl->truncate_size = ci->i_truncate_size;
+			ctl->truncate_seq = ci->i_truncate_seq;
+			ctl->size_stable = false;
+			ctl->head_snapc = true;
+		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	return snapc;
 }
 
+static u64 get_writepages_data_length(struct inode *inode,
+				      struct page *page, u64 start)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc;
+	struct ceph_cap_snap *capsnap = NULL;
+	u64 end = i_size_read(inode);
+	u64 ret;
+
+	snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
+	if (snapc != ci->i_head_snapc) {
+		bool found = false;
+		spin_lock(&ci->i_ceph_lock);
+		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->context == snapc) {
+				if (!capsnap->writing)
+					end = capsnap->size;
+				found = true;
+				break;
+			}
+		}
+		spin_unlock(&ci->i_ceph_lock);
+		WARN_ON(!found);
+	}
+	if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
+		end = ceph_fscrypt_page_offset(page) + thp_size(page);
+	ret = end > start ? end - start : 0;
+	if (ret && fscrypt_is_bounce_page(page))
+		ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
+	return ret;
+}
+
 /*
- * Write a single page, but leave the page locked.
+ * Write a folio, but leave it locked.
  *
- * If we get a write error, set the page error bit, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
+ * If we get a write error, mark the mapping for error, but still adjust the
+ * dirty page accounting (i.e., folio is no longer dirty).
  */
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+static int write_folio_nounlock(struct folio *folio,
+		struct writeback_control *wbc)
 {
-	struct inode *inode;
-	struct ceph_inode_info *ci;
-	struct ceph_fs_client *fsc;
-	struct ceph_osd_client *osdc;
+	struct page *page = &folio->page;
+	struct inode *inode = folio->mapping->host;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = fsc->client;
 	struct ceph_snap_context *snapc, *oldest;
-	loff_t page_off = page_offset(page);
-	long writeback_stat;
-	u64 truncate_size, snap_size = 0;
-	u32 truncate_seq;
-	int err = 0, len = PAGE_CACHE_SIZE;
+	loff_t page_off = folio_pos(folio);
+	int err;
+	loff_t len = folio_size(folio);
+	loff_t wlen;
+	struct ceph_writeback_ctl ceph_wbc;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	struct ceph_osd_request *req;
+	bool caching = ceph_is_cache_enabled(inode);
+	struct page *bounce_page = NULL;
 
-	dout("writepage %p idx %lu\n", page, page->index);
+	doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
+	      folio->index);
 
-	if (!page->mapping || !page->mapping->host) {
-		dout("writepage %p - no mapping\n", page);
-		return -EFAULT;
-	}
-	inode = page->mapping->host;
-	ci = ceph_inode(inode);
-	fsc = ceph_inode_to_client(inode);
-	osdc = &fsc->client->osdc;
+	if (ceph_inode_is_shutdown(inode))
+		return -EIO;
 
 	/* verify this is a writeable snap context */
-	snapc = page_snap_context(page);
-	if (snapc == NULL) {
-		dout("writepage %p page %p not dirty?\n", inode, page);
-		goto out;
+	snapc = page_snap_context(&folio->page);
+	if (!snapc) {
+		doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
+		      folio);
+		return 0;
 	}
-	oldest = get_oldest_context(inode, &snap_size);
+	oldest = get_oldest_context(inode, &ceph_wbc, snapc);
 	if (snapc->seq > oldest->seq) {
-		dout("writepage %p page %p snapc %p not writeable - noop\n",
-		     inode, page, snapc);
+		doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
+		      ceph_vinop(inode), folio, snapc);
 		/* we should only noop if called by kswapd */
-		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		WARN_ON(!(current->flags & PF_MEMALLOC));
 		ceph_put_snap_context(oldest);
-		goto out;
+		folio_redirty_for_writepage(wbc, folio);
+		return 0;
 	}
 	ceph_put_snap_context(oldest);
 
-	spin_lock(&ci->i_ceph_lock);
-	truncate_seq = ci->i_truncate_seq;
-	truncate_size = ci->i_truncate_size;
-	if (!snap_size)
-		snap_size = i_size_read(inode);
-	spin_unlock(&ci->i_ceph_lock);
-
 	/* is this a partial page at end of file? */
-	if (page_off >= snap_size) {
-		dout("%p page eof %llu\n", page, snap_size);
-		goto out;
+	if (page_off >= ceph_wbc.i_size) {
+		doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
+		      ceph_vinop(inode), folio->index, ceph_wbc.i_size);
+		folio_invalidate(folio, 0, folio_size(folio));
+		return 0;
 	}
-	if (snap_size < page_off + len)
-		len = snap_size - page_off;
 
-	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
-	     inode, page, page->index, page_off, len, snapc);
+	if (ceph_wbc.i_size < page_off + len)
+		len = ceph_wbc.i_size - page_off;
+
+	wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
+	doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
+	      ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
+	      snapc->seq);
 
-	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
-	if (writeback_stat >
+	if (atomic_long_inc_return(&fsc->writeback_count) >
 	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
-		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
-
-	set_page_writeback(page);
-	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
-				   &ci->i_layout, snapc,
-				   page_off, len,
-				   truncate_seq, truncate_size,
-				   &inode->i_mtime, &page, 1);
+		fsc->write_congested = true;
+
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+				    page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
+				    CEPH_OSD_FLAG_WRITE, snapc,
+				    ceph_wbc.truncate_seq,
+				    ceph_wbc.truncate_size, true);
+	if (IS_ERR(req)) {
+		folio_redirty_for_writepage(wbc, folio);
+		return PTR_ERR(req);
+	}
+
+	if (wlen < len)
+		len = wlen;
+
+	folio_start_writeback(folio);
+	if (caching)
+		ceph_set_page_fscache(&folio->page);
+	ceph_fscache_write_to_cache(inode, page_off, len, caching);
+
+	if (IS_ENCRYPTED(inode)) {
+		bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
+						    CEPH_FSCRYPT_BLOCK_SIZE, 0,
+						    GFP_NOFS);
+		if (IS_ERR(bounce_page)) {
+			folio_redirty_for_writepage(wbc, folio);
+			folio_end_writeback(folio);
+			ceph_osdc_put_request(req);
+			return PTR_ERR(bounce_page);
+		}
+	}
+
+	/* it may be a short write due to an object boundary */
+	WARN_ON_ONCE(len > folio_size(folio));
+	osd_req_op_extent_osd_data_pages(req, 0,
+			bounce_page ? &bounce_page : &page, wlen, 0,
+			false, false);
+	doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
+	      ceph_vinop(inode), page_off, len, wlen,
+	      IS_ENCRYPTED(inode) ? "" : "not ");
+
+	req->r_mtime = inode_get_mtime(inode);
+	ceph_osdc_start_request(osdc, req);
+	err = ceph_osdc_wait_request(osdc, req);
+
+	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+				  req->r_end_latency, len, err);
+	fscrypt_free_bounce_page(bounce_page);
+	ceph_osdc_put_request(req);
+	if (err == 0)
+		err = len;
+
 	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
-		SetPageError(page);
+		struct writeback_control tmp_wbc;
+		if (!wbc)
+			wbc = &tmp_wbc;
+		if (err == -ERESTARTSYS) {
+			/* killed by SIGKILL */
+			doutc(cl, "%llx.%llx interrupted page %p\n",
+			      ceph_vinop(inode), folio);
+			folio_redirty_for_writepage(wbc, folio);
+			folio_end_writeback(folio);
+			return err;
+		}
+		if (err == -EBLOCKLISTED)
+			fsc->blocklisted = true;
+		doutc(cl, "%llx.%llx setting mapping error %d %p\n",
+		      ceph_vinop(inode), err, folio);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
+		wbc->pages_skipped++;
 	} else {
-		dout("writepage cleaned page %p\n", page);
+		doutc(cl, "%llx.%llx cleaned page %p\n",
+		      ceph_vinop(inode), folio);
 		err = 0;  /* vfs expects us to return 0 */
 	}
-	page->private = 0;
-	ClearPagePrivate(page);
-	end_page_writeback(page);
+	oldest = folio_detach_private(folio);
+	WARN_ON_ONCE(oldest != snapc);
+	folio_end_writeback(folio);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 	ceph_put_snap_context(snapc);  /* page's reference */
-out:
-	return err;
-}
-
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
-	int err;
-	struct inode *inode = page->mapping->host;
-	BUG_ON(!inode);
-	ihold(inode);
-	err = writepage_nounlock(page, wbc);
-	unlock_page(page);
-	iput(inode);
-	return err;
-}
-
release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; + if (atomic_long_dec_return(&fsc->writeback_count) < + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + fsc->write_congested = false; - pagevec_init(&pvec, 0); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); + return err; } - /* * async writeback completion handler. * * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; - unsigned wrote; struct page *page; - int num_pages; - int i; + int num_pages, total_pages = 0; + int i, j; + int rc = req->r_result; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - int rc = req->r_result; - u64 bytes = req->r_ops[0].extent.length; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - long writeback_stat; - unsigned issued = ceph_caps_issued(ci); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + unsigned int len = 0; + bool remove_page; - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - if (rc >= 0) { - /* - * Assume we wrote the pages we originally sent. The - * osd might reply with fewer pages if our writeback - * raced with a truncation and was adjusted at the osd, - * so don't believe the reply. 
-		 */
-		wrote = num_pages;
-	} else {
-		wrote = 0;
+	doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
+	if (rc < 0) {
 		mapping_set_error(mapping, rc);
+		ceph_set_error_write(ci);
+		if (rc == -EBLOCKLISTED)
+			fsc->blocklisted = true;
+	} else {
+		ceph_clear_error_write(ci);
 	}
-	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
-	     inode, rc, bytes, wrote);
+
+	/*
+	 * We lost the cache cap, need to truncate the page before
+	 * it is unlocked, otherwise we'd truncate it later in the
+	 * page truncation thread, possibly losing some data that
+	 * raced its way in
+	 */
+	remove_page = !(ceph_caps_issued(ci) &
+			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
 
 	/* clean all pages */
-	for (i = 0; i < num_pages; i++) {
-		page = osd_data->pages[i];
-		BUG_ON(!page);
-		WARN_ON(!PageUptodate(page));
-
-		writeback_stat =
-			atomic_long_dec_return(&fsc->writeback_count);
-		if (writeback_stat <
-		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
-			clear_bdi_congested(&fsc->backing_dev_info,
-					    BLK_RW_ASYNC);
-
-		ceph_put_snap_context(page_snap_context(page));
-		page->private = 0;
-		ClearPagePrivate(page);
-		dout("unlocking %d %p\n", i, page);
-		end_page_writeback(page);
+	for (i = 0; i < req->r_num_ops; i++) {
+		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+			pr_warn_client(cl,
+				"%llx.%llx incorrect op %d req %p index %d tid %llu\n",
+				ceph_vinop(inode), req->r_ops[i].op, req, i,
+				req->r_tid);
+			break;
+		}
 
-		/*
-		 * We lost the cache cap, need to truncate the page before
-		 * it is unlocked, otherwise we'd truncate it later in the
-		 * page truncation thread, possibly losing some data that
-		 * raced its way in
-		 */
-		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
-			generic_error_remove_page(inode->i_mapping, page);
+		osd_data = osd_req_op_extent_osd_data(req, i);
+		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+		len += osd_data->length;
+		num_pages = calc_pages_for((u64)osd_data->alignment,
+					   (u64)osd_data->length);
+		total_pages += num_pages;
+		for (j = 0; j < num_pages; j++) {
+			page = osd_data->pages[j];
+			if (fscrypt_is_bounce_page(page)) {
+				page = fscrypt_pagecache_page(page);
+				fscrypt_free_bounce_page(osd_data->pages[j]);
+				osd_data->pages[j] = page;
+			}
+			BUG_ON(!page);
+			WARN_ON(!PageUptodate(page));
 
-		unlock_page(page);
+			if (atomic_long_dec_return(&fsc->writeback_count) <
+			     CONGESTION_OFF_THRESH(
+					fsc->mount_options->congestion_kb))
+				fsc->write_congested = false;
+
+			ceph_put_snap_context(detach_page_private(page));
+			end_page_writeback(page);
+
+			if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
+				wake_up_all(&mdsc->flush_end_wq);
+				WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
+			}
+
+			doutc(cl, "unlocking %p\n", page);
+
+			if (remove_page)
+				generic_error_remove_folio(inode->i_mapping,
+							   page_folio(page));
+
+			unlock_page(page);
+		}
+		doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
+		      ceph_vinop(inode), osd_data->length,
+		      rc >= 0 ? num_pages : 0);
+
+		release_pages(osd_data->pages, num_pages);
 	}
 
-	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
+	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
+				  req->r_end_latency, len, rc);
+
+	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
 
-	ceph_release_pages(osd_data->pages, num_pages);
+	osd_data = osd_req_op_extent_osd_data(req, 0);
 	if (osd_data->pages_from_pool)
-		mempool_free(osd_data->pages,
-			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
+		mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
 	else
 		kfree(osd_data->pages);
 	ceph_osdc_put_request(req);
+	ceph_dec_osd_stopping_blocker(fsc->mdsc);
 }
 
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
-				 struct writeback_control *wbc)
+static inline
+bool is_forced_umount(struct address_space *mapping)
 {
 	struct inode *inode = mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-	struct ceph_vino vino = ceph_vino(inode);
-	pgoff_t index, start, end;
-	int range_whole = 0;
-	int should_loop = 1;
-	pgoff_t max_pages = 0, max_pages_ever = 0;
-	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
-	struct pagevec pvec;
-	int done = 0;
-	int rc = 0;
-	unsigned wsize = 1 << inode->i_blkbits;
-	struct ceph_osd_request *req = NULL;
-	int do_sync;
-	u64 truncate_size, snap_size;
-	u32 truncate_seq;
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = fsc->client;
+
+	if (ceph_inode_is_shutdown(inode)) {
+		if (ci->i_wrbuffer_ref > 0) {
+			pr_warn_ratelimited_client(cl,
+				"%llx.%llx %lld forced umount\n",
+				ceph_vinop(inode), ceph_ino(inode));
+		}
+		mapping_set_error(mapping, -EIO);
+		return true;
+	}
 
-	/*
-	 * Include a 'sync' in the OSD request if this is a data
-	 * integrity write (e.g., O_SYNC write or fsync()), or if our
-	 * cap is being revoked.
-	 */
-	if ((wbc->sync_mode == WB_SYNC_ALL) ||
-	    ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
-		do_sync = 1;
-	dout("writepages_start %p dosync=%d (mode=%s)\n",
-	     inode, do_sync,
-	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
-	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
-
-	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
-		pr_warning("writepage_start %p on forced umount\n", inode);
-		return -EIO; /* we're in a forced umount, don't write! */
-	}
-	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+	return false;
+}
+
+static inline
+unsigned int ceph_define_write_size(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	unsigned int wsize = i_blocksize(inode);
+
+	if (fsc->mount_options->wsize < wsize)
 		wsize = fsc->mount_options->wsize;
-	if (wsize < PAGE_CACHE_SIZE)
-		wsize = PAGE_CACHE_SIZE;
-	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
-	pagevec_init(&pvec, 0);
 
-	/* where to start/end? */
+	return wsize;
+}
-	if (wbc->range_cyclic) {
-		start = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
-		dout(" cyclic, start at %lu\n", start);
-	} else {
-		start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		should_loop = 0;
-		dout(" not cyclic, %lu to %lu\n", start, end);
-	}
-	index = start;
+static inline
+void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
+{
+	folio_batch_init(&ceph_wbc->fbatch);
+	ceph_wbc->processed_in_fbatch = 0;
+}
+
+static inline
+void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
+{
+	folio_batch_release(&ceph_wbc->fbatch);
+	ceph_folio_batch_init(ceph_wbc);
+}
+
+static inline
+void ceph_init_writeback_ctl(struct address_space *mapping,
+			     struct writeback_control *wbc,
+			     struct ceph_writeback_ctl *ceph_wbc)
+{
+	ceph_wbc->snapc = NULL;
+	ceph_wbc->last_snapc = NULL;
+
+	ceph_wbc->strip_unit_end = 0;
+	ceph_wbc->wsize = ceph_define_write_size(mapping);
+
+	ceph_wbc->nr_folios = 0;
+	ceph_wbc->max_pages = 0;
+	ceph_wbc->locked_pages = 0;
+
+	ceph_wbc->done = false;
+	ceph_wbc->should_loop = false;
+	ceph_wbc->range_whole = false;
+
+	ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+	ceph_wbc->index = ceph_wbc->start_index;
+	ceph_wbc->end = -1;
+
+	ceph_wbc->tag = wbc_to_tag(wbc);
+
+	ceph_wbc->op_idx = -1;
+	ceph_wbc->num_ops = 0;
+	ceph_wbc->offset = 0;
+	ceph_wbc->len = 0;
+	ceph_wbc->from_pool = false;
+
+	ceph_folio_batch_init(ceph_wbc);
+
+	ceph_wbc->pages = NULL;
+	ceph_wbc->data_pages = NULL;
+}
+
+static inline
+int ceph_define_writeback_range(struct address_space *mapping,
+				struct writeback_control *wbc,
+				struct ceph_writeback_ctl *ceph_wbc)
+{
+	struct inode *inode = mapping->host;
+	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+	struct ceph_client *cl = fsc->client;
 
-retry:
 	/* find oldest snap context with dirty data */
-	ceph_put_snap_context(snapc);
-	snap_size = 0;
-	snapc = get_oldest_context(inode, &snap_size);
-	if (!snapc) {
+	ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
+	if (!ceph_wbc->snapc) {
 		/* hmm, why does writepages get called when there
 		   is no dirty data? */
-		dout(" no snap context with dirty data?\n");
-		goto out;
+		doutc(cl, " no snap context with dirty data?\n");
+		return -ENODATA;
 	}
-	if (snap_size == 0)
-		snap_size = i_size_read(inode);
-	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
-	     snapc, snapc->seq, snapc->num_snaps);
-	spin_lock(&ci->i_ceph_lock);
-	truncate_seq = ci->i_truncate_seq;
-	truncate_size = ci->i_truncate_size;
-	if (!snap_size)
-		snap_size = i_size_read(inode);
-	spin_unlock(&ci->i_ceph_lock);
+
+	doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
+	      ceph_wbc->snapc, ceph_wbc->snapc->seq,
+	      ceph_wbc->snapc->num_snaps);
+
+	ceph_wbc->should_loop = false;
+
+	if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
+		/* where to start/end? */
+		if (wbc->range_cyclic) {
+			ceph_wbc->index = ceph_wbc->start_index;
+			ceph_wbc->end = -1;
+			if (ceph_wbc->index > 0)
+				ceph_wbc->should_loop = true;
+			doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
+		} else {
+			ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
+			ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
+			if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+				ceph_wbc->range_whole = true;
+			doutc(cl, " not cyclic, %lu to %lu\n",
+			      ceph_wbc->index, ceph_wbc->end);
+		}
+	} else if (!ceph_wbc->head_snapc) {
+		/* Do not respect wbc->range_{start,end}. Dirty pages
+		 * in that range can be associated with newer snapc.
+		 * They are not writeable until we write all dirty pages
+		 * associated with 'snapc' get written */
+		if (ceph_wbc->index > 0)
+			ceph_wbc->should_loop = true;
+		doutc(cl, " non-head snapc, range whole\n");
+	}
 
-	if (last_snapc && snapc != last_snapc) {
-		/* if we switched to a newer snapc, restart our scan at the
-		 * start of the original file range. */
-		dout(" snapc differs from last pass, restarting at %lu\n",
-		     index);
-		index = start;
-	}
-	last_snapc = snapc;
-
-	while (!done && index <= end) {
-		int num_ops = do_sync ? 2 : 1;
-		unsigned i;
-		int first;
-		pgoff_t next;
-		int pvec_pages, locked_pages;
-		struct page **pages = NULL;
-		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
-		struct page *page;
-		int want;
-		u64 offset, len;
-		long writeback_stat;
+	ceph_put_snap_context(ceph_wbc->last_snapc);
+	ceph_wbc->last_snapc = ceph_wbc->snapc;
 
-		next = 0;
-		locked_pages = 0;
-		max_pages = max_pages_ever;
+	return 0;
+}
 
-get_more_pages:
-		first = -1;
-		want = min(end - index,
-			   min((pgoff_t)PAGEVEC_SIZE,
-			       max_pages - (pgoff_t)locked_pages) - 1)
-			+ 1;
-		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-						PAGECACHE_TAG_DIRTY,
-						want);
-		dout("pagevec_lookup_tag got %d\n", pvec_pages);
-		if (!pvec_pages && !locked_pages)
-			break;
-		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
-			page = pvec.pages[i];
-			dout("? %p idx %lu\n", page, page->index);
%p idx %lu\n", page, page->index); - if (locked_pages == 0) - lock_page(page); /* first page */ - else if (!trylock_page(page)) - break; +static inline +bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; +} - /* only dirty pages, or our accounting breaks */ - if (unlikely(!PageDirty(page)) || - unlikely(page->mapping != mapping)) { - dout("!dirty or !mapping %p\n", page); - unlock_page(page); - break; - } - if (!wbc->range_cyclic && page->index > end) { - dout("end of range %p\n", page); - done = 1; - unlock_page(page); - break; - } - if (next && (page->index != next)) { - dout("not consecutive %p\n", page); - unlock_page(page); - break; - } - if (wbc->sync_mode != WB_SYNC_NONE) { - dout("waiting on writeback %p\n", page); - wait_on_page_writeback(page); - } - if (page_offset(page) >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); - done = 1; - unlock_page(page); - break; - } - if (PageWriteback(page)) { - dout("%p under writeback\n", page); - unlock_page(page); - break; +static inline +bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, + unsigned index) +{ + return index < ceph_wbc->nr_folios && + ceph_wbc->locked_pages < ceph_wbc->max_pages; +} + +static +int ceph_check_page_before_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_snap_context *pgsnapc; + + /* only dirty folios, or our accounting breaks */ + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { + doutc(cl, "!dirty or !mapping %p\n", folio); + return -ENODATA; + } + + /* only if matching snap context */ + pgsnapc = page_snap_context(&folio->page); + if (pgsnapc != ceph_wbc->snapc) { + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, + ceph_wbc->snapc, ceph_wbc->snapc->seq); + + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + ceph_wbc->should_loop = true; + + return -ENODATA; + } + + if (folio_pos(folio) >= ceph_wbc->i_size) { + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc->i_size); + + if ((ceph_wbc->size_stable || + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, folio_size(folio)); + + return -ENODATA; + } + + if (ceph_wbc->strip_unit_end && + (folio->index > ceph_wbc->strip_unit_end)) { + doutc(cl, "end of strip unit %p\n", folio); + return -E2BIG; + } + + return 0; +} + +static inline +void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, + unsigned int max_pages) +{ + ceph_wbc->pages = kmalloc_array(max_pages, + sizeof(*ceph_wbc->pages), + GFP_NOFS); + if (!ceph_wbc->pages) { + ceph_wbc->from_pool = true; + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!ceph_wbc->pages); + } +} + +static inline +void ceph_allocate_page_array(struct address_space *mapping, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_calc_file_object_mapping(&ci->i_layout, + ceph_wbc->offset, ceph_wbc->wsize, + &objnum, &objoff, &xlen); + + ceph_wbc->num_ops = 1; + 
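	/*
	 * ceph_calc_file_object_mapping() clamps the requested wsize to
	 * what fits in the current stripe unit: xlen is the number of
	 * bytes of [offset, offset + wsize) that land in one object.
	 * For example, with the default 4 MiB layout and 4 KiB pages, a
	 * write starting at offset 0 gets xlen = 4 MiB, so
	 * strip_unit_end below is page index 1023 and max_pages is 1024.
	 */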
ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); + + BUG_ON(ceph_wbc->pages); + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); + + ceph_wbc->len = 0; +} + +static inline +bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, + const struct folio *folio) +{ + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; +} + +static inline +bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->num_ops >= + (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); +} + +static inline +bool is_write_congestion_happened(struct ceph_fs_client *fsc) +{ + return atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); +} + +static inline int move_dirty_folio_in_page_array(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct page **pages = ceph_wbc->pages; + unsigned int index = ceph_wbc->locked_pages; + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; + + if (IS_ENCRYPTED(inode)) { + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, + PAGE_SIZE, + 0, + gfp_flags); + if (IS_ERR(pages[index])) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { + pr_err_client(cl, "inode->i_blkbits=%hhu\n", + inode->i_blkbits); } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc->seq > snapc->seq) { - dout("page snapc %p %lld > oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - unlock_page(page); - if (!locked_pages) - continue; /* keep looking for snap */ + /* better not fail on first page! */ + BUG_ON(ceph_wbc->locked_pages == 0); + + pages[index] = NULL; + return err; + } + } else { + pages[index] = &folio->page; + } + + ceph_wbc->locked_pages++; + + return 0; +} + +static +int ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct folio *folio = NULL; + unsigned i; + int rc = 0; + + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { + folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + doutc(cl, "? 
%p idx %lu, folio_test_writeback %#x, " + "folio_test_dirty %#x, folio_test_locked %#x\n", + folio, folio->index, folio_test_writeback(folio), + folio_test_dirty(folio), + folio_test_locked(folio)); + + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + continue; + } + + if (ceph_wbc->locked_pages == 0) + folio_lock(folio); + else if (!folio_trylock(folio)) + break; + + rc = ceph_check_page_before_write(mapping, wbc, + ceph_wbc, folio); + if (rc == -ENODATA) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } else if (rc == -E2BIG) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + break; + } + + if (!folio_clear_dirty_for_io(folio)) { + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } + + /* + * We have something to write. If this is + * the first locked page this time through, + * calculate max possible write size and + * allocate a page array + */ + if (ceph_wbc->locked_pages == 0) { + ceph_allocate_page_array(mapping, ceph_wbc, folio); + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { + if (is_num_ops_too_big(ceph_wbc)) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); break; } - if (!clear_page_dirty_for_io(page)) { - dout("%p !clear_page_dirty_for_io\n", page); - unlock_page(page); + ceph_wbc->num_ops++; + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_wbc->len = 0; + } + + /* note position of first page in fbatch */ + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", + ceph_vinop(inode), folio, folio->index); + + fsc->write_congested = is_write_congestion_happened(fsc); + + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, + folio); + if (rc) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + break; + } + + ceph_wbc->fbatch.folios[i] = NULL; + ceph_wbc->len += folio_size(folio); + } + + ceph_wbc->processed_in_fbatch = i; + + return rc; +} + +static inline +void ceph_shift_unused_folios_left(struct folio_batch *fbatch) +{ + unsigned j, n = 0; + + /* shift unused page to beginning of fbatch */ + for (j = 0; j < folio_batch_count(fbatch); j++) { + if (!fbatch->folios[j]) + continue; + + if (n < j) { + fbatch->folios[n] = fbatch->folios[j]; + } + + n++; + } + + fbatch->nr = n; +} + +static +int ceph_submit_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_vino vino = ceph_vino(inode); + struct ceph_osd_request *req = NULL; + struct page *page = NULL; + bool caching = ceph_is_cache_enabled(inode); + u64 offset; + u64 len; + unsigned i; + +new_request: + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); + len = ceph_wbc->wsize; + + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, ceph_wbc->num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, false); + if (IS_ERR(req)) { + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, + min(ceph_wbc->num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + 
ceph_wbc->snapc, + ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, + true); + BUG_ON(IS_ERR(req)); + } + + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { + struct folio *folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + page = &folio->page; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } + + for (i = 0; i < ceph_wbc->locked_pages; i++) { + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + + if (!page) + continue; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } + + ceph_osdc_put_request(req); + return -EIO; + } + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + ceph_wbc->data_pages = ceph_wbc->pages; + ceph_wbc->op_idx = 0; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + u64 cur_offset; + + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + cur_offset = page_offset(page); + + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ + if (offset + len != cur_offset) { + /* If it's full, stop here */ + if (ceph_wbc->op_idx + 1 == req->r_num_ops) break; - } - /* - * We have something to write. If this is - * the first locked page this time through, - * allocate an osd request and a page array - * that it will use. - */ - if (locked_pages == 0) { - BUG_ON(pages); - /* prepare async write request */ - offset = (u64)page_offset(page); - len = wsize; - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, num_ops, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, truncate_seq, - truncate_size, true); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - unlock_page(page); - break; - } - - req->r_callback = writepages_finish; - req->r_inode = inode; - - max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc(max_pages * sizeof (*pages), - GFP_NOFS); - if (!pages) { - pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); - BUG_ON(!pages); - } - } + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); - /* note position of first page in pvec */ - if (first < 0) - first = i; - dout("%p will write page %p idx %lu\n", - inode, page, page->index); - - writeback_stat = - atomic_long_inc_return(&fsc->writeback_count); - if (writeback_stat > CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) { - set_bdi_congested(&fsc->backing_dev_info, - BLK_RW_ASYNC); - } + /* Start a new extent */ + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, + cur_offset - offset); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, + len, 0, + ceph_wbc->from_pool, + false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); - set_page_writeback(page); - pages[locked_pages] = page; - locked_pages++; - next = page->index + 1; + len = 0; + offset = cur_offset; + ceph_wbc->data_pages = ceph_wbc->pages + i; + ceph_wbc->op_idx++; } - /* did we get anything? 
*/ - if (!locked_pages) - goto release_pvec_pages; - if (i) { - int j; - BUG_ON(!locked_pages || first < 0); - - if (pvec_pages && i == pvec_pages && - locked_pages < max_pages) { - dout("reached end pvec, trying for more\n"); - pagevec_reinit(&pvec); - goto get_more_pages; - } + set_page_writeback(page); + + if (caching) + ceph_set_page_fscache(page); + + len += thp_size(page); + } + + ceph_fscache_write_to_cache(inode, offset, len, caching); + + if (ceph_wbc->size_stable) { + len = min(len, ceph_wbc->i_size - offset); + } else if (i == ceph_wbc->locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - thp_size(page); + len = get_writepages_data_length(inode, + ceph_wbc->pages[i - 1], + offset); + len = max(len, min_len); + } + + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); + } + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, len, + 0, ceph_wbc->from_pool, false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); + + ceph_wbc->from_pool = false; + if (i < ceph_wbc->locked_pages) { + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); + ceph_wbc->num_ops -= req->r_num_ops; + ceph_wbc->locked_pages -= i; + + /* allocate new pages array for next request */ + ceph_wbc->data_pages = ceph_wbc->pages; + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + memset(ceph_wbc->data_pages + i, 0, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + } else { + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); + /* request message now owns the pages array */ + ceph_wbc->pages = NULL; + } - /* shift unused pages over in the pvec... we - * will need to release them below. 
*/ - for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", - pvec.pages[j]); - pvec.pages[j-i+first] = pvec.pages[j]; + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + req = NULL; + + wbc->nr_to_write -= i; + if (ceph_wbc->pages) + goto new_request; + + return 0; +} + +static +void ceph_wait_until_current_writes_complete(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct page *page; + unsigned i, nr; + + if (wbc->sync_mode != WB_SYNC_NONE && + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc->head_snapc) { + ceph_wbc->index = 0; + + while ((ceph_wbc->index <= ceph_wbc->end) && + (nr = filemap_get_folios_tag(mapping, + &ceph_wbc->index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &ceph_wbc->fbatch))) { + for (i = 0; i < nr; i++) { + page = &ceph_wbc->fbatch.folios[i]->page; + if (page_snap_context(page) != ceph_wbc->snapc) + continue; + wait_on_page_writeback(page); } - pvec.nr -= i-first; + + folio_batch_release(&ceph_wbc->fbatch); + cond_resched(); } + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_writeback_ctl ceph_wbc; + int rc = 0; + + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) + return 0; + + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (is_forced_umount(mapping)) { + /* we're in a forced umount, don't write! */ + return -EIO; + } + + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto out; + } - /* Format the osd request message and submit the write */ +retry: + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); + if (rc == -ENODATA) { + /* hmm, why does writepages get called when there + is no dirty data? 
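	   The usual answer is a race: the pages that were dirty when
	   this writeback was kicked off may have been written back or
	   invalidated by another path in the meantime, so treat it as
	   a no-op rather than an error.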
*/ + rc = 0; + goto dec_osd_stopping_blocker; + } - offset = page_offset(pages[0]); - len = min(snap_size - offset, - (u64)locked_pages << PAGE_CACHE_SHIFT); - dout("writepages got %d pages at %llu~%llu\n", - locked_pages, offset, len); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, - !!pool, false); + while (!has_writeback_done(&ceph_wbc)) { + ceph_wbc.locked_pages = 0; + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; - pages = NULL; /* request message now owns the pages array */ - pool = NULL; +get_more_pages: + ceph_folio_batch_reinit(&ceph_wbc); - /* Update the write op length in case we changed it */ + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, + &ceph_wbc.index, + ceph_wbc.end, + ceph_wbc.tag, + &ceph_wbc.fbatch); + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", + ceph_wbc.tag, ceph_wbc.nr_folios); - osd_req_op_extent_update(req, 0, len); + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) + break; - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); +process_folio_batch: + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); + if (rc) + goto release_folios; - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); - BUG_ON(rc); - req = NULL; + /* did we get anything? */ + if (!ceph_wbc.locked_pages) + goto release_folios; + + if (ceph_wbc.processed_in_fbatch) { + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && + ceph_wbc.locked_pages < ceph_wbc.max_pages) { + doutc(cl, "reached end fbatch, trying for more\n"); + goto get_more_pages; + } + } + + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; - /* continue? */ - index = next; - wbc->nr_to_write -= locked_pages; - if (wbc->nr_to_write <= 0) - done = 1; + ceph_wbc.locked_pages = 0; + ceph_wbc.strip_unit_end = 0; -release_pvec_pages: - dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, - pvec.nr ? pvec.pages[0] : NULL); - pagevec_release(&pvec); + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { + ceph_wbc.nr_folios = + folio_batch_count(&ceph_wbc.fbatch); + goto process_folio_batch; + } - if (locked_pages && !done) - goto retry; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + ceph_wbc.done = true; + +release_folios: + doutc(cl, "folio_batch release on %d folios (%p)\n", + (int)ceph_wbc.fbatch.nr, + ceph_wbc.fbatch.nr ? 
ceph_wbc.fbatch.folios[0] : NULL); + folio_batch_release(&ceph_wbc.fbatch); } - if (should_loop && !done) { + if (ceph_wbc.should_loop && !ceph_wbc.done) { /* more to do; loop back to beginning of file */ - dout("writepages looping back to beginning of file\n"); - should_loop = 0; - index = 0; + doutc(cl, "looping back to beginning of file\n"); + /* OK even when start_index == 0 */ + ceph_wbc.end = ceph_wbc.start_index - 1; + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); + + ceph_wbc.start_index = 0; + ceph_wbc.index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = ceph_wbc.index; + +dec_osd_stopping_blocker: + ceph_dec_osd_stopping_blocker(fsc->mdsc); out: - if (req) - ceph_osdc_put_request(req); - ceph_put_snap_context(snapc); - dout("writepages done, rc = %d\n", rc); + ceph_put_snap_context(ceph_wbc.last_snapc); + doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), + rc); + return rc; } - - /* * See if a given @snapc is either writeable, or already written. */ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); return ret; } -/* - * We are only allowed to write into/dirty the page if the page is - * clean, or already dirty within the same snap context. +/** + * ceph_find_incompatible - find an incompatible context and return it + * @folio: folio being dirtied + * + * We are only allowed to write into/dirty a folio if the folio is + * clean, or already dirty within the same snap context. Returns a + * conflicting context if there is one, NULL if there isn't, or a + * negative error code on other errors. * - * called with page locked. - * return success with page locked, - * or any failure (incl -EAGAIN) with page unlocked. + * Must be called with folio lock held. 
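 *
 * A typical caller pattern (cf. ceph_netfs_check_write_begin() and
 * ceph_page_mkwrite() below) looks roughly like:
 *
 *	snapc = ceph_find_incompatible(folio);
 *	if (IS_ERR(snapc))
 *		return PTR_ERR(snapc);
 *	if (snapc) {
 *		ceph_queue_writeback(inode);
 *		wait_event_killable(ci->i_cap_wq,
 *			context_is_writeable_or_written(inode, snapc));
 *		ceph_put_snap_context(snapc);
 *	}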
*/ -static int ceph_update_writeable_page(struct file *file, - loff_t pos, unsigned len, - struct page *page) +static struct ceph_snap_context * +ceph_find_incompatible(struct folio *folio) { - struct inode *inode = file_inode(file); + struct inode *inode = folio->mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - loff_t page_off = pos & PAGE_CACHE_MASK; - int pos_in_page = pos & ~PAGE_CACHE_MASK; - int end_in_page = pos_in_page + len; - loff_t i_size; - int r; - struct ceph_snap_context *snapc, *oldest; -retry_locked: - /* writepages currently holds page lock, but if we change that later, */ - wait_on_page_writeback(page); + if (ceph_inode_is_shutdown(inode)) { + doutc(cl, " %llx.%llx folio %p is shutdown\n", + ceph_vinop(inode), folio); + return ERR_PTR(-ESTALE); + } + + for (;;) { + struct ceph_snap_context *snapc, *oldest; + + folio_wait_writeback(folio); + + snapc = page_snap_context(&folio->page); + if (!snapc || snapc == ci->i_head_snapc) + break; - /* check snap context */ - BUG_ON(!ci->i_snap_realm); - down_read(&mdsc->snap_rwsem); - BUG_ON(!ci->i_snap_realm->cached_context); - snapc = page_snap_context(page); - if (snapc && snapc != ci->i_head_snapc) { /* - * this page is already dirty in another (older) snap + * this folio is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL); - up_read(&mdsc->snap_rwsem); - + oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { + /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - dout(" page %p snapc %p not current or oldest\n", - page, snapc); - /* - * queue for writeback, and wait for snapc to - * be writeable or written - */ - snapc = ceph_get_snap_context(snapc); - unlock_page(page); - ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r == -ERESTARTSYS) - return r; - return -EAGAIN; + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", + ceph_vinop(inode), folio, snapc); + return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); - /* yay, writeable, do it now (without dropping page lock) */ - dout(" page %p snapc %p not current, but oldest\n", - page, snapc); - if (!clear_page_dirty_for_io(page)) - goto retry_locked; - r = writepage_nounlock(page, NULL); - if (r < 0) - goto fail_nosnap; - goto retry_locked; - } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - return 0; + /* yay, writeable, do it now (without dropping folio lock) */ + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", + ceph_vinop(inode), folio, snapc); + if (folio_clear_dirty_for_io(folio)) { + int r = write_folio_nounlock(folio, NULL); + if (r < 0) + return ERR_PTR(r); + } } + return NULL; +} - /* full page? */ - if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) - return 0; +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct folio **foliop, void **_fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; - /* past end of file? 
*/ - i_size = inode->i_size; /* caller holds i_mutex */ + snapc = ceph_find_incompatible(*foliop); + if (snapc) { + int r; - if (i_size + len > inode->i_sb->s_maxbytes) { - /* file is too big */ - r = -EINVAL; - goto fail; - } + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; + if (IS_ERR(snapc)) + return PTR_ERR(snapc); - if (page_off >= i_size || - (pos_in_page == 0 && (pos+len) >= i_size && - end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { - dout(" zeroing %p 0 - %d and %d - %d\n", - page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); - zero_user_segments(page, - 0, pos_in_page, - end_in_page, PAGE_CACHE_SIZE); - return 0; + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return r == 0 ? -EAGAIN : r; } - - /* we need to read it. */ - up_read(&mdsc->snap_rwsem); - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; - goto retry_locked; - -fail: - up_read(&mdsc->snap_rwsem); -fail_nosnap: - unlock_page(page); - return r; + return 0; } /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ -static int ceph_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int ceph_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct page *page; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ceph_inode_info *ci = ceph_inode(inode); int r; - do { - /* get a page */ - page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) - return -ENOMEM; - *pagep = page; - - dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); - - r = ceph_update_writeable_page(file, pos, len, page); - } while (r == -EAGAIN); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL); + if (r < 0) + return r; - return r; + folio_wait_private_2(*foliop); /* [DEPRECATED] */ + WARN_ON_ONCE(!folio_test_locked(*foliop)); + return 0; } /* * we don't do anything in here that simple_write_end doesn't do - * except adjust dirty page accounting and drop read lock on - * mdsc->snap_rwsem. 
+ * except adjust dirty page accounting */ -static int ceph_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int ceph_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); - int check_cap = 0; + struct ceph_client *cl = ceph_inode_to_client(inode); + bool check_cap = false; - dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, - inode, page, (int)pos, (int)copied, (int)len); + doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode), + file, folio, (int)pos, (int)copied, (int)len); - /* zero the stale part of the page if we did a short copy */ - if (copied < len) - zero_user_segment(page, from+copied, len); + if (!folio_test_uptodate(folio)) { + /* just return that nothing was copied on a short copy */ + if (copied < len) { + copied = 0; + goto out; + } + folio_mark_uptodate(folio); + } /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - if (!PageUptodate(page)) - SetPageUptodate(page); - - set_page_dirty(page); + folio_mark_dirty(folio); - unlock_page(page); - up_read(&mdsc->snap_rwsem); - page_cache_release(page); +out: + folio_unlock(folio); + folio_put(folio); if (check_cap) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } -/* - * we set .direct_IO to indicate direct io is supported, but since we - * intercept O_DIRECT reads and writes early, this function should - * never get called. 
- */ -static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) -{ - WARN_ON(1); - return -EINVAL; -} - const struct address_space_operations ceph_aops = { - .readpage = ceph_readpage, - .readpages = ceph_readpages, - .writepage = ceph_writepage, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, - .set_page_dirty = ceph_set_page_dirty, - .invalidatepage = ceph_invalidatepage, - .releasepage = ceph_releasepage, - .direct_IO = ceph_direct_io, + .dirty_folio = ceph_dirty_folio, + .invalidate_folio = ceph_invalidate_folio, + .release_folio = netfs_release_folio, + .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops */ +static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; + int want, got, err; + sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; + + if (ceph_inode_is_shutdown(inode)) + return ret; + + ceph_block_sigs(&oldset); + + doutc(cl, "%llx.%llx %llu trying to get caps\n", + ceph_vinop(inode), off); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + + got = 0; + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); + if (err < 0) + goto out_restore; + + doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode), + off, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + !ceph_has_inline_data(ci)) { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); + ceph_add_rw_context(fi, &rw_ctx); + ret = filemap_fault(vmf); + ceph_del_rw_context(fi, &rw_ctx); + doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n", + ceph_vinop(inode), off, ceph_cap_string(got), ret); + } else + err = -EAGAIN; + + ceph_put_cap_refs(ci, got); + + if (err != -EAGAIN) + goto out_restore; + + /* read inline data */ + if (off >= PAGE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + struct address_space *mapping = inode->i_mapping; + struct page *page; -/* - * Reuse write_begin here for simplicity. 
- */ -static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) + filemap_invalidate_lock_shared(mapping); + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (!page) { + ret = VM_FAULT_OOM; + goto out_inline; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0 || off >= i_size_read(inode)) { + unlock_page(page); + put_page(page); + ret = vmf_error(err); + goto out_inline; + } + if (err < PAGE_SIZE) + zero_user_segment(page, err, PAGE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + filemap_invalidate_unlock_shared(mapping); + doutc(cl, "%llx.%llx %llu read inline data ret %x\n", + ceph_vinop(inode), off, ret); + } +out_restore: + ceph_restore_sigs(&oldset); + if (err < 0) + ret = vmf_error(err); + + return ret; +} + +static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); - struct page *page = vmf->page; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - loff_t off = page_offset(page); - loff_t size, len; - int ret; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = page_folio(vmf->page); + loff_t off = folio_pos(folio); + loff_t size = i_size_read(inode); + size_t len; + int want, got, err; + sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; + + if (ceph_inode_is_shutdown(inode)) + return ret; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_OOM; + + sb_start_pagefault(inode->i_sb); + ceph_block_sigs(&oldset); + + if (off + folio_size(folio) <= size) + len = folio_size(folio); + else + len = offset_in_folio(folio, size); + + doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", + ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; - /* Update time before taking page lock */ + got = 0; + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); + if (err < 0) + goto out_free; + + doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), + off, len, ceph_cap_string(got)); + + /* Update time before taking folio lock */ file_update_time(vma->vm_file); + inode_inc_iversion_raw(inode); - size = i_size_read(inode); - if (off + PAGE_CACHE_SIZE <= size) - len = PAGE_CACHE_SIZE; - else - len = size & ~PAGE_CACHE_MASK; + do { + struct ceph_snap_context *snapc; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + folio_lock(folio); - lock_page(page); + if (folio_mkwrite_check_truncate(folio, inode) < 0) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + break; + } - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) - goto out; + snapc = ceph_find_incompatible(folio); + if (!snapc) { + /* success. we'll keep the folio locked. */ + folio_mark_dirty(folio); + ret = VM_FAULT_LOCKED; + break; + } - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { - /* success. we'll keep the page locked. 
*/ - set_page_dirty(page); - up_read(&mdsc->snap_rwsem); - ret = VM_FAULT_LOCKED; + folio_unlock(folio); + + if (IS_ERR(snapc)) { + ret = VM_FAULT_SIGBUS; + break; + } + + ceph_queue_writeback(inode); + err = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + } while (err == 0); + + if (ret == VM_FAULT_LOCKED) { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n", + ceph_vinop(inode), off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs_async(ci, got); +out_free: + ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); + ceph_free_cap_flush(prealloc_cf); + if (err < 0) + ret = vmf_error(err); + return ret; +} + +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + return; + } + } + + doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode, + ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); else - ret = VM_FAULT_SIGBUS; + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + put_page(page); + } +} + +int ceph_uninline_data(struct file *file) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_osd_request *req = NULL; + struct ceph_cap_flush *prealloc_cf = NULL; + struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; + struct page *pages[1]; + int err = 0; + u64 len; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode), + inline_version); + + if (ceph_inode_is_shutdown(inode)) { + err = -EIO; + goto out; + } + + if (inline_version == CEPH_INLINE_NONE) + return 0; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); + + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, + NULL, 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_unlock; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out_unlock; + + req = 
ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_unlock; + } + + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); + + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put_req; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put_req; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, len, err); + +out_uninline: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + } +out_put_req: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out_unlock: + if (folio) { + folio_unlock(folio); + folio_put(folio); } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) - unlock_page(page); - return ret; + ceph_free_cap_flush(prealloc_cf); + doutc(cl, "%llx.%llx inline_version %llu = %d\n", + ceph_vinop(inode), inline_version, err); + return err; } -static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, +static const struct vm_operations_struct ceph_vmops = { + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; -int ceph_mmap(struct file *file, struct vm_area_struct *vma) +int ceph_mmap_prepare(struct vm_area_desc *desc) { - struct address_space *mapping = file->f_mapping; + struct address_space *mapping = desc->file->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; - file_accessed(file); - vma->vm_ops = &ceph_vmops; + desc->vm_ops = &ceph_vmops; return 0; } + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, + s64 pool, struct ceph_string *pool_ns) +{ + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client *cl = fsc->client; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + size_t pool_ns_len; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + 
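			/*
			 * Cache hit under the read lock: an earlier probe
			 * already recorded this (pool, namespace) pair, so
			 * its result is reused and the OSD read/write
			 * probes below are skipped entirely.
			 */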
else { + have = perm->perm; + break; + } + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + if (pool_ns) + doutc(cl, "pool %lld ns %.*s no perm cached\n", pool, + (int)pool_ns->len, pool_ns->str); + else + doutc(cl, "pool %lld no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + if (pool_ns) + rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + ceph_osdc_start_request(&fsc->client->osdc, rd_req); + + wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode); + ceph_osdc_start_request(&fsc->client->osdc, wr_req); + + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) { + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + goto out_unlock; + } + + if (err2 == 0 || err2 == -EEXIST) + have |= POOL_WRITE; + else if (err2 != -EPERM) { + if (err2 == -EBLOCKLISTED) + fsc->blocklisted = true; + err = err2; + goto out_unlock; + } + + pool_ns_len = pool_ns ? 
pool_ns->len : 0; + perm = kmalloc(struct_size(perm, pool_ns, pool_ns_len + 1), GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + perm->pool_ns_len = pool_ns_len; + if (pool_ns_len > 0) + memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); + perm->pool_ns[pool_ns_len] = 0; + + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + if (pool_ns) + doutc(cl, "pool %lld ns %.*s result = %d\n", pool, + (int)pool_ns->len, pool_ns->str, err); + else + doutc(cl, "pool %lld result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct inode *inode, int need) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_string *pool_ns; + s64 pool; + int ret, flags; + + /* Only need to do this for regular files */ + if (!S_ISREG(inode->i_mode)) + return 0; + + if (ci->i_vino.snap != CEPH_NOSNAP) { + /* + * Pool permission check needs to write to the first object. + * But for snapshot, head of the first object may have already + * been deleted. Skip check to avoid creating orphan object. + */ + return 0; + } + + if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ci->i_layout.pool_id; + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + doutc(cl, "pool %lld no read perm\n", pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + doutc(cl, "pool %lld no write perm\n", pool); + return -EPERM; + } + return 0; + } + + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ret = __ceph_pool_perm_get(ci, pool, pool_ns); + ceph_put_string(pool_ns); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ci->i_layout.pool_id && + pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { + ci->i_ceph_flags |= flags; + } else { + pool = ci->i_layout.pool_id; + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 000000000000..f678bab189d8 --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/fs_context.h> +#include "super.h" +#include "cache.h" + +void ceph_fscache_register_inode_cookie(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + + /* No caching for filesystem? 
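	   fsc->fscache is only non-NULL when a cache volume was
	   acquired at mount time via ceph_fscache_register_fs(), so a
	   NULL volume here means caching was never enabled for this
	   superblock.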
*/ + if (!fsc->fscache) + return; + + /* Regular files only */ + if (!S_ISREG(inode->i_mode)) + return; + + /* Only new inodes! */ + if (!(inode_state_read_once(inode) & I_NEW)) + return; + + WARN_ON_ONCE(ci->netfs.cache); + + ci->netfs.cache = + fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) +{ + fscache_relinquish_cookie(ceph_fscache_cookie(ci), false); +} + +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_use_cookie(ceph_fscache_cookie(ci), will_modify); +} + +void ceph_fscache_unuse_cookie(struct inode *inode, bool update) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (update) { + loff_t i_size = i_size_read(inode); + + fscache_unuse_cookie(ceph_fscache_cookie(ci), + &ci->i_version, &i_size); + } else { + fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL); + } +} + +void ceph_fscache_update(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + loff_t i_size = i_size_read(inode); + + fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size); +} + +void ceph_fscache_invalidate(struct inode *inode, bool dio_write) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + fscache_invalidate(ceph_fscache_cookie(ci), + &ci->i_version, i_size_read(inode), + dio_write ? FSCACHE_INVAL_DIO_WRITE : 0); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) +{ + const struct ceph_fsid *fsid = &fsc->client->fsid; + const char *fscache_uniq = fsc->mount_options->fscache_uniq; + size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; + char *name; + int err = 0; + + name = kasprintf(GFP_KERNEL, "ceph,%pU%s%s", fsid, uniq_len ? "," : "", + uniq_len ? fscache_uniq : ""); + if (!name) + return -ENOMEM; + + fsc->fscache = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR_OR_NULL(fsc->fscache)) { + errorfc(fc, "Unable to register fscache cookie for %s", name); + err = fsc->fscache ? PTR_ERR(fsc->fscache) : -EOPNOTSUPP; + fsc->fscache = NULL; + } + kfree(name); + return err; +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + fscache_relinquish_volume(fsc->fscache, NULL, false); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 000000000000..20efac020394 --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. 
+ * Written by Milosz Tanski (milosz@adfin.com) + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#include <linux/netfs.h> + +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_register_inode_cookie(struct inode *inode); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +void ceph_fscache_use_cookie(struct inode *inode, bool will_modify); +void ceph_fscache_unuse_cookie(struct inode *inode, bool update); + +void ceph_fscache_update(struct inode *inode); +void ceph_fscache_invalidate(struct inode *inode, bool dio_write); + +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return netfs_i_cookie(&ci->netfs); +} + +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct fscache_cookie *cookie = ceph_fscache_cookie(ci); + + if (cookie) { + ceph_fscache_use_cookie(inode, true); + fscache_resize_cookie(cookie, to); + ceph_fscache_unuse_cookie(inode, true); + } +} + +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) +{ + return netfs_unpin_writeback(inode, wbc); +} + +#define ceph_fscache_dirty_folio netfs_dirty_folio + +static inline bool ceph_is_cache_enabled(struct inode *inode) +{ + return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode))); +} + +#else /* CONFIG_CEPH_FSCACHE */ +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, + struct fs_context *fc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) +{ +} + +static inline void ceph_fscache_unuse_cookie(struct inode *inode, bool update) +{ +} + +static inline void ceph_fscache_update(struct inode *inode) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode, bool dio_write) +{ +} + +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + +static inline void ceph_fscache_resize(struct inode *inode, loff_t to) +{ +} + +static inline int ceph_fscache_unpin_writeback(struct inode *inode, + struct writeback_control *wbc) +{ + return 0; +} + +#define ceph_fscache_dirty_folio filemap_dirty_folio + +static inline bool ceph_is_cache_enabled(struct inode *inode) +{ + return false; +} +#endif /* CONFIG_CEPH_FSCACHE */ + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 25442b40c25a..b1a8ff612c41 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,15 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/writeback.h> +#include <linux/iversion.h> +#include <linux/filelock.h> +#include <linux/jiffies.h> #include "super.h" #include "mds_client.h" +#include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -39,6 +45,11 @@ * cluster to release server state. 
*/ +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. @@ -62,6 +73,8 @@ static char *gcap_string(char *s, int c) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; + if (c & CEPH_CAP_GWREXTEND) + *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; @@ -139,24 +152,69 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); - mdsc->caps_min_count += delta; - BUG_ON(mdsc->caps_min_count < 0); + mdsc->caps_min_count = fsopt->max_readdir; + if (mdsc->caps_min_count < 1024) + mdsc->caps_min_count = 1024; + mdsc->caps_use_max = fsopt->caps_max; + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_max < mdsc->caps_min_count) + mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } -void ceph_reserve_caps(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx, int need) +static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) { + struct ceph_cap *cap; int i; + + if (nr_caps) { + BUG_ON(mdsc->caps_reserve_count < nr_caps); + mdsc->caps_reserve_count -= nr_caps; + if (mdsc->caps_avail_count >= + mdsc->caps_reserve_count + mdsc->caps_min_count) { + mdsc->caps_total_count -= nr_caps; + for (i = 0; i < nr_caps; i++) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + } else { + mdsc->caps_avail_count += nr_caps; + } + + doutc(mdsc->fsc->client, + "caps %d = %d used + %d resv + %d avail\n", + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + } +} + +/* + * Called under mdsc->mutex. 
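 *
 * A reservation is a promise that ceph_get_cap() will not fail: the
 * caller reserves caps up front, draws them out of the context, and
 * finally returns whatever is left, roughly:
 *
 *	struct ceph_cap_reservation ctx = { 0 };
 *
 *	err = ceph_reserve_caps(mdsc, &ctx, need);  /* may trim sessions */
 *	...
 *	cap = ceph_get_cap(mdsc, &ctx);             /* consumes ctx->count */
 *	...
 *	ceph_unreserve_caps(mdsc, &ctx);            /* give back the rest */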
+ */ +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) +{ + struct ceph_client *cl = mdsc->fsc->client; + int i, j; struct ceph_cap *cap; int have; int alloc = 0; + int max_caps; + int err = 0; + bool trimmed = false; + struct ceph_mds_session *s; LIST_HEAD(newcaps); - dout("reserve caps ctx=%p need=%d\n", ctx, need); + doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); @@ -171,17 +229,62 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc, mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); - for (i = have; i < need; i++) { + for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) - break; - list_add(&cap->caps_item, &newcaps); - alloc++; + if (cap) { + list_add(&cap->caps_item, &newcaps); + alloc++; + i++; + continue; + } + + if (!trimmed) { + for (j = 0; j < mdsc->max_sessions; j++) { + s = __ceph_lookup_mds_session(mdsc, j); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + max_caps = s->s_nr_caps - (need - i); + ceph_trim_caps(mdsc, s, max_caps); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + trimmed = true; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + int more_have; + if (mdsc->caps_avail_count >= need - i) + more_have = need - i; + else + more_have = mdsc->caps_avail_count; + + i += more_have; + have += more_have; + mdsc->caps_avail_count -= more_have; + mdsc->caps_reserve_count += more_have; + + } + spin_unlock(&mdsc->caps_list_lock); + + continue; + } + + pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, + have + alloc); + err = -ENOMEM; + break; + } + + if (!err) { + BUG_ON(have + alloc != need); + ctx->count = need; + ctx->used = 0; } - /* we didn't manage to reserve as much as we needed */ - if (have + alloc != need) - pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -191,38 +294,44 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc, BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); + + if (err) + __ceph_unreserve_caps(mdsc, have + alloc); + spin_unlock(&mdsc->caps_list_lock); - ctx->count = need; - dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", - ctx, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); -} - -int ceph_unreserve_caps(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) -{ - dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); - if (ctx->count) { - spin_lock(&mdsc->caps_list_lock); - BUG_ON(mdsc->caps_reserve_count < ctx->count); - mdsc->caps_reserve_count -= ctx->count; - mdsc->caps_avail_count += ctx->count; - ctx->count = 0; - dout("unreserve caps %d = %d used + %d resv + %d avail\n", - mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); - BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + - mdsc->caps_reserve_count + - mdsc->caps_avail_count); - spin_unlock(&mdsc->caps_list_lock); - } - return 0; + doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + return err; +} + +void ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) +{ + 
struct ceph_client *cl = mdsc->fsc->client; + bool reclaim = false; + if (!ctx->count) + return; + + doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); + spin_lock(&mdsc->caps_list_lock); + __ceph_unreserve_caps(mdsc, ctx->count); + ctx->count = 0; + + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + reclaim = true; + spin_unlock(&mdsc->caps_list_lock); + + if (reclaim) + ceph_reclaim_caps_nr(mdsc, ctx->used); } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ @@ -233,19 +342,36 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); + } else { + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + BUG_ON(list_empty(&mdsc->caps_list)); + + mdsc->caps_avail_count--; + mdsc->caps_use_count++; + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + } + spin_unlock(&mdsc->caps_list_lock); } + return cap; } spin_lock(&mdsc->caps_list_lock); - dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", - ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, + ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; + ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; @@ -260,10 +386,12 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { + struct ceph_client *cl = mdsc->fsc->client; + spin_lock(&mdsc->caps_list_lock); - dout("put_cap %p %d = %d used + %d resv + %d avail\n", - cap, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to @@ -289,6 +417,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, { struct ceph_mds_client *mdsc = fsc->mdsc; + spin_lock(&mdsc->caps_list_lock); + if (total) *total = mdsc->caps_total_count; if (avail) @@ -299,6 +429,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; + + spin_unlock(&mdsc->caps_list_lock); } /* @@ -306,7 +438,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, * * Called with i_ceph_lock held. */ -static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; @@ -334,37 +466,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) } /* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. 
- */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - -/* * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, @@ -396,14 +497,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_options *ma = mdsc->fsc->mount_options; + struct inode *inode = &ci->netfs.inode; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; - ci->i_hold_caps_min = round_jiffies(jiffies + - ma->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + - ma->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, - ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); + opt->caps_wanted_delay_max * HZ); + doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, + ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* @@ -417,9 +517,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - __cap_set_timeouts(mdsc, ci); - dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, - ci->i_ceph_flags, ci->i_hold_caps_max); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", + inode, ceph_vinop(inode), ci->i_ceph_flags, + ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { @@ -427,6 +529,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, goto no_change; list_del_init(&ci->i_cap_delay_list); } + __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); @@ -441,7 +544,9 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -458,7 +563,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -466,83 +573,106 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->cap_delay_lock); } -/* - * Common issue checks for add_cap, handle_cap_grant. - */ +/* Common issue checks for add_cap, handle_cap_grant. 
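 * In short: a fresh FILE_CACHE grant bumps i_rdcache_gen; any
 * FILE_SHARED transition invalidates dir completeness (a new grant
 * also bumps i_shared_gen); and losing DIR_CREATE drops the cached
 * directory layout.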
*/ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + unsigned had = __ceph_caps_issued(ci, NULL); + lockdep_assert_held(&ci->i_ceph_lock); + /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + if (S_ISREG(ci->netfs.inode.i_mode) && + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* - * if we are newly issued FILE_SHARED, mark dir not complete; we - * don't know what happened to this directory while we didn't - * have the cap. + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. */ - if ((issued & CEPH_CAP_FILE_SHARED) && - (had & CEPH_CAP_FILE_SHARED) == 0) { - ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + atomic_inc(&ci->i_shared_gen); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } + + /* Wipe saved layout if we're losing DIR_CREATE caps */ + if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + !(issued & CEPH_CAP_DIR_CREATE)) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } +} + +/** + * change_auth_cap_ses - move inode to appropriate lists when auth caps change + * @ci: inode to be moved + * @session: new auth caps session + */ +void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) +{ + lockdep_assert_held(&ci->i_ceph_lock); + + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) + return; + + spin_lock(&session->s_mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) + list_move(&ci->i_dirty_item, &session->s_cap_dirty); + if (!list_empty(&ci->i_flushing_item)) + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + spin_unlock(&session->s_mdsc->cap_dirty_lock); } /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) 
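 *
 * A possible caller shape under this contract (sketch only; real
 * callers are in the cap message handlers, and the NULL check after
 * ceph_get_cap() is elided here):
 *
 *	struct ceph_cap *new_cap = ceph_get_cap(mdsc, NULL);
 *
 *	spin_lock(&ci->i_ceph_lock);
 *	ceph_add_cap(inode, session, cap_id, issued, wanted,
 *		     seq, mseq, realmino, flags, &new_cap);
 *	spin_unlock(&ci->i_ceph_lock);
 *	if (new_cap)		/* untouched if a cap already existed */
 *		ceph_put_cap(mdsc, new_cap);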
*/ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; - dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, - session->s_mds, cap_id, ceph_cap_string(issued), seq); + lockdep_assert_held(&ci->i_ceph_lock); - /* - * If we are opening the file, include file mode wanted bits - * in wanted. - */ - if (fmode >= 0) - wanted |= ceph_caps_for_mode(fmode); + doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, + ceph_vinop(inode), session->s_mds, cap_id, + ceph_cap_string(issued), seq); + + gen = atomic_read(&session->s_cap_gen); -retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&ci->i_ceph_lock); - new_cap = get_cap(mdsc, caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; @@ -553,40 +683,54 @@ retry: cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; + atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + spin_lock(&session->s_cap_lock); + list_move_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + if (cap->cap_gen < gen) + cap->issued = cap->implemented = CEPH_CAP_PIN; + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. 
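+ *
+ * (ceph_seq_cmp() compares as a signed 32-bit difference, so this
+ * ordering test stays correct across u32 sequence wraparound, e.g.
+ * ceph_seq_cmp(1, 0xffffffff) > 0.)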
+ */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } - if (!ci->i_snap_realm) { + if (!ci->i_snap_realm || + ((flags & CEPH_CAP_FLAG_AUTH) && + realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); - if (realm) { - ceph_get_snap_realm(mdsc, realm); - spin_lock(&realm->inodes_with_caps_lock); - ci->i_snap_realm = realm; - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - spin_unlock(&realm->inodes_with_caps_lock); - } else { - pr_err("ceph_add_cap: couldn't find snap realm %llx\n", - realmino); - WARN_ON(!realm); - } + if (realm) + ceph_change_snap_realm(inode, realm); + else + WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", + __func__, realmino, ci->i_vino.ino, + ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); @@ -599,47 +743,40 @@ retry: actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { - dout(" issued %s, mds wanted %s, actual %s, queueing\n", - ceph_cap_string(issued), ceph_cap_string(wanted), - ceph_cap_string(actual_wanted)); + doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { - if (ci->i_auth_cap == NULL || - ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + if (!ci->i_auth_cap || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + if (ci->i_auth_cap && + ci->i_auth_cap->session != cap->session) + change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; - } else if (ci->i_auth_cap == cap) { - ci->i_auth_cap = NULL; - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + cap->mds_wanted = wanted; } - spin_unlock(&mdsc->cap_dirty_lock); + } else { + WARN_ON(ci->i_auth_cap == cap); } - dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", - inode, ceph_vinop(inode), cap, ceph_cap_string(issued), - ceph_cap_string(issued|cap->issued), seq, mds); + doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - if (mseq > cap->mseq) + if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; - - if (fmode >= 0) - __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); + cap->cap_gen = gen; wake_up_all(&ci->i_cap_wq); - return 0; } /* @@ -649,18 +786,18 @@ retry: */ static int __cap_is_valid(struct ceph_cap *cap) { + struct inode *inode = &cap->ci->netfs.inode; + struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_gen_ttl_lock); - gen = cap->session->s_cap_gen; + gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { - 
dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, - cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -674,7 +811,9 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -684,8 +823,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, + ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -728,16 +867,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) */ static void __touch_cap(struct ceph_cap *cap) { + struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; + struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); - if (s->s_cap_iterator == NULL) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, - s->s_mds); + if (!s->s_cap_iterator) { + doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, + ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { - dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->vfs_inode, cap, s->s_mds); + doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", + inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -749,15 +890,16 @@ static void __touch_cap(struct ceph_cap *cap) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p snap issued %s" - " (mask %s)\n", &ci->vfs_inode, - ceph_cap_string(have), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", + inode, ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(mask)); return 1; } @@ -766,10 +908,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask %p cap %p issued %s" - " (mask %s)\n", &ci->vfs_inode, cap, - ceph_cap_string(cap->issued), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; @@ -778,10 +920,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? 
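 * (e.g. CEPH_CAP_FILE_SHARED issued by one MDS combined with
 * CEPH_CAP_XATTR_SHARED from another can together satisfy a
 * two-bit mask)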
*/ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask %p combo issued %s" - " (mask %s)\n", &ci->vfs_inode, - ceph_cap_string(cap->issued), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", + inode, ceph_vinop(inode), + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); if (touch) { struct rb_node *q; @@ -793,7 +935,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) ci_node); if (!__cap_is_valid(cap)) continue; - __touch_cap(cap); + if (cap->issued & mask) + __touch_cap(cap); } } return 1; @@ -803,6 +946,20 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) return 0; } +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int touch) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); + int r; + + r = __ceph_caps_issued_mask(ci, mask, touch); + if (r) + ceph_update_cap_hit(&fsc->mdsc->metric); + else + ceph_update_cap_mis(&fsc->mdsc->metric); + return r; +} + /* * Return true if mask caps are currently being revoked by an MDS. */ @@ -814,26 +971,13 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (cap != ocap && __cap_is_valid(cap) && + if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } return 0; } -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) -{ - struct inode *inode = &ci->vfs_inode; - int ret; - - spin_lock(&ci->i_ceph_lock); - ret = __ceph_caps_revoking_other(ci, NULL, mask); - spin_unlock(&ci->i_ceph_lock); - dout("ceph_caps_revoking %p %s = %d\n", inode, - ceph_cap_string(mask), ret); - return ret; -} - int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; @@ -841,32 +985,104 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (S_ISREG(ci->netfs.inode.i_mode) && + ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; + if (ci->i_fx_ref) + used |= CEPH_CAP_FILE_EXCL; return used; } +#define FMODE_WAIT_BIAS 1000 + /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int want = 0; - int mode; - for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) - if (ci->i_nr_by_mode[mode]) - want |= ceph_caps_for_mode(mode); - return want; + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); + struct ceph_mount_options *opt = + ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; + + if (S_ISDIR(ci->netfs.inode.i_mode)) { + int want = 0; + + /* use used_cutoff here, to keep dir's wanted caps longer */ + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || + time_after(ci->i_last_rd, used_cutoff)) + want |= CEPH_CAP_ANY_SHARED; + + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || + time_after(ci->i_last_wr, used_cutoff)) { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + want |= CEPH_CAP_ANY_DIR_OPS; + } + + if (want || 
ci->i_nr_by_mode[PIN_SHIFT] > 0) + want |= CEPH_CAP_PIN; + + return want; + } else { + int bits = 0; + + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_rd, used_cutoff)) + bits |= 1 << RD_SHIFT; + } else if (time_after(ci->i_last_rd, idle_cutoff)) { + bits |= 1 << RD_SHIFT; + } + + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_wr, used_cutoff)) + bits |= 1 << WR_SHIFT; + } else if (time_after(ci->i_last_wr, idle_cutoff)) { + bits |= 1 << WR_SHIFT; + } + + /* check lazyio only when read/write is wanted */ + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && + ci->i_nr_by_mode[LAZY_SHIFT] > 0) + bits |= 1 << LAZY_SHIFT; + + return bits ? ceph_caps_for_mode(bits >> 1) : 0; + } +} + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + /* we want EXCL if holding caps of dir ops */ + if (w & CEPH_CAP_ANY_DIR_OPS) + w |= CEPH_CAP_FILE_EXCL; + } else { + /* we want EXCL if dirty data */ + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; + } + return w; } /* * Return caps we have registered with the MDS(s) as 'wanted'. */ -int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; @@ -874,19 +1090,26 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (!__cap_is_valid(cap)) + if (check && !__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } -/* - * called under i_ceph_lock - */ -static int __ceph_is_any_caps(struct ceph_inode_info *ci) +int ceph_is_any_caps(struct inode *inode) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_real_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* @@ -895,324 +1118,490 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. 
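 *
 * When queue_release is true (and the session is not mid-reconnect),
 * the cap is queued via __ceph_queue_cap_release() so an explicit
 * release message reaches the MDS rather than the reference being
 * dropped silently.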
*/ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc; int removed = 0; - dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + doutc(cl, "inode is NULL\n"); + return; + } + + lockdep_assert_held(&ci->i_ceph_lock); + + doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); + + mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; + + /* remove from inode's cap rbtree, and clear auth cap */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ - dout("__ceph_remove_cap delaying %p removal from session %p\n", - cap, cap->session); + doutc(cl, "delaying %p removal from session %p\n", cap, + cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; + atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; - spin_unlock(&session->s_cap_lock); - /* remove from inode list */ - rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == atomic_read(&session->s_cap_gen))) { + cap->queue_release = 1; + if (removed) { + __ceph_queue_cap_release(session, cap); + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + + spin_unlock(&session->s_cap_lock); if (removed) ceph_put_cap(mdsc, cap); - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); - } - if (!__ceph_is_any_real_caps(ci)) + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + ceph_change_snap_realm(&ci->netfs.inode, NULL); + __cap_delay_cancel(mdsc, ci); + } } -/* - * Build and send a cap message to the given MDS. - * - * Caller should be holding s_mutex. 
- */ -static int send_cap_msg(struct ceph_mds_session *session, - u64 ino, u64 cid, int op, - int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, - struct timespec *mtime, struct timespec *atime, - u64 time_warp_seq, - kuid_t uid, kgid_t gid, umode_t mode, - u64 xattr_version, - struct ceph_buffer *xattrs_buf, - u64 follows) +void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + bool queue_release) { - struct ceph_mds_caps *fc; - struct ceph_msg *msg; - - dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), - cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), - ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, - xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + struct ceph_inode_info *ci = cap->ci; + struct ceph_fs_client *fsc; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); - if (!msg) - return -ENOMEM; + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + doutc(mdsc->fsc->client, "inode is NULL\n"); + return; + } - msg->hdr.tid = cpu_to_le64(flush_tid); + lockdep_assert_held(&ci->i_ceph_lock); - fc = msg->front.iov_base; - memset(fc, 0, sizeof(*fc)); + fsc = ceph_inode_to_fs_client(&ci->netfs.inode); + WARN_ON_ONCE(ci->i_auth_cap == cap && + !list_empty(&ci->i_dirty_item) && + !fsc->blocklisted && + !ceph_inode_is_shutdown(&ci->netfs.inode)); - fc->cap_id = cpu_to_le64(cid); - fc->op = cpu_to_le32(op); - fc->seq = cpu_to_le32(seq); - fc->issue_seq = cpu_to_le32(issue_seq); - fc->migrate_seq = cpu_to_le32(mseq); - fc->caps = cpu_to_le32(caps); - fc->wanted = cpu_to_le32(wanted); - fc->dirty = cpu_to_le32(dirty); - fc->ino = cpu_to_le64(ino); - fc->snap_follows = cpu_to_le64(follows); - - fc->size = cpu_to_le64(size); - fc->max_size = cpu_to_le64(max_size); - if (mtime) - ceph_encode_timespec(&fc->mtime, mtime); - if (atime) - ceph_encode_timespec(&fc->atime, atime); - fc->time_warp_seq = cpu_to_le32(time_warp_seq); - - fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); - fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); - fc->mode = cpu_to_le32(mode); - - fc->xattr_version = cpu_to_le64(xattr_version); - if (xattrs_buf) { - msg->middle = ceph_buffer_get(xattrs_buf); - fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); - msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); - } - - ceph_con_send(&session->s_con, msg); - return 0; + __ceph_remove_cap(cap, queue_release); } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +struct cap_msg_args { + struct ceph_mds_session *session; + u64 ino, cid, follows; + u64 flush_tid, oldest_flush_tid, size, max_size; + u64 xattr_version; + u64 change_attr; + struct ceph_buffer *xattr_buf; + struct ceph_buffer *old_xattr_buf; + struct timespec64 atime, mtime, ctime, btime; + int op, caps, wanted, dirty; + u32 seq, issue_seq, mseq, time_warp_seq; + u32 flags; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool inline_data; + bool wake; + bool encrypted; + u32 fscrypt_auth_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context +}; + +/* Marshal up the cap msg to the MDS */ +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; + struct ceph_mds_caps *fc; + 
void *p; + struct ceph_mds_client *mdsc = arg->session->s_mdsc; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + + doutc(mdsc->fsc->client, + "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" + " tid %llu/%llu mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", + ceph_cap_op_name(arg->op), arg->cid, arg->ino, + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, + arg->size, arg->max_size, arg->xattr_version, + arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); + + msg->hdr.version = cpu_to_le16(12); + msg->hdr.tid = cpu_to_le64(arg->flush_tid); - spin_lock(&session->s_cap_lock); - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); + fc = msg->front.iov_base; + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(arg->cid); + fc->op = cpu_to_le32(arg->op); + fc->seq = cpu_to_le32(arg->seq); + fc->issue_seq = cpu_to_le32(arg->issue_seq); + fc->migrate_seq = cpu_to_le32(arg->mseq); + fc->caps = cpu_to_le32(arg->caps); + fc->wanted = cpu_to_le32(arg->wanted); + fc->dirty = cpu_to_le32(arg->dirty); + fc->ino = cpu_to_le64(arg->ino); + fc->snap_follows = cpu_to_le64(arg->follows); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (arg->encrypted) + fc->size = cpu_to_le64(round_up(arg->size, + CEPH_FSCRYPT_BLOCK_SIZE)); + else +#endif + fc->size = cpu_to_le64(arg->size); + fc->max_size = cpu_to_le64(arg->max_size); + ceph_encode_timespec64(&fc->mtime, &arg->mtime); + ceph_encode_timespec64(&fc->atime, &arg->atime); + ceph_encode_timespec64(&fc->ctime, &arg->ctime); + fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); + + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); + fc->mode = cpu_to_le32(arg->mode); + + fc->xattr_version = cpu_to_le64(arg->xattr_version); + if (arg->xattr_buf) { + msg->middle = ceph_buffer_get(arg->xattr_buf); + fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); } - spin_unlock(&session->s_cap_lock); + + p = fc + 1; + /* flock buffer size (version 2) */ + ceph_encode_32(&p, 0); + /* inline version (version 4) */ + ceph_encode_64(&p, arg->inline_data ? 
0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + /* + * osd_epoch_barrier (version 5) + * The epoch_barrier is protected osdc->lock, so READ_ONCE here in + * case it was recently changed + */ + ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); + /* oldest_flush_tid (version 6) */ + ceph_encode_64(&p, arg->oldest_flush_tid); + + /* + * caller_uid/caller_gid (version 7) + * + * Currently, we don't properly track which caller dirtied the caps + * last, and force a flush of them when there is a conflict. For now, + * just set this to 0:0, to emulate how the MDS has worked up to now. + */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); + + /* pool namespace (version 8) (mds always ignores this) */ + ceph_encode_32(&p, 0); + + /* btime and change_attr (version 9) */ + ceph_encode_timespec64(p, &arg->btime); + p += sizeof(struct ceph_timespec); + ceph_encode_64(&p, arg->change_attr); + + /* Advisory flags (version 10) */ + ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* + * fscrypt_auth and fscrypt_file (version 12) + * + * fscrypt_auth holds the crypto context (if any). fscrypt_file + * tracks the real i_size as an __le64 field (and we use a rounded-up + * i_size in the traditional size field). + */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, sizeof(__le64)); + ceph_encode_64(&p, arg->size); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* - * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_ceph_lock. + * Queue cap releases when an inode is dropped from our cache. */ -void ceph_queue_caps_release(struct inode *inode) +void __ceph_remove_caps(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; + /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) + * may call __ceph_caps_issued_mask() on a freeing inode. */ + spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + ceph_remove_cap(mdsc, cap, true); } + spin_unlock(&ci->i_ceph_lock); } /* - * Send a cap msg on the given inode. Update our caps state, then - * drop i_ceph_lock and send the message. + * Prepare to send a cap message to an MDS. Update the cap state, and populate + * the arg struct with the parameters that will need to be sent. This should + * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. - * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * - * Return non-zero if delayed release, or we experienced an error - * such that the caller should requeue + retry later. - * - * called with i_ceph_lock, then drops it. 
- * caller should hold snap_rwsem (read), s_mutex. */ -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) - __releases(cap->ci->i_ceph_lock) +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, + int op, int flags, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; - struct inode *inode = &ci->vfs_inode; - u64 cap_id = cap->cap_id; - int held, revoking, dropping, keep; - u64 seq, issue_seq, mseq, time_warp_seq, follows; - u64 size, max_size; - struct timespec mtime, atime; - int wake = 0; - umode_t mode; - kuid_t uid; - kgid_t gid; - struct ceph_mds_session *session; - u64 xattr_version = 0; - struct ceph_buffer *xattr_blob = NULL; - int delayed = 0; - u64 flush_tid = 0; - int i; - int ret; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + int held, revoking; + + lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dropping = cap->issued & ~retain; - dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", - inode, cap, cap->session, - ceph_cap_string(held), ceph_cap_string(held & retain), - ceph_cap_string(revoking)); + doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", + inode, ceph_vinop(inode), cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); - session = cap->session; - - /* don't release wanted unless we've waited a bit. */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_min)) { - dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - want |= cap->mds_wanted; - retain |= cap->issued; - delayed = 1; - } - ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ - if (cap->implemented & ~cap->issued) { - /* - * Wake up any waiters on wanted -> needed transition. - * This is due to the weird transition from buffered - * to sync IO... we need to flush dirty pages _before_ - * allowing sync writes to avoid reordering. - */ - wake = 1; - } + /* + * Wake up any waiters on wanted -> needed transition. This is due to + * the weird transition from buffered to sync IO... we need to flush + * dirty pages _before_ allowing sync writes to avoid reordering. + */ + arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. 
- */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } - - keep = cap->implemented; - seq = cap->seq; - issue_seq = cap->issue_seq; - mseq = cap->mseq; - size = inode->i_size; - ci->i_reported_size = size; - max_size = ci->i_wanted_max_size; - ci->i_requested_max_size = max_size; - mtime = inode->i_mtime; - atime = inode->i_atime; - time_warp_seq = ci->i_time_warp_seq; - uid = inode->i_uid; - gid = inode->i_gid; - mode = inode->i_mode; + arg->session = cap->session; + arg->ino = ceph_vino(inode).ino; + arg->cid = cap->cap_id; + arg->follows = flushing ? ci->i_head_snapc->seq : 0; + arg->flush_tid = flush_tid; + arg->oldest_flush_tid = oldest_flush_tid; + arg->size = i_size_read(inode); + ci->i_reported_size = arg->size; + arg->max_size = ci->i_wanted_max_size; + if (cap == ci->i_auth_cap) { + if (want & CEPH_CAP_ANY_FILE_WR) + ci->i_requested_max_size = arg->max_size; + else + ci->i_requested_max_size = 0; + } if (flushing & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - xattr_blob = ci->i_xattrs.blob; - xattr_version = ci->i_xattrs.version; + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); + arg->xattr_version = ci->i_xattrs.version; + arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob); + } else { + arg->xattr_buf = NULL; + arg->old_xattr_buf = NULL; } - spin_unlock(&ci->i_ceph_lock); + arg->mtime = inode_get_mtime(inode); + arg->atime = inode_get_atime(inode); + arg->ctime = inode_get_ctime(inode); + arg->btime = ci->i_btime; + arg->change_attr = inode_peek_iversion_raw(inode); + + arg->op = op; + arg->caps = cap->implemented; + arg->wanted = want; + arg->dirty = flushing; + + arg->seq = cap->seq; + arg->issue_seq = cap->issue_seq; + arg->mseq = cap->mseq; + arg->time_warp_seq = ci->i_time_warp_seq; + + arg->uid = inode->i_uid; + arg->gid = inode->i_gid; + arg->mode = inode->i_mode; + + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && + !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap; + list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->cap_flush.tid) + break; + if (capsnap->need_flush) { + flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; + break; + } + } + } + arg->flags = flags; + arg->encrypted = IS_ENCRYPTED(inode); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } +#endif /* CONFIG_FS_ENCRYPTION */ +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) - ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, - size, max_size, &mtime, &atime, time_warp_seq, - uid, gid, mode, xattr_version, xattr_blob, - follows); - if (ret < 0) { - dout("error sending cap msg, must requeue %p\n", inode); - delayed = 1; +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + 
arg->fscrypt_auth_len; +} +#else +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; +} +#endif /* CONFIG_FS_ENCRYPTION */ + +/* + * Send a cap msg on the given inode. + * + * Caller should hold snap_rwsem (read), s_mutex. + */ +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) +{ + struct ceph_msg *msg; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); + if (!msg) { + pr_err_client(cl, + "error allocating cap msg: ino (%llx.%llx)" + " flushing %s tid %llu, requeuing cap.\n", + ceph_vinop(inode), ceph_cap_string(arg->dirty), + arg->flush_tid); + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(arg->session->s_mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + return; } - if (wake) + encode_cap_msg(msg, arg); + ceph_con_send(&arg->session->s_con, msg); + ceph_buffer_put(arg->old_xattr_buf); + ceph_buffer_put(arg->xattr_buf); + if (arg->wake) wake_up_all(&ci->i_cap_wq); +} - return delayed; +static inline int __send_flush_snap(struct inode *inode, + struct ceph_mds_session *session, + struct ceph_cap_snap *capsnap, + u32 mseq, u64 oldest_flush_tid) +{ + struct cap_msg_args arg; + struct ceph_msg *msg; + + arg.session = session; + arg.ino = ceph_vino(inode).ino; + arg.cid = 0; + arg.follows = capsnap->follows; + arg.flush_tid = capsnap->cap_flush.tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = capsnap->size; + arg.max_size = 0; + arg.xattr_version = capsnap->xattr_version; + arg.xattr_buf = capsnap->xattr_blob; + arg.old_xattr_buf = NULL; + + arg.atime = capsnap->atime; + arg.mtime = capsnap->mtime; + arg.ctime = capsnap->ctime; + arg.btime = capsnap->btime; + arg.change_attr = capsnap->change_attr; + + arg.op = CEPH_CAP_OP_FLUSHSNAP; + arg.caps = capsnap->issued; + arg.wanted = 0; + arg.dirty = capsnap->dirty; + + arg.seq = 0; + arg.issue_seq = 0; + arg.mseq = mseq; + arg.time_warp_seq = capsnap->time_warp_seq; + + arg.uid = capsnap->uid; + arg.gid = capsnap->gid; + arg.mode = capsnap->mode; + + arg.inline_data = capsnap->inline_data; + arg.flags = 0; + arg.wake = false; + arg.encrypted = IS_ENCRYPTED(inode); + + /* No fscrypt_auth changes from a capsnap.*/ + arg.fscrypt_auth_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; + + encode_cap_msg(msg, &arg); + ceph_con_send(&arg.session->s_con, msg); + return 0; } /* @@ -1222,37 +1611,24 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to - * the MDS (i.e., during this session). - * - * Called under i_ceph_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. 
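+ *
+ * Each flushable capsnap is first assigned a tid from
+ * mdsc->last_cap_flush_tid and linked on both the global and
+ * per-inode flush lists; messages are then sent in tid order, so
+ * snap flushes share one tid space with ordinary cap flushes and
+ * can be acked and re-kicked uniformly.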
*/ -void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int again) +static void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; - int mds; + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; - u32 mseq; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct ceph_mds_session *session = NULL; /* if session != NULL, we hold - session->s_mutex */ - u64 next_follows = 0; /* keep track of how far we've gotten through the - i_cap_snaps list, and skip these entries next time - around to avoid an infinite loop */ + u64 oldest_flush_tid = 0; + u64 first_tid = 1, last_tid = 0; - if (psession) - session = *psession; + doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), + session); - dout("__flush_snaps %p\n", inode); -retry: list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - /* avoid an infiniute loop after retry */ - if (capsnap->follows < next_follows) - continue; /* * we need to wait for sync writes to complete and for dirty * pages to be written out. @@ -1260,98 +1636,142 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); - - /* pick mds, take s_mutex */ - if (ci->i_auth_cap == NULL) { - dout("no auth cap (migrating?), doing nothing\n"); - goto out; - } + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { - dout("already flushed %p, skipping\n", capsnap); + if (capsnap->cap_flush.tid > 0) { + doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } - mds = ci->i_auth_cap->session->s_mds; - mseq = ci->i_auth_cap->mseq; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&capsnap->cap_flush.g_list, + &mdsc->cap_flush_list); + if (oldest_flush_tid == 0) + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, + &session->s_cap_flushing); + } + spin_unlock(&mdsc->cap_dirty_lock); + + list_add_tail(&capsnap->cap_flush.i_list, + &ci->i_cap_flush_list); + + if (first_tid == 1) + first_tid = capsnap->cap_flush.tid; + last_tid = capsnap->cap_flush.tid; + } - if (session && session->s_mds != mds) { - dout("oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - session = NULL; + ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; + + while (first_tid <= last_tid) { + struct ceph_cap *cap = ci->i_auth_cap; + struct ceph_cap_flush *cf = NULL, *iter; + int ret; + + if (!(cap && cap->session == session)) { + doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", + inode, ceph_vinop(inode), cap, session->s_mds); + break; } - if (!session) { - spin_unlock(&ci->i_ceph_lock); - mutex_lock(&mdsc->mutex); - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); - if (session) { - dout("inverting session/ino locks on %p\n", - session); - mutex_lock(&session->s_mutex); + + ret = -ENOENT; + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; + ret = 0; + 
break; } - /* - * if session == NULL, we raced against a cap - * deletion or migration. retry, and we'll - * get a better @mds value next time. - */ - spin_lock(&ci->i_ceph_lock); - goto retry; } + if (ret < 0) + break; - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; - atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + first_tid = cf->tid + 1; + + capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); - dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", - inode, capsnap, capsnap->follows, capsnap->flush_tid); - send_cap_msg(session, ceph_vino(inode).ino, 0, - CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, - &capsnap->mtime, &capsnap->atime, - capsnap->time_warp_seq, - capsnap->uid, capsnap->gid, capsnap->mode, - capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows); - - next_follows = capsnap->follows + 1; - ceph_put_cap_snap(capsnap); + doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, + ceph_vinop(inode), capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err_client(cl, "error sending cap flushsnap, " + "ino (%llx.%llx) tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, + capsnap->follows); + } + ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); + } +} + +void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession) +{ + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_mds_session *session = NULL; + bool need_put = false; + int mds; + + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); + if (psession) + session = *psession; +retry: + spin_lock(&ci->i_ceph_lock); + if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { + doutc(cl, " no capsnap needs flush, doing nothing\n"); + goto out; + } + if (!ci->i_auth_cap) { + doutc(cl, " no auth cap (migrating?), doing nothing\n"); + goto out; + } + + mds = ci->i_auth_cap->session->s_mds; + if (session && session->s_mds != mds) { + doutc(cl, " oops, wrong session %p mutex\n", session); + ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); goto retry; } - /* we flushed them all; remove this inode from the queue */ - spin_lock(&mdsc->snap_flush_lock); - list_del_init(&ci->i_snap_flush_item); - spin_unlock(&mdsc->snap_flush_lock); + // make sure flushsnap messages are sent in proper order. 
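+ // If earlier cap flushes still need re-sending (CEPH_I_KICK_FLUSH,
+ // e.g. after an MDS session reset), kick them first so the MDS sees
+ // flush tids in increasing order ahead of any flushsnap message.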
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + __ceph_flush_snaps(ci, session); out: + spin_unlock(&ci->i_ceph_lock); + if (psession) *psession = session; - else if (session) { - mutex_unlock(&session->s_mutex); + else ceph_put_mds_session(session); - } -} + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + if (!list_empty(&ci->i_snap_flush_item)) + need_put = true; + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); -static void ceph_flush_snaps(struct ceph_inode_info *ci) -{ - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, NULL, 0); - spin_unlock(&ci->i_ceph_lock); + if (need_put) + iput(inode); } /* @@ -1359,36 +1779,54 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct inode *inode = &ci->vfs_inode; + ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, - ceph_cap_string(mask), ceph_cap_string(was), - ceph_cap_string(was | mask)); + lockdep_assert_held(&ci->i_ceph_lock); + + if (!ci->i_auth_cap) { + pr_warn_client(cl, "%p %llx.%llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_vinop(inode), + ceph_cap_string(mask)); + return 0; + } + + doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, + ceph_vinop(inode), ceph_cap_string(mask), + ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + struct ceph_mds_session *session = ci->i_auth_cap->session; + + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + } + doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", + inode, ceph_vinop(inode), ci->i_head_snapc, + ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - if (ci->i_auth_cap) - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); - else - list_add(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1398,130 +1836,244 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + struct ceph_cap_flush *cf; + + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + if (!cf) + return NULL; + + cf->is_capsnap = false; + return cf; +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct 
ceph_mds_client *mdsc) +{ + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + return cf->tid; + } + return 0; +} + +/* + * Remove cap_flush from the mdsc's or inode's flushing cap list. + * Return true if caller needs to wake up flush waiters. + */ +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; + } + list_del_init(&cf->g_list); + return wake; +} + +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del_init(&cf->i_list); + return wake; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) +static u64 __mark_caps_flushing(struct inode *inode, + struct ceph_mds_session *session, bool wake, + u64 *oldest_flush_tid) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; + lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; - dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", - ceph_cap_string(flushing), - ceph_cap_string(ci->i_flushing_caps), - ceph_cap_string(ci->i_flushing_caps | flushing)); + doutc(cl, "flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; - dout(" inode %p now !dirty\n", inode); + doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); + + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; + cf->tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&cf->g_list, &mdsc->cap_flush_list); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); - } else { - list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); } spin_unlock(&mdsc->cap_dirty_lock); - return flushing; + list_add_tail(&cf->i_list, &ci->i_cap_flush_list); + + return cf->tid; } /* * try to invalidate mapping pages without blocking. 
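 * (returns 0 on success, or -1 if locked or racing pages prevented a
 * full invalidation; callers then fall back to queueing an async
 * invalidate)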
*/ static int try_nonblocking_invalidate(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ - dout("try_nonblocking_invalidate %p success\n", inode); + doutc(cl, "%p %llx.%llx success\n", inode, + ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } - dout("try_nonblocking_invalidate %p failed\n", inode); + doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } +bool __ceph_should_report_size(struct ceph_inode_info *ci) +{ + loff_t size = i_size_read(&ci->netfs.inode); + /* mds will adjust max size according to the reported size */ + if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) + return false; + if (size >= ci->i_max_size) + return true; + /* half of previous max_size increment has been used */ + if (ci->i_max_size > ci->i_reported_size && + (size << 1) >= ci->i_max_size + ci->i_reported_size) + return true; + return false; +} + /* * Swiss army knife function to examine currently used and wanted * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * - * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay - * cap release further. * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. + * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without + * further delay. */ -void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session) +void ceph_check_caps(struct ceph_inode_info *ci, int flags) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; - int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int tried_invalidate = 0; - int delayed = 0, sent = 0, force_requeue = 0, num; - int queue_invalidate = 0; - int is_delayed = flags & CHECK_CAPS_NODELAY; - - /* if we are unmounting, flush any unused caps immediately. 
*/ - if (mdsc->stopping) - is_delayed = 1; + bool queue_invalidate = false; + bool tried_invalidate = false; + bool queue_writeback = false; + struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; + + /* Don't send messages until we get async create reply */ + spin_unlock(&ci->i_ceph_lock); + return; + } if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - - /* flush snaps first time around only */ - if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session, 0); - goto retry_locked; retry: - spin_lock(&ci->i_ceph_lock); -retry_locked: + /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); + + /* Caps which have active references against them */ used = __ceph_caps_used(ci); - want = file_wanted | used; + + /* + * "issued" represents the current caps that the MDS wants us to have. + * "implemented" is the set that we have been granted, and includes the + * ones that have not yet been returned to the MDS (the "revoking" set, + * usually because they have outstanding references). + */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + + /* The ones we currently want to retain (may be adjusted below) */ + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + if (IS_RDONLY(inode)) { + want = CEPH_CAP_ANY_SHARED; + } else { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } + retain |= want; } else { + retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, @@ -1533,348 +2085,438 @@ retry_locked: } } - dout("check_caps %p file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s%s\n", inode, - ceph_cap_string(file_wanted), + doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " + "flushing %s issued %s revoking %s retain %s %s%s%s%s\n", + inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", + (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "", + (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : ""); /* * If we no longer need to hold onto old our caps, and we may * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ - if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ - (revoking & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && + S_ISREG(inode->i_mode) && + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... 
*/ + inode->i_data.nrpages && /* have cached pages */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) {
- dout("check_caps trying to invalidate on %p\n", inode);
+ doutc(cl, "trying to invalidate on %p %llx.%llx\n", + inode, ceph_vinop(inode));
 if (try_nonblocking_invalidate(inode) < 0) {
- if (revoking & (CEPH_CAP_FILE_CACHE| - CEPH_CAP_FILE_LAZYIO)) {
- dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
- } else {
- dout("check_caps failed to invalidate pages\n");
- /* we failed to invalidate pages. check these - caps again later. */
- force_requeue = 1;
- __cap_set_timeouts(mdsc, ci);
- }
+ doutc(cl, "queuing invalidate\n");
+ queue_invalidate = true;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
 }
- tried_invalidate = 1;
- goto retry_locked;
+ tried_invalidate = true;
+ goto retry;
 }
- num = 0;
 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ int mflags = 0;
+ struct cap_msg_args arg;
+
 cap = rb_entry(p, struct ceph_cap, ci_node);
- num++; /* avoid looping forever */
 if (mds >= cap->mds || ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue;
- /* NOTE: no side-effects allowed, until we take s_mutex */
-
+ /*
+ * If we have an auth cap, we don't need to consider any
+ * overlapping caps as used.
+ */
 cap_used = used;
 if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued;
 revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap->issued),
- ceph_cap_string(cap_used),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
+ doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finish the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
+ if (flags & CHECK_CAPS_FLUSH_FORCE) {
+ doutc(cl, "force to flush caps\n");
+ goto ack;
+ }
 if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) {
 /* request larger max_size from MDS? */
 if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
+ doutc(cl, "requesting new max_size\n");
 goto ack;
 }
 /* approaching file_max? */
- if ((inode->i_size << 1) >= ci->i_max_size &&
- (ci->i_reported_size << 1) < ci->i_max_size) {
- dout("i_size approaching max_size\n");
+ if (__ceph_should_report_size(ci)) {
+ doutc(cl, "i_size approaching max_size\n");
 goto ack;
 }
 }
 /* flush anything dirty? 
*/ - if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && - ci->i_dirty_caps) { - dout("flushing dirty caps\n"); - goto ack; - } - - /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & cap_used) == 0) { - dout("completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; + if (cap == ci->i_auth_cap) { + if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { + doutc(cl, "flushing dirty caps\n"); + goto ack; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { + doutc(cl, "flushing snap caps\n"); + goto ack; + } } /* want more caps from mds? */ - if (want & ~(cap->mds_wanted | cap->issued)) - goto ack; + if (want & ~cap->mds_wanted) { + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + if (!__cap_is_valid(cap)) + goto ack; + } /* things we might delay */ - if ((cap->issued & ~retain) == 0 && - cap->mds_wanted == want) + if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ - if (is_delayed) - goto ack; - - /* delay? */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_max)) { - dout(" delaying issued %s -> %s, wanted %s -> %s\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - delayed++; - continue; - } - ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } + ceph_put_mds_session(session); + session = ceph_get_mds_session(cap->session); - if (session && session != cap->session) { - dout("oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); - session = NULL; - } - if (!session) { - session = cap->session; - if (mutex_trylock(&session->s_mutex) == 0) { - dout("inverting session/ino locks on %p\n", - session); - spin_unlock(&ci->i_ceph_lock); - if (took_snap_rwsem) { - up_read(&mdsc->snap_rwsem); - took_snap_rwsem = 0; - } - mutex_lock(&session->s_mutex); - goto retry; - } - } - /* take snap_rwsem after session mutex */ - if (!took_snap_rwsem) { - if (down_read_trylock(&mdsc->snap_rwsem) == 0) { - dout("inverting snap/in locks on %p\n", - inode); - spin_unlock(&ci->i_ceph_lock); - down_read(&mdsc->snap_rwsem); - took_snap_rwsem = 1; - goto retry; - } - took_snap_rwsem = 1; + /* kick flushing and flush snaps before sending normal + * cap message */ + if (cap == ci->i_auth_cap && + (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + + goto retry; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); + if (flags & CHECK_CAPS_FLUSH && + list_empty(&session->s_cap_dirty)) + mflags |= CEPH_CLIENT_CAPS_SYNC; + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ - sent++; - /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, + want, retain, flushing, flush_tid, oldest_flush_tid); + + 
spin_unlock(&ci->i_ceph_lock); + __send_cap(&arg, ci); + spin_lock(&ci->i_ceph_lock); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } - /* - * Reschedule delayed caps release if we delayed anything, - * otherwise cancel. - */ - if (delayed && is_delayed) - force_requeue = 1; /* __send_cap delayed release; requeue */ - if (!delayed && !is_delayed) - __cap_delay_cancel(mdsc, ci); - else if (!is_delayed || force_requeue) + /* periodically re-calculate caps wanted by open files */ + if (__ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list) && + (file_wanted & ~CEPH_CAP_PIN) && + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { __cap_delay_requeue(mdsc, ci); + } spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); + if (queue_writeback) + ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); - - if (session) - mutex_unlock(&session->s_mutex); - if (took_snap_rwsem) - up_read(&mdsc->snap_rwsem); } /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; -retry: spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } +retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; - int used = __ceph_caps_used(ci); - int want = __ceph_caps_wanted(ci); - int delayed; + struct cap_msg_args arg; + struct ceph_mds_session *session = cap->session; - if (!session) { + if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); - session = cap->session; - mutex_lock(&session->s_mutex); - goto retry; - } - BUG_ON(session != cap->session); - if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; + } - flushing = __mark_caps_flushing(inode, session); + if (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) + __kick_flushing_caps(mdsc, session, ci, 0); + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + goto retry_locked; + } - /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __ceph_caps_used(ci), __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(&arg, ci); + } else { + if (!list_empty(&ci->i_cap_flush_list)) { + struct ceph_cap_flush *cf = + list_last_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + cf->wake = true; + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: - if (session && unlock_session) - mutex_unlock(&session->s_mutex); + *ptid = flush_tid; return 
flushing; }
 /*
 * Return true if we've flushed caps through the given flush_tid.
 */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 {
 struct ceph_inode_info *ci = ceph_inode(inode);
- int i, ret = 1;
+ int ret = 1;
 spin_lock(&ci->i_ceph_lock);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((ci->i_flushing_caps & (1 << i)) &&
- ci->i_cap_flush_tid[i] <= tid) {
- /* still flushing this bit */
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ if (cf->tid <= flush_tid)
 ret = 0;
- break;
- }
+ }
 spin_unlock(&ci->i_ceph_lock);
 return ret;
 }
 /*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
+ * flush the mdlog and wait for any unsafe requests to complete.
 */
-static void sync_write_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 {
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
 struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
 spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- /* set upper bound as _last_ entry in chain */
- req = list_entry(head->prev, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
+ /*
+ * Trigger to flush the journal logs in all the relevant MDSes
+ * manually, or in the worst case we must wait at most 5 seconds
+ * for the journal logs to be flushed by the MDSes periodically. 
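+ * (the send_flush_mdlog() calls below nudge each such MDS to flush
+ * its journal immediately instead of waiting for its periodic tick)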
+ */ + if (req1 || req2) { + struct ceph_mds_request *req; + struct ceph_mds_session **sessions; + struct ceph_mds_session *s; + unsigned int max_sessions; + int i; + + mutex_lock(&mdsc->mutex); + max_sessions = mdsc->max_sessions; + + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); + if (!sessions) { + mutex_unlock(&mdsc->mutex); + err = -ENOMEM; + goto out; + } - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); spin_lock(&ci->i_unsafe_lock); - ceph_osdc_put_request(req); + if (req1) { + list_for_each_entry(req, &ci->i_unsafe_dirops, + r_unsafe_dir_item) { + s = req->r_session; + if (!s) + continue; + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + if (req2) { + list_for_each_entry(req, &ci->i_unsafe_iops, + r_unsafe_target_item) { + s = req->r_session; + if (!s) + continue; + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + spin_unlock(&ci->i_unsafe_lock); + + /* the auth MDS */ + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) { + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); + } + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&mdsc->mutex); + + /* send flush mdlog request to MDSes */ + for (i = 0; i < max_sessions; i++) { + s = sessions[i]; + if (s) { + send_flush_mdlog(s); + ceph_put_mds_session(s); + } + } + kfree(sessions); + } + + doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, + ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, + req2 ? req2->r_tid : 0ULL); + if (req1) { + ret = !wait_for_completion_timeout(&req1->r_safe_completion, + ceph_timeout_jiffies(req1->r_timeout)); + if (ret) + err = -EIO; + } + if (req2) { + ret = !wait_for_completion_timeout(&req2->r_safe_completion, + ceph_timeout_jiffies(req2->r_timeout)); + if (ret) + err = -EIO; + } - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); out: - spin_unlock(&ci->i_unsafe_lock); + if (req1) + ceph_mdsc_put_request(req1); + if (req2) + ceph_mdsc_put_request(req2); + return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; - int ret; + struct ceph_client *cl = ceph_inode_to_client(inode); + u64 flush_tid; + int ret, err; int dirty; - dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - sync_write_wait(inode); + doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), + datasync ? 
" datasync" : ""); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret < 0) - return ret; - mutex_lock(&inode->i_mutex); + ret = file_write_and_wait_range(file, start, end); + if (datasync) + goto out; + + ret = ceph_wait_on_async_create(inode); + if (ret) + goto out; + + dirty = try_flush_caps(inode, &flush_tid); + doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); - dirty = try_flush_caps(inode, NULL, &flush_tid); - dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); - ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, + caps_are_flushed(inode, flush_tid)); } - dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); - mutex_unlock(&inode->i_mutex); + if (err < 0) + ret = err; + + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; +out: + doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), + datasync ? " datasync" : "", ret); return ret; } @@ -1887,20 +2529,25 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + struct ceph_client *cl = ceph_inode_to_client(inode); + u64 flush_tid; int err = 0; int dirty; - int wait = wbc->sync_mode == WB_SYNC_ALL; + int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); - dout("write_inode %p wait=%d\n", inode, wait); + doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); + ceph_fscache_unpin_writeback(inode, wbc); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + err = ceph_wait_on_async_create(inode); + if (err) + return err; + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) @@ -1910,34 +2557,137 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -/* - * After a recovering MDS goes active, we need to resend any caps - * we were flushing. - * - * Caller holds session->s_mutex. 
- */ -static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) +{ + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + int ret; + u64 first_tid = 0; + u64 last_snap_flush = 0; + + /* Don't do anything until create reply comes in */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) + return; + + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + + list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { + if (cf->is_capsnap) { + last_snap_flush = cf->tid; + break; + } + } + + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid < first_tid) + continue; + + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err_client(cl, "%p auth cap %p not mds%d ???\n", + inode, cap, session->s_mds); + break; + } + + first_tid = cf->tid + 1; + + if (!cf->is_capsnap) { + struct cap_msg_args arg; + + doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", + inode, ceph_vinop(inode), cap, cf->tid, + ceph_cap_string(cf->caps)); + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, + (cf->tid < last_snap_flush ? + CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + cf->caps, cf->tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + __send_cap(&arg, ci); + } else { + struct ceph_cap_snap *capsnap = + container_of(cf, struct ceph_cap_snap, + cap_flush); + doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", + inode, ceph_vinop(inode), capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); + + refcount_inc(&capsnap->nref); + spin_unlock(&ci->i_ceph_lock); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err_client(cl, "error sending cap flushsnap," + " %p %llx.%llx tid %llu follows %llu\n", + inode, ceph_vinop(inode), cf->tid, + capsnap->follows); + } + + ceph_put_cap_snap(capsnap); + } + + spin_lock(&ci->i_ceph_lock); + } +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_cap_snap *capsnap; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + u64 oldest_flush_tid; + + doutc(cl, "mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); - dout("kick_flushing_capsnaps mds%d\n", session->s_mds); - list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, - flushing_item) { - struct ceph_inode_info *ci = capsnap->ci; - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, - cap, capsnap); - __ceph_flush_snaps(ci, &session, 1); + if (!(cap && cap->session == session)) { + pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", + inode, ceph_vinop(inode), cap, + session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. 
This guarantees the MDS processes
+ * the cap flush message before issuing the flushing caps to
+ * other client.
+ */
+ if ((cap->issued & ci->i_flushing_caps) !=
+ ci->i_flushing_caps) {
+ /* encode_caps_cb() also will reset these sequence
+ * numbers. make sure sequence numbers in cap flush
+ * message match later reconnect message */
+ cap->seq = 0;
+ cap->issue_seq = 0;
+ cap->mseq = 0;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
 } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
 }
+
 spin_unlock(&ci->i_ceph_lock);
 }
 }
 @@ -1945,72 +2695,61 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 struct ceph_mds_session *session)
 {
+ struct ceph_client *cl = mdsc->fsc->client;
 struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
+
+ lockdep_assert_held(&session->s_mutex);
- kick_flushing_capsnaps(mdsc, session);
+ doutc(cl, "mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
- dout("kick_flushing_caps mds%d\n", session->s_mds);
 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int delayed = 0;
+ struct inode *inode = &ci->netfs.inode;
 spin_lock(&ci->i_ceph_lock);
 cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p %s\n", inode,
- cap, ceph_cap_string(ci->i_flushing_caps));
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ if (!(cap && cap->session == session)) {
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
 spin_unlock(&ci->i_ceph_lock);
+ continue;
 }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ }
+ spin_unlock(&ci->i_ceph_lock);
 }
 }
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct inode *inode)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
 {
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
- int delayed = 0;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct inode *inode = &ci->netfs.inode;
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ lockdep_assert_held(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 1);
+ doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(ci->i_flushing_caps));
- if (ci->i_flushing_caps) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
 spin_lock(&mdsc->cap_dirty_lock);
 list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
 spin_unlock(&mdsc->cap_dirty_lock);
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- 
__ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- spin_unlock(&ci->i_ceph_lock);
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
 }
 }
 @@ -2018,25 +2757,37 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 /*
 * Take references to capabilities we hold, so that we don't release
 * them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
 */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
 {
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ lockdep_assert_held(&ci->i_ceph_lock);
+
 if (got & CEPH_CAP_PIN)
 ci->i_pin_ref++;
 if (got & CEPH_CAP_FILE_RD)
 ci->i_rd_ref++;
 if (got & CEPH_CAP_FILE_CACHE)
 ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
+ if (got & CEPH_CAP_FILE_EXCL)
+ ci->i_fx_ref++;
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
 ci->i_wr_ref++;
+ }
 if (got & CEPH_CAP_FILE_BUFFER) {
 if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(inode);
 ci->i_wb_ref++;
- dout("__take_cap_refs %p wb %d -> %d (?)\n",
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
 }
 }
 @@ -2046,60 +2797,75 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
 * to (when applicable), and check against max_size here as well.
 * Note that caller is responsible for ensuring max_size increases are
 * requested from the MDS.
+ *
+ * Returns 0 if caps were not able to be acquired (yet), 1 on success,
+ * or a negative error code. There are 3 special error codes:
+ * -EAGAIN: need to sleep but non-blocking is specified
+ * -EFBIG: ask caller to call check_max_size() and try again.
+ * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. 
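+ *
+ * A typical caller loops on these special codes, roughly:
+ *
+ *	ret = try_get_cap_refs(inode, need, want, endoff, flags, &got);
+ *	if (ret == -EFBIG)
+ *		check_max_size(inode, endoff);	(and retry)
+ *	else if (ret == -EUCLEAN)
+ *		ceph_renew_caps(inode, flags);	(and retry)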
*/
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- int *got, loff_t endoff, int *check_max, int *err)
+enum {
+ /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+ NON_BLOCKING = (1 << 8),
+ CHECK_FILELOCK = (1 << 9),
+};
+
+static int try_get_cap_refs(struct inode *inode, int need, int want,
+ loff_t endoff, int flags, int *got)
 {
- struct inode *inode = &ci->vfs_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
 int ret = 0;
 int have, implemented;
- int file_wanted;
+ bool snap_rwsem_locked = false;
+
+ doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(want));
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
+again:
 spin_lock(&ci->i_ceph_lock);
- /* make sure file is actually open */
- file_wanted = __ceph_caps_file_wanted(ci);
- if ((file_wanted & need) == 0) {
- dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
- ceph_cap_string(need), ceph_cap_string(file_wanted));
- *err = -EBADF;
- ret = 1;
- goto out;
+ if ((flags & CHECK_FILELOCK) &&
+ (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
+ doutc(cl, "%p %llx.%llx error filelock\n", inode,
+ ceph_vinop(inode));
+ ret = -EIO;
+ goto out_unlock;
 }
 /* finish pending truncate */
 while (ci->i_truncate_pending) {
 spin_unlock(&ci->i_ceph_lock);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_lock(&inode->i_mutex);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
 __ceph_do_pending_vmtruncate(inode);
- if (!(need & CEPH_CAP_FILE_WR))
- mutex_unlock(&inode->i_mutex);
 spin_lock(&ci->i_ceph_lock);
 }
- if (need & CEPH_CAP_FILE_WR) {
+ have = __ceph_caps_issued(ci, &implemented);
+
+ if (have & need & CEPH_CAP_FILE_WR) {
 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
- if (endoff > ci->i_wanted_max_size) {
- *check_max = 1;
- ret = 1;
- }
- goto out;
+ doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
+ inode, ceph_vinop(inode), endoff, ci->i_max_size);
+ if (endoff > ci->i_requested_max_size)
+ ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
+ goto out_unlock;
 }
 /*
 * If a sync write is in progress, we must wait, so that we
 * can get a final snapshot value for size+mtime.
 */
 if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
- goto out;
+ doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
+ ceph_vinop(inode));
+ goto out_unlock;
 }
 }
- have = __ceph_caps_issued(ci, &implemented);
 if ((have & need) == need) {
 /* @@ -2107,25 +2873,96 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 * on transition from wanted -> needed caps. This is needed
 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
 * going before a prior buffered writeback happens.
+ *
+ * For RDCACHE|RD -> RD, there is no need to wait and we can
+ * just exclude the revoking caps and force to sync read. 
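+ * For example, if Fc is being revoked while the caller needs Fr
+ * and wants Fc, then Fc is simply left out of *got and the read
+ * falls back to a sync read instead of waiting for the revocation
+ * to finish.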
*/ int not = want & ~(have & need); int revoking = implemented & ~have; - dout("get_cap_refs %p have %s but not %s (revoking %s)\n", - inode, ceph_cap_string(have), ceph_cap_string(not), - ceph_cap_string(revoking)); - if ((revoking & not) == 0) { - *got = need | (have & want); - __take_cap_refs(ci, *got); + int exclude = revoking & not; + doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", + inode, ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(not), ceph_cap_string(revoking)); + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (flags & NON_BLOCKING) { + ret = -EAGAIN; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } + if ((have & want) == want) + *got = need | (want & ~exclude); + else + *got = need; + ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { - dout("get_cap_refs %p have %s needed %s\n", inode, - ceph_cap_string(have), ceph_cap_string(need)); + int session_readonly = false; + int mds_wanted; + if (ci->i_auth_cap && + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", + inode, ceph_vinop(inode), ceph_cap_string(need), + ci->i_auth_cap->mds); + ret = -EROFS; + goto out_unlock; + } + + if (ceph_inode_is_shutdown(inode)) { + doutc(cl, "%p %llx.%llx inode is shutdown\n", + inode, ceph_vinop(inode)); + ret = -ESTALE; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~mds_wanted) { + doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", + inode, ceph_vinop(inode), ceph_cap_string(need), + ceph_cap_string(mds_wanted)); + ret = -EUCLEAN; + goto out_unlock; + } + + doutc(cl, "%p %llx.%llx have %s need %s\n", inode, + ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(need)); } -out: +out_unlock: + + __ceph_touch_fmode(ci, mdsc, flags); + spin_unlock(&ci->i_ceph_lock); - dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(*got)); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); + + if (!ret) + ceph_update_cap_mis(&mdsc->metric); + else if (ret == 1) + ceph_update_cap_hit(&mdsc->metric); + + doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, + ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } @@ -2137,21 +2974,61 @@ out: static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? 
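 * (a write whose endoff reaches i_max_size cannot proceed until
 * the MDS grants a larger max_size, so we ask for one here up front)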
*/ spin_lock(&ci->i_ceph_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { - dout("write %p at large endoff %llu, req max_size\n", - inode, endoff); + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { + doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", + inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; - check = 1; } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; spin_unlock(&ci->i_ceph_lock); if (check) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); +} + +static inline int get_used_fmode(int caps) +{ + int fmode = 0; + if (caps & CEPH_CAP_FILE_RD) + fmode |= CEPH_FILE_MODE_RD; + if (caps & CEPH_CAP_FILE_WR) + fmode |= CEPH_FILE_MODE_WR; + return fmode; +} + +int ceph_try_get_caps(struct inode *inode, int need, int want, + bool nonblock, int *got) +{ + int ret, flags; + + BUG_ON(need & ~CEPH_CAP_FILE_RD); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_ANY_DIR_OPS)); + if (need) { + ret = ceph_pool_perm_check(inode, need); + if (ret < 0) + return ret; + } + + flags = get_used_fmode(need | want); + if (nonblock) + flags |= NON_BLOCKING; + + ret = try_get_cap_refs(inode, need, want, 0, flags, got); + /* three special error codes */ + if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) + ret = 0; + return ret; } /* @@ -2159,25 +3036,146 @@ static void check_max_size(struct inode *inode, loff_t endoff) * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. 
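 * (the retry loop in __ceph_get_caps() below turns -EFBIG into a
 * check_max_size() call and -EUCLEAN into a ceph_renew_caps() call
 * before trying again)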
*/
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
- loff_t endoff)
+int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
+ int want, loff_t endoff, int *got)
 {
- int check_max, ret, err;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ int ret, _got, flags;
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- check_max = 0;
- err = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want,
- got, endoff,
- &check_max, &err));
- if (err)
- ret = err;
- if (check_max)
- goto retry;
- return ret;
+ ret = ceph_pool_perm_check(inode, need);
+ if (ret < 0)
+ return ret;
+
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
+ fi->filp_gen != READ_ONCE(fsc->filp_gen))
+ return -EBADF;
+
+ flags = get_used_fmode(need | want);
+
+ while (true) {
+ flags &= CEPH_FILE_MODE_MASK;
+ if (vfs_inode_has_locks(inode))
+ flags |= CHECK_FILELOCK;
+ _got = 0;
+ ret = try_get_cap_refs(inode, need, want, endoff,
+ flags, &_got);
+ WARN_ON_ONCE(ret == -EAGAIN);
+ if (!ret) {
+#ifdef CONFIG_DEBUG_FS
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct cap_wait cw;
+#endif
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+#ifdef CONFIG_DEBUG_FS
+ cw.ino = ceph_ino(inode);
+ cw.tgid = current->tgid;
+ cw.need = need;
+ cw.want = want;
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_add(&cw.list, &mdsc->cap_wait_list);
+ spin_unlock(&mdsc->caps_list_lock);
+#endif
+
+ /* make sure used fmode not timeout */
+ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
+ add_wait_queue(&ci->i_cap_wq, &wait);
+
+ flags |= NON_BLOCKING;
+ while (!(ret = try_get_cap_refs(inode, need, want,
+ endoff, flags, &_got))) {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+
+ remove_wait_queue(&ci->i_cap_wq, &wait);
+ ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
+
+#ifdef CONFIG_DEBUG_FS
+ spin_lock(&mdsc->caps_list_lock);
+ list_del(&cw.list);
+ spin_unlock(&mdsc->caps_list_lock);
+#endif
+
+ if (ret == -EAGAIN)
+ continue;
+ }
+
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
+ fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
+ if (ret >= 0 && _got)
+ ceph_put_cap_refs(ci, _got);
+ return -EBADF;
+ }
+
+ if (ret < 0) {
+ if (ret == -EFBIG || ret == -EUCLEAN) {
+ int ret2 = ceph_wait_on_async_create(inode);
+ if (ret2 < 0)
+ return ret2;
+ }
+ if (ret == -EFBIG) {
+ check_max_size(inode, endoff);
+ continue;
+ }
+ if (ret == -EUCLEAN) {
+ /* session was killed, try renew caps */
+ ret = ceph_renew_caps(inode, flags);
+ if (ret == 0)
+ continue;
+ }
+ return ret;
+ }
+
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ ceph_has_inline_data(ci) &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(inode) > 0) {
+ struct page *page =
+ find_get_page(inode->i_mapping, 0);
+ if (page) {
+ bool uptodate = PageUptodate(page);
+
+ put_page(page);
+ if (uptodate)
+ break;
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding caps refs can cause deadlock. 
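+ * (the getattr may itself wait on caps that the MDS will not
+ * grant while these refs are still held)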
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
+ }
+ *got = _got;
+ return 0;
+}
+
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
+ int *got)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = file_inode(filp);
+
+ return __ceph_get_caps(inode, fi, need, want, endoff, got);
 }
 /* @@ -2187,10 +3185,41 @@ retry:
 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 {
 spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
+ ceph_take_cap_refs(ci, caps, false);
 spin_unlock(&ci->i_ceph_lock);
 }
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ if (!capsnap->need_flush &&
+ !capsnap->writing && !capsnap->dirty_pages) {
+ doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
+ ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
+ list_del(&capsnap->ci_item);
+ ceph_put_cap_snap(capsnap);
+ return 1;
+ }
+ return 0;
+}
+
+enum put_cap_refs_mode {
+ PUT_CAP_REFS_SYNC = 0,
+ PUT_CAP_REFS_ASYNC,
+};
+
 /*
 * Release cap refs.
 * @@ -2200,11 +3229,13 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 * If we are releasing a WR cap (from a sync write), finalize any affected
 * cap_snap, and wake up any waiters.
 */
-void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
+ enum put_cap_refs_mode mode)
 {
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
 int last = 0, put = 0, flushsnaps = 0, wake = 0;
- struct ceph_cap_snap *capsnap;
+ bool check_flushsnaps = false;
 spin_lock(&ci->i_ceph_lock);
 if (had & CEPH_CAP_PIN)
 @@ -2215,45 +3246,92 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 if (had & CEPH_CAP_FILE_CACHE)
 if (--ci->i_rdcache_ref == 0)
 last++;
+ if (had & CEPH_CAP_FILE_EXCL)
+ if (--ci->i_fx_ref == 0)
+ last++;
 if (had & CEPH_CAP_FILE_BUFFER) {
 if (--ci->i_wb_ref == 0) {
 last++;
+ /* put the ref held by ceph_take_cap_refs() */
 put++;
+ check_flushsnaps = true;
 }
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
 }
- if (had & CEPH_CAP_FILE_WR)
+ if (had & CEPH_CAP_FILE_WR) {
 if (--ci->i_wr_ref == 0) {
+ /*
+ * The Fb caps will always be taken and released
+ * together with the Fw caps. 
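+ * So by the time the last Fw ref is dropped here,
+ * i_wb_ref must already be zero, which is what the
+ * WARN_ON_ONCE just below asserts.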
+ */ + WARN_ON_ONCE(ci->i_wb_ref); + last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + check_flushsnaps = true; + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) + ceph_change_snap_realm(inode, NULL); } + } + if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(ci, capsnap)) + /* put the ref held by ceph_queue_cap_snap() */ + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } spin_unlock(&ci->i_ceph_lock); - dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), - last ? " last" : "", put ? " put" : ""); + doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), + ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last && !flushsnaps) - ceph_check_caps(ci, 0, NULL); - else if (flushsnaps) - ceph_flush_snaps(ci); + switch (mode) { + case PUT_CAP_REFS_SYNC: + if (last) + ceph_check_caps(ci, 0); + else if (flushsnaps) + ceph_flush_snaps(ci, NULL); + break; + case PUT_CAP_REFS_ASYNC: + if (last) + ceph_queue_check_caps(inode); + else if (flushsnaps) + ceph_queue_flush_snaps(inode); + break; + default: + break; + } if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); +} + +void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); +} + /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. @@ -2264,236 +3342,390 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { - struct inode *inode = &ci->vfs_inode; - int last = 0; - int complete_capsnap = 0; - int drop_capsnap = 0; - int found = 0; - struct ceph_cap_snap *capsnap = NULL; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_cap_snap *capsnap = NULL, *iter; + int put = 0; + bool last = false; + bool flush_snaps = false; + bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; - last = !ci->i_wrbuffer_ref; + if (ci->i_wrbuffer_ref == 0) { + last = true; + put++; + } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } - dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", - inode, - ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, - ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, - last ? 
" LAST" : ""); + doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", + inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, + ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, + ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->context == snapc) { - found = 1; + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->context == snapc) { + capsnap = iter; break; } } - BUG_ON(!found); + + if (!capsnap) { + /* + * The capsnap should already be removed when removing + * auth cap in the case of a forced unmount. + */ + WARN_ON_ONCE(ci->i_auth_cap); + goto unlock; + } + capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { - complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; - } - dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", - inode, capsnap, capsnap->context->seq, - ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, - ci->i_wrbuffer_ref, capsnap->dirty_pages, - last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); + complete_capsnap = true; + if (!capsnap->writing) { + if (ceph_try_drop_cap_snap(ci, capsnap)) { + put++; + } else { + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + flush_snaps = true; + } + } } + doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", + inode, ceph_vinop(inode), capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? " (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : ""); } +unlock: spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - iput(inode); - } else if (complete_capsnap) { - ceph_flush_snaps(ci); - wake_up_all(&ci->i_cap_wq); + ceph_check_caps(ci, 0); + } else if (flush_snaps) { + ceph_flush_snaps(ci, NULL); } - if (drop_capsnap) + if (complete_capsnap) + wake_up_all(&ci->i_cap_wq); + while (put-- > 0) { iput(inode); + } +} + +/* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct dentry *dn, *prev = NULL; + + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); } +struct cap_extra_info { + struct ceph_string *pool_ns; + /* inline data */ + u64 inline_version; + void *inline_data; + u32 inline_len; + /* dirstat */ + bool dirstat_valid; + u64 nfiles; + u64 nsubdirs; + u64 change_attr; + /* currently issued */ + int issued; + struct timespec64 btime; + u8 *fscrypt_auth; + u32 fscrypt_auth_len; + u64 fscrypt_file_size; +}; + /* * Handle a cap GRANT message from the MDS. 
(Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(ci->i_ceph_lock) + struct ceph_mds_caps *grant, + struct ceph_buffer *xattr_buf, + struct cap_extra_info *extra_info) + __releases(ci->i_ceph_lock) + __releases(session->s_mdsc->snap_rwsem) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); - struct timespec mtime, atime, ctime; - int check_caps = 0; - int wake = 0; - int writeback = 0; - int revoked_rdcache = 0; - int queue_invalidate = 0; - - dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", - inode, cap, mds, seq, ceph_cap_string(newcaps)); - dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, - inode->i_size); + unsigned char check_caps = 0; + bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); + bool wake = false; + bool writeback = false; + bool queue_trunc = false; + bool queue_invalidate = false; + bool deleted_inode = false; + bool fill_inline = false; + bool revoke_wait = false; + int flags = 0; + + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, + ceph_vinop(inode), cap, session->s_mds, seq, + ceph_cap_string(newcaps)); + doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, + max_size, i_size_read(inode)); + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - queue_invalidate = 1; + queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } } - /* side effects now are allowed */ + if (was_stale) + cap->issued = cap->implemented = CEPH_CAP_PIN; - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. 
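The ordering test just below leans on serial-number arithmetic. As a sketch (an assumed definition, matching how the helper is used here), the comparison is conventionally written so that it stays correct even when the u32 sequence counter wraps:

	static inline int ceph_seq_cmp(u32 a, u32 b)
	{
		return (int)(a - b);	/* <0 iff a was issued before b */
	}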
So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } - cap->cap_gen = session->s_cap_gen; + /* side effects now are allowed */ + cap->cap_gen = atomic_read(&session->s_cap_gen); + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(grant->mode); + inode_set_max_iversion_raw(inode, extra_info->change_attr); + + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { + umode_t mode = le32_to_cpu(grant->mode); + + if (inode_wrong_type(inode, mode)) + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + else + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); - dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - from_kuid(&init_user_ns, inode->i_uid), - from_kgid(&init_user_ns, inode->i_gid)); + ci->i_btime = extra_info->btime; + doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, + ceph_vinop(inode), inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || + memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, + ci->fscrypt_auth_len)) + pr_warn_ratelimited_client(cl, + "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", + ci->fscrypt_auth_len, + extra_info->fscrypt_auth_len); +#endif } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((newcaps & CEPH_CAP_LINK_SHARED) && + (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0) + deleted_inode = true; + } - if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { + if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && + grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { - dout(" got new xattrs v%llu on %p len %d\n", - version, inode, len); + doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", + version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); } } - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - /* max size increase? */ - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + if (newcaps & CEPH_CAP_ANY_RD) { + struct timespec64 mtime, atime, ctime; + /* ctime/mtime/atime? 
*/ + ceph_decode_timespec64(&mtime, &grant->mtime); + ceph_decode_timespec64(&atime, &grant->atime); + ceph_decode_timespec64(&ctime, &grant->ctime); + ceph_fill_file_time(inode, extra_info->issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { + ci->i_files = extra_info->nfiles; + ci->i_subdirs = extra_info->nsubdirs; + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); + + if (ci->i_layout.pool_id != old_pool || + extra_info->pool_ns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + extra_info->pool_ns = old_ns; + + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, extra_info->issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + } + + if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { + if (max_size != ci->i_max_size) { + doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, + max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = true; } - wake = 1; } /* check cap bits */ wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); - dout(" my wanted = %s, used = %s, dirty %s\n", - ceph_cap_string(wanted), - ceph_cap_string(used), - ceph_cap_string(dirty)); - if (wanted != le32_to_cpu(grant->wanted)) { - dout("mds wanted %s -> %s\n", - ceph_cap_string(le32_to_cpu(grant->wanted)), - ceph_cap_string(wanted)); - /* imported cap may not have correct mds_wanted */ - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) - check_caps = 1; - } - - cap->seq = seq; + doutc(cl, " my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), ceph_cap_string(used), + ceph_cap_string(dirty)); - /* file layout may have changed */ - ci->i_layout = grant->layout; + if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && + (wanted & ~(cap->mds_wanted | newcaps))) { + /* + * If mds is importing cap, prior cap messages that update + * 'wanted' may get dropped by mds (migrate seq mismatch). + * + * We don't send cap message to update 'wanted' if what we + * want are already issued. If mds revokes caps, cap message + * that releases caps also tells mds what we want. But if + * caps got revoked by mds forcedly (session stale). We may + * haven't told mds what we want. + */ + check_caps = 1; + } /* revocation, grant, or no-op? 
*/ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; - dout("revocation: %s -> %s (revoking %s)\n", - ceph_cap_string(cap->issued), - ceph_cap_string(newcaps), - ceph_cap_string(revoking)); - if (revoking & used & CEPH_CAP_FILE_BUFFER) - writeback = 1; /* initiate writeback; will delay ack */ - else if (revoking == CEPH_CAP_FILE_CACHE && - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - queue_invalidate) - ; /* do nothing yet, invalidation will be queued */ - else if (cap == ci->i_auth_cap) + doutc(cl, "revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (S_ISREG(inode->i_mode) && + (revoking & used & CEPH_CAP_FILE_BUFFER)) { + writeback = true; /* initiate writeback; will delay ack */ + revoke_wait = true; + } else if (queue_invalidate && + revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) { + revoke_wait = true; /* do nothing yet, invalidation will be queued */ + } else if (cap == ci->i_auth_cap) { check_caps = 1; /* check auth cap only */ - else + } else { check_caps = 2; /* check all caps */ + } + /* If there is new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { - dout("caps unchanged: %s -> %s\n", - ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + doutc(cl, "caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); } else { - dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), - ceph_cap_string(newcaps)); + doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? */ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) @@ -2503,11 +3735,53 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ - wake = 1; + wake = true; } BUG_ON(cap->issued & ~cap->implemented); + /* don't let check_caps skip sending a response to MDS for revoke msgs */ + if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { + cap->mds_wanted = 0; + flags |= CHECK_CAPS_FLUSH_FORCE; + if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ + } + + if (extra_info->inline_version > 0 && + extra_info->inline_version >= ci->i_inline_version) { + ci->i_inline_version = extra_info->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap) { + if (newcaps & ~extra_info->issued) + wake = true; + + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } + + ceph_kick_flushing_inode_caps(session, ci); + } + up_read(&session->s_mdsc->snap_rwsem); + } spin_unlock(&ci->i_ceph_lock); + + if (fill_inline) + ceph_fill_inline_data(inode, NULL, extra_info->inline_data, + extra_info->inline_len); + + if (queue_trunc) + ceph_queue_vmtruncate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2517,16 +3791,16 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); 
if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); + mutex_unlock(&session->s_mutex); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, - session); + ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); - else - mutex_unlock(&session->s_mutex); + ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL); } /* @@ -2540,47 +3814,83 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap_flush *cf, *tmp_cf; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; - int drop = 0; - int i; + bool drop = false; + bool wake_ci = false; + bool wake_mdsc = false; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { + /* Is this the one that was flushed? */ + if (cf->tid == flush_tid) + cleaned = cf->caps; - dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," - " flushing %s -> %s\n", - inode, session->s_mds, seq, ceph_cap_string(dirty), - ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), - ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + /* Is this a capsnap? */ + if (cf->is_capsnap) + continue; + + if (cf->tid <= flush_tid) { + /* + * An earlier or current tid. The FLUSH_ACK should + * represent a superset of this flush's caps. + */ + wake_ci |= __detach_cap_flush_from_ci(ci, cf); + list_add_tail(&cf->i_list, &to_remove); + } else { + /* + * This is a later one. Any caps in it are still dirty + * so don't count them as cleaned. 
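A worked example: with cap flushes pending at tids 3, 5 and 8, a FLUSH_ACK carrying flush_tid == 5 detaches tids 3 and 5, since the ack covers every earlier-or-equal tid, while tid 8 stays queued and its caps are masked out of 'cleaned' just below.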
+ */ + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } + + doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", + inode, ceph_vinop(inode), session->s_mds, seq, + ceph_cap_string(dirty), ceph_cap_string(cleaned), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, i_list) + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); + if (ci->i_flushing_caps == 0) { - list_del_init(&ci->i_flushing_item); - if (!list_empty(&session->s_cap_flushing)) - dout(" mds%d still flushing cap on %p\n", - session->s_mds, - &list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item)->vfs_inode); + if (list_empty(&ci->i_cap_flush_list)) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) { + struct inode *inode = + &list_first_entry(&session->s_cap_flushing, + struct ceph_inode_info, + i_flushing_item)->netfs.inode; + doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", + session->s_mds, inode, ceph_vinop(inode)); + } + } mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); - dout(" inode %p now !flushing\n", inode); + doutc(cl, " %p %llx.%llx now !flushing\n", inode, + ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { - dout(" inode %p now clean\n", inode); + doutc(cl, " %p %llx.%llx now clean\n", inode, + ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); - drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + drop = true; + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2590,14 +3900,65 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up_all(&ci->i_cap_wq); out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, i_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + bool ret; + + lockdep_assert_held(&ci->i_ceph_lock); + + doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, + inode, ceph_vinop(inode), ci); + + list_del_init(&capsnap->ci_item); + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); + if (wake_ci) + *wake_ci = ret; + + spin_lock(&mdsc->cap_dirty_lock); + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); + if (wake_mdsc) + *wake_mdsc = ret; + spin_unlock(&mdsc->cap_dirty_lock); +} + +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + lockdep_assert_held(&ci->i_ceph_lock); + + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); +} + /* * 
Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. @@ -2609,39 +3970,46 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - int drop = 0; + struct ceph_cap_snap *capsnap = NULL, *iter; + bool wake_ci = false; + bool wake_mdsc = false; - dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", - inode, ci, session->s_mds, follows); + doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, + ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->flush_tid != flush_tid) { - dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->flush_tid); + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { + doutc(cl, " cap_snap %p follows %lld " + "tid %lld != %lld\n", iter, + follows, flush_tid, + iter->cap_flush.tid); break; } - WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing %p cap_snap %p follows %lld\n", - inode, capsnap, follows); - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - drop = 1; + capsnap = iter; break; } else { - dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + doutc(cl, " skipping cap_snap %p follows %lld\n", + iter, iter->follows); } } + if (capsnap) + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (drop) + + if (capsnap) { + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); iput(inode); + } } /* @@ -2649,12 +4017,13 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, * * caller hold s_mutex. */ -static void handle_cap_trunc(struct inode *inode, +static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, - struct ceph_mds_session *session) - __releases(ci->i_ceph_lock) + struct ceph_mds_session *session, + struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); @@ -2663,18 +4032,25 @@ static void handle_cap_trunc(struct inode *inode, int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); - int queue_trunc = 0; + bool queue_trunc = false; + + lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; - dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", - inode, mds, seq, truncate_size, truncate_seq); + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). 
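Concretely (a hedged example, assuming the usual 4 KiB fscrypt block): a 10-byte encrypted file is stored as one full crypto block, so the MDS-tracked size reads 4096 while the true 10-byte length travels in fscrypt_file_size; only size == 0 is taken at face value.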
+ */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", + inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&ci->i_ceph_lock); - - if (queue_trunc) - ceph_queue_vmtruncate(inode); + return queue_trunc; } /* @@ -2686,124 +4062,261 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); + u64 t_cap_id; + u32 t_issue_seq, t_mseq; + int target, issued; int mds = session->s_mds; - unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_issue_seq = le32_to_cpu(ph->issue_seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_issue_seq = t_mseq = 0; + target = -1; + } + doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), target, t_issue_seq, t_mseq); +retry: + down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + ceph_remove_cap(mdsc, cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. 
+ */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + if (issued != cap->implemented) + pr_err_ratelimited_client(cl, "issued != implemented: " + "%p %llx.%llx mds%d seq %d mseq %d" + " issued %s implemented %s\n", + inode, ceph_vinop(inode), mds, + cap->seq, cap->mseq, + ceph_cap_string(issued), + ceph_cap_string(cap->implemented)); + + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id == t_cap_id && + ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) { + doutc(cl, " updating import cap %p mds%d\n", tcap, + target); + tcap->cap_id = t_cap_id; + tcap->seq = t_issue_seq - 1; + tcap->issue_seq = t_issue_seq - 1; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) { + ci->i_auth_cap = tcap; + change_auth_cap_ses(ci, tcap->session); } + } + ceph_remove_cap(mdsc, cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export target */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + tcap = new_cap; + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, + t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap); + + ceph_remove_cap(mdsc, cap, false); + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + mutex_lock(&session->s_mutex); } - /* else, we already released it */ + goto retry; +out_unlock: spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. 
acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + u32 piseq = 0; + u32 pmseq = 0; + int peer; + + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + piseq = le32_to_cpu(ph->issue_seq); + pmseq = le32_to_cpu(ph->mseq); + } else { + p_cap_id = 0; + peer = -1; + } - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n", + ceph_vinop(inode), peer, piseq, pmseq); +retry: + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + spin_lock(&ci->i_ceph_lock); + goto retry; } - spin_unlock(&mdsc->cap_dirty_lock); + cap = new_cap; } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); - } - - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } - /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? 
__get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + doutc(cl, " remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != piseq || + ocap->mseq != pmseq)) { + pr_err_ratelimited_client(cl, "mismatched seq/mseq: " + "%p %llx.%llx mds%d seq %d mseq %d" + " importer mds%d has peer seq %d mseq %d\n", + inode, ceph_vinop(inode), peer, + ocap->seq, ocap->mseq, mds, piseq, pmseq); + } + ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } + + *old_issued = issued; + *target_cap = cap; +} + +#ifdef CONFIG_FS_ENCRYPTION +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); + if (extra->fscrypt_auth_len) { + ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); + extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, + GFP_KERNEL); + if (!extra->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, extra->fscrypt_auth, + extra->fscrypt_auth_len, bad); + } + + ceph_decode_32_safe(p, end, len, bad); + if (len >= sizeof(u64)) { + ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); + len -= sizeof(u64); + } + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; } +#else +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + /* Don't care about these fields unless we're encryption-capable */ + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#endif /* * Handle a caps message from the MDS. @@ -2815,99 +4328,200 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->fsc->sb; + struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; - int mds = session->s_mds; + struct ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm = NULL; int op; - u32 seq, mseq; + int msg_version = le16_to_cpu(msg->hdr.version); + u32 seq, mseq, issue_seq; struct ceph_vino vino; - u64 cap_id; - u64 size, max_size; - u64 tid; void *snaptrace; size_t snaptrace_len; - void *flock; - u32 flock_len; - int open_target_sessions = 0; + void *p, *end; + struct cap_extra_info extra_info = {}; + bool queue_trunc; + bool close_sessions = false; + bool do_cap_release = false; - dout("handle_caps from mds%d\n", mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; /* decode */ - tid = le64_to_cpu(msg->hdr.tid); + end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - cap_id = le64_to_cpu(h->cap_id); seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); - size = le64_to_cpu(h->size); - max_size = le64_to_cpu(h->max_size); + issue_seq = le32_to_cpu(h->issue_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; - if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + if (msg_version >= 2) { + u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); - flock = p; - } 
else { - flock = NULL; - flock_len = 0; + if (p + flock_len > end) + goto bad; + p += flock_len; } - mutex_lock(&session->s_mutex); - session->s_seq++; - dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, - (unsigned)seq); + if (msg_version >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + p += sizeof(*peer); + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + + if (msg_version >= 4) { + ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); + ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); + if (p + extra_info.inline_len > end) + goto bad; + extra_info.inline_data = p; + p += extra_info.inline_len; + } + + if (msg_version >= 5) { + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + u32 epoch_barrier; + + ceph_decode_32_safe(&p, end, epoch_barrier, bad); + ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); + } + + if (msg_version >= 8) { + u32 pool_ns_len; + + /* version >= 6 */ + ceph_decode_skip_64(&p, end, bad); // flush_tid + /* version >= 7 */ + ceph_decode_skip_32(&p, end, bad); // caller_uid + ceph_decode_skip_32(&p, end, bad); // caller_gid + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + if (pool_ns_len > 0) { + ceph_decode_need(&p, end, pool_ns_len, bad); + extra_info.pool_ns = + ceph_find_or_create_string(p, pool_ns_len); + p += pool_ns_len; + } + } + + if (msg_version >= 9) { + struct ceph_timespec *btime; + + if (p + sizeof(*btime) > end) + goto bad; + btime = p; + ceph_decode_timespec64(&extra_info.btime, btime); + p += sizeof(*btime); + ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); + } + + if (msg_version >= 11) { + /* version >= 10 */ + ceph_decode_skip_32(&p, end, bad); // flags + /* version >= 11 */ + extra_info.dirstat_valid = true; + ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); + ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); + } - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); + if (msg_version >= 12) { + if (parse_fscrypt_fields(&p, end, &extra_info)) + goto bad; + } /* lookup ino */ - inode = ceph_find_inode(sb, vino); - ci = ceph_inode(inode); - dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, - vino.snap, inode); + inode = ceph_find_inode(mdsc->fsc->sb, vino); + doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n", + session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, + seq, issue_seq, mseq); + + mutex_lock(&session->s_mutex); + if (!inode) { - dout(" i don't have ino %llx\n", vino.ino); + doutc(cl, " i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + switch (op) { + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + break; + default: + break; + } goto flush_cap_releases; } + ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: - handle_cap_flushsnap_ack(inode, tid, h, session); + handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session); goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, - snaptrace, snaptrace_len); + realm = NULL; + if (snaptrace_len) { + 
down_write(&mdsc->snap_rwsem); + if (ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm)) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + goto done; + } + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } + spin_lock(&ci->i_ceph_lock); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &extra_info.issued); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); + if (realm) + ceph_put_snap_realm(mdsc, realm); + goto done_unlocked; } /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); - cap = __get_cap_for_mds(ceph_inode(inode), mds); + cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { - dout(" no cap on %p ino %llx.%llx from mds%d\n", - inode, ceph_ino(inode), ceph_snap(inode), mds); + doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), + session->s_mds); spin_unlock(&ci->i_ceph_lock); + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + break; + default: + break; + } goto flush_cap_releases; } @@ -2915,100 +4529,228 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - case CEPH_CAP_OP_IMPORT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &extra_info.issued); + extra_info.issued |= __ceph_caps_dirty(ci); + handle_cap_grant(inode, session, cap, + h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: - handle_cap_flush_ack(inode, tid, h, session, cap); + handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), + h, session, cap); break; case CEPH_CAP_OP_TRUNC: - handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session, + &extra_info); + spin_unlock(&ci->i_ceph_lock); + if (queue_trunc) + ceph_queue_vmtruncate(inode); break; default: spin_unlock(&ci->i_ceph_lock); - pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, - ceph_cap_op_name(op)); + pr_err_client(cl, "unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); } - goto done; +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); + + ceph_put_string(extra_info.pool_ns); + + /* Defer closing the sessions after s_mutex lock being released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); + + kfree(extra_info.fscrypt_auth); + return; flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session); - ceph_send_cap_releases(mdsc, session); - -done: - mutex_unlock(&session->s_mutex); -done_unlocked: - if (inode) - iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); - return; + if (do_cap_release) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = le64_to_cpu(h->cap_id); + cap->mseq = mseq; + cap->seq = seq; + cap->issue_seq = seq; + spin_lock(&session->s_cap_lock); + __ceph_queue_cap_release(session, cap); + spin_unlock(&session->s_cap_lock); + } + ceph_flush_session_cap_releases(mdsc, session); + goto done; bad: - pr_err("ceph_handle_caps: corrupt message\n"); + pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); - return; + goto out; } /* * Delayed work handler to process end of delayed cap release LRU list. 
+ * + * If new caps are added to the list while processing it, these won't get + * processed in this run. In this case, the ci->i_hold_caps_max will be + * returned so that the work can be scheduled accordingly. */ -void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; + struct inode *inode; struct ceph_inode_info *ci; - int flags = CHECK_CAPS_NODELAY; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; + unsigned long delay_max = opt->caps_wanted_delay_max * HZ; + unsigned long loop_start = jiffies; + unsigned long delay = 0; - dout("check_delayed_caps\n"); - while (1) { - spin_lock(&mdsc->cap_delay_lock); - if (list_empty(&mdsc->cap_delay_list)) - break; + doutc(cl, "begin\n"); + spin_lock(&mdsc->cap_delay_lock); + while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); + if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { + doutc(cl, "caps added recently. Exiting loop"); + delay = ci->i_hold_caps_max; + break; + } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; list_del_init(&ci->i_cap_delay_list); - spin_unlock(&mdsc->cap_delay_lock); - dout("check_delayed_caps on %p\n", &ci->vfs_inode); - ceph_check_caps(ci, flags, NULL); + + inode = igrab(&ci->netfs.inode); + if (inode) { + spin_unlock(&mdsc->cap_delay_lock); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); + ceph_check_caps(ci, 0); + iput(inode); + spin_lock(&mdsc->cap_delay_lock); + } + + /* + * Make sure too many dirty caps or general + * slowness doesn't block mdsc delayed work, + * preventing send_renew_caps() from running. 
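Since the returned i_hold_caps_max is an absolute jiffies stamp, the caller can re-arm the delayed work from it, roughly as in this sketch (the field and helper names here are illustrative, not taken from this hunk):

	unsigned long delay = ceph_check_delayed_caps(mdsc);

	if (delay)	/* assumes the returned stamp is still in the future */
		schedule_delayed_work(&mdsc->delayed_work, delay - jiffies);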
+ */ + if (time_after_eq(jiffies, loop_start + 5 * HZ)) + break; } spin_unlock(&mdsc->cap_delay_lock); + doutc(cl, "done\n"); + + return delay; } /* * Flush all dirty caps to the mds */ -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +static void flush_dirty_session_caps(struct ceph_mds_session *s) { + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; - dout("flush_dirty_caps\n"); + doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); - while (!list_empty(&mdsc->cap_dirty)) { - ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + while (!list_empty(&s->s_cap_dirty)) { + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); - dout("flush_dirty_caps %p\n", inode); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + ceph_wait_on_async_create(inode); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); - dout("flush_dirty_caps done\n"); + doutc(cl, "done\n"); +} + +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); +} + +/* + * Flush all cap releases to the mds + */ +static void flush_cap_releases(struct ceph_mds_session *s) +{ + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&s->s_cap_lock); + if (s->s_num_cap_releases) + ceph_flush_session_cap_releases(mdsc, s); + spin_unlock(&s->s_cap_lock); + doutc(cl, "done\n"); + +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc) +{ + ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true); +} + +void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode) +{ + unsigned long now = jiffies; + if (fmode & CEPH_FILE_MODE_RD) + ci->i_last_rd = now; + if (fmode & CEPH_FILE_MODE_WR) + ci->i_last_wr = now; + /* queue periodic check */ + if (fmode && + __ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list)) + __cap_delay_requeue(mdsc, ci); +} + +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); + int bits = (fmode << 1) | 1; + bool already_opened = false; + int i; + + if (count == 1) + atomic64_inc(&mdsc->metric.opened_files); + + spin_lock(&ci->i_ceph_lock); + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + /* + * If any of the mode ref is larger than 0, + * that means it has been already opened by + * others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + already_opened = true; + + if (bits & (1 << i)) + ci->i_nr_by_mode[i] += count; + } + + if (!already_opened) + percpu_counter_inc(&mdsc->metric.opened_inodes); + spin_unlock(&ci->i_ceph_lock); } /* @@ -3016,21 +4758,75 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) * we may need to release capabilities to the MDS (or schedule * their delayed release). 
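The (fmode << 1) | 1 mapping in ceph_get_fmode() above, and in ceph_put_fmode() below, folds the implicit PIN reference into bit 0. Worked example: fmode = CEPH_FILE_MODE_RD | CEPH_FILE_MODE_WR (3) gives bits = 0b0111, so the PIN, RD and WR slots of i_nr_by_mode are each adjusted by count, while the LAZY slot is left alone.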
*/ -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct inode *inode = &ci->vfs_inode; - int last = 0; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); + int bits = (fmode << 1) | 1; + bool is_closed = true; + int i; + + if (count == 1) + atomic64_dec(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); - dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, - ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); - BUG_ON(ci->i_nr_by_mode[fmode] == 0); - if (--ci->i_nr_by_mode[fmode] == 0) - last++; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) { + BUG_ON(ci->i_nr_by_mode[i] < count); + ci->i_nr_by_mode[i] -= count; + } + + /* + * If any of the mode ref is not 0 after + * decreased, that means it is still opened + * by others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + is_closed = false; + } + + if (is_closed) + percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); +} - if (last && ci->i_vino.snap == CEPH_NOSNAP) - ceph_check_caps(ci, 0, NULL); +/* + * For a soon-to-be unlinked file, drop the LINK caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +int ceph_drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + + if (__ceph_caps_dirty(ci)) { + struct ceph_mds_client *mdsc = + ceph_inode_to_fs_client(inode)->mdsc; + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, + ceph_vinop(inode)); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add_tail(&ci->i_cap_delay_list, + &mdsc->cap_unlink_delay_list); + spin_unlock(&mdsc->cap_delay_lock); + + /* + * Fire the work immediately, because the MDS maybe + * waiting for caps release. 
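As a usage sketch (hedged; the unlink caller lives outside this hunk, in dir.c), the mask computed here is typically stashed on the outgoing MDS request so the cap release gets encoded alongside the unlink:

	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);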
+ */ + ceph_queue_cap_unlink_work(mdsc); + } + } + spin_unlock(&ci->i_ceph_lock); + return drop; } /* @@ -3045,6 +4841,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; @@ -3054,68 +4851,96 @@ int ceph_encode_inode_release(void **p, struct inode *inode, used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); - dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", - inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), - ceph_cap_string(unless)); + doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", + inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), + ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { - if (force || - ((cap->issued & drop) && - (cap->issued & unless) == 0)) { - if ((cap->issued & drop) && - (cap->issued & unless) == 0) { + unless &= cap->issued; + if (unless) { + if (unless & CEPH_CAP_AUTH_EXCL) + drop &= ~CEPH_CAP_AUTH_SHARED; + if (unless & CEPH_CAP_LINK_EXCL) + drop &= ~CEPH_CAP_LINK_SHARED; + if (unless & CEPH_CAP_XATTR_EXCL) + drop &= ~CEPH_CAP_XATTR_SHARED; + if (unless & CEPH_CAP_FILE_EXCL) + drop &= ~CEPH_CAP_FILE_SHARED; + } + + if (force || (cap->issued & drop)) { + if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) - wanted |= cap->mds_wanted; - dout("encode_inode_release %p cap %p " - "%s -> %s, wanted %s -> %s\n", inode, cap, - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(wanted)); + doutc(cl, "%p %llx.%llx cap %p %s -> %s, " + "wanted %s -> %s\n", inode, + ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; + if (cap == ci->i_auth_cap && + !(wanted & CEPH_CAP_ANY_FILE_WR)) + ci->i_requested_max_size = 0; } else { - dout("encode_inode_release %p cap %p %s" - " (force)\n", inode, cap, - ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p %s (force)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); - rel->issue_seq = cpu_to_le32(cap->issue_seq), + rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; *p += sizeof(*rel); ret = 1; } else { - dout("encode_inode_release %p cap %p %s\n", - inode, cap, ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } +/** + * ceph_encode_dentry_release - encode a dentry release into an outgoing request + * @p: outgoing request buffer + * @dentry: dentry to release + * @dir: dir to release it from + * @mds: mds that we're speaking to + * @drop: caps being dropped + * @unless: unless we have these caps + * + * Encode a 
dentry release into an outgoing request buffer. Returns 1 if the + * thing was released, or a negative error code otherwise. + */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, + struct inode *dir, int mds, int drop, int unless) { - struct inode *dir = dentry->d_parent->d_inode; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_client *cl; int force = 0; int ret; + /* This shouldn't happen */ + BUG_ON(!dir); + /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it @@ -3129,16 +4954,145 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); + cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { - dout("encode_dentry_release %p mds%d seq %d\n", - dentry, mds, (int)di->lease_seq); - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; + int len = dentry->d_name.len; + doutc(cl, "%p mds%d seq %d\n", dentry, mds, + (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + memcpy(*p, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { + len = ceph_encode_encrypted_dname(dir, *p, len); + if (len < 0) + return len; + } + rel->dname_len = cpu_to_le32(len); + *p += len; + } else { + spin_unlock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return ret; } + +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", + ci, inode, ceph_vinop(inode)); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) +{ + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client *cl = fsc->client; + struct ceph_inode_info *ci = ceph_inode(inode); + bool is_auth; + bool dirty_dropped = false; + int iputs = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", + cap, ci, inode, ceph_vinop(inode)); + + is_auth = (cap == ci->i_auth_cap); + __ceph_remove_cap(cap, false); + if (is_auth) { + struct ceph_cap_flush *cf; + + if (ceph_inode_is_shutdown(inode)) { + if (inode->i_data.nrpages > 0) + *invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } + + spin_lock(&mdsc->cap_dirty_lock); + + /* trash all of the cap flushes for this inode */ + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_del_init(&cf->g_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if 
(!list_empty(&ci->i_dirty_item)) { + pr_warn_ratelimited_client(cl, + " dropping dirty %s state for %p %llx.%llx\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_vinop(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + dirty_dropped = true; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_warn_ratelimited_client(cl, + " dropping dirty+flushing %s state for %p %llx.%llx\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_vinop(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + dirty_dropped = true; + } + spin_unlock(&mdsc->cap_dirty_lock); + + if (dirty_dropped) { + mapping_set_error(inode->i_mapping, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited_client(cl, + " dropping file locks for %p %llx.%llx\n", + inode, ceph_vinop(inode)); + } + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + cf = ci->i_prealloc_cap_flush; + ci->i_prealloc_cap_flush = NULL; + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (!list_empty(&ci->i_cap_snaps)) + iputs = remove_capsnaps(mdsc, inode); + } + if (dirty_dropped) + ++iputs; + return iputs; +} diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index bdce8b1fbd06..6f67d5b884a0 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Ceph 'frag' type */ diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 000000000000..0ea4db650f85 --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,604 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The base64 encode/decode code was copied from fscrypt: + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + */ +#include <linux/ceph/ceph_debug.h> +#include <linux/xattr.h> +#include <linux/fscrypt.h> +#include <linux/ceph/striper.h> + +#include "super.h" +#include "mds_client.h" +#include "crypto.h" + +static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; + u32 ctxlen; + + /* Non-existent or too short? */ + if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) + return -ENOBUFS; + + /* Some format we don't recognize? 
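(i.e., a cfa_version other than CEPH_FSCRYPT_AUTH_VERSION) 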
*/ + if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) + return -ENOBUFS; + + ctxlen = le32_to_cpu(cfa->cfa_blob_len); + if (len < ctxlen) + return -ERANGE; + + memcpy(ctx, cfa->cfa_blob, ctxlen); + return ctxlen; +} + +static int ceph_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + int ret; + struct iattr attr = { }; + struct ceph_iattr cia = { }; + struct ceph_fscrypt_auth *cfa; + + WARN_ON_ONCE(fs_data); + + if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) + return -EINVAL; + + cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); + if (!cfa) + return -ENOMEM; + + cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + cfa->cfa_blob_len = cpu_to_le32(len); + memcpy(cfa->cfa_blob, ctx, len); + + cia.fscrypt_auth = cfa; + + ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia); + if (ret == 0) + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + kfree(cia.fscrypt_auth); + return ret; +} + +static bool ceph_crypt_empty_dir(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return ci->i_rsubdirs + ci->i_rfiles == 1; +} + +static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) +{ + return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy; +} + +static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), + .needs_bounce_pages = 1, + .get_context = ceph_crypt_get_context, + .set_context = ceph_crypt_set_context, + .get_dummy_policy = ceph_get_dummy_policy, + .empty_dir = ceph_crypt_empty_dir, +}; + +void ceph_fscrypt_set_ops(struct super_block *sb) +{ + fscrypt_set_ops(sb, &ceph_fscrypt_ops); +} + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ + fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); +} + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + int ret, ctxsize; + bool encrypted = false; + struct ceph_inode_info *ci = ceph_inode(inode); + + ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (ret) + return ret; + if (!encrypted) + return 0; + + as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); + if (!as->fscrypt_auth) + return -ENOMEM; + + ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, + inode); + if (ctxsize < 0) + return ctxsize; + + as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); + + WARN_ON_ONCE(ci->fscrypt_auth); + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); + ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, + GFP_KERNEL); + if (!ci->fscrypt_auth) + return -ENOMEM; + + inode->i_flags |= S_ENCRYPTED; + + return 0; +} + +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as) +{ + swap(req->r_fscrypt_auth, as->fscrypt_auth); +} + +/* + * User-created snapshots can't start with '_'. Snapshots that start with this + * character are special (hint: these aren't real snapshots) and use the + * following format: + * + * _<SNAPSHOT-NAME>_<INODE-NUMBER> + * + * where: + * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted, + * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot + * + * This function parses these snapshot names and returns the inode + * <INODE-NUMBER>. 
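For example (illustrative name), '_mysnap_1099511627776' + * refers to the snapshot 'mysnap' on the inode numbered 1099511627776. 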
'name_len' will also be set with the <SNAPSHOT-NAME> + * length. + */ +static struct inode *parse_longname(const struct inode *parent, + const char *name, int *name_len) +{ + struct ceph_client *cl = ceph_inode_to_client(parent); + struct inode *dir = NULL; + struct ceph_vino vino = { .snap = CEPH_NOSNAP }; + char *name_end, *inode_number; + int ret = -EIO; + /* NUL-terminate */ + char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL); + if (!str) + return ERR_PTR(-ENOMEM); + /* Skip initial '_' (don't advance str itself: __free(kfree) must be + handed the original allocation) */ + name = str + 1; + name_end = strrchr(name, '_'); + if (!name_end) { + doutc(cl, "failed to parse long snapshot name: %s\n", name); + return ERR_PTR(-EIO); + } + *name_len = (name_end - name); + if (*name_len <= 0) { + pr_err_client(cl, "failed to parse long snapshot name\n"); + return ERR_PTR(-EIO); + } + + /* Get the inode number */ + inode_number = name_end + 1; + ret = kstrtou64(inode_number, 10, &vino.ino); + if (ret) { + doutc(cl, "failed to parse inode number: %s\n", str); + return ERR_PTR(ret); + } + + /* And finally the inode */ + dir = ceph_find_inode(parent->i_sb, vino); + if (!dir) { + /* This can happen if we're not mounting cephfs on the root */ + dir = ceph_get_inode(parent->i_sb, vino, NULL); + if (IS_ERR(dir)) + doutc(cl, "can't find inode %s (%s)\n", inode_number, name); + } + return dir; +} + +int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) +{ + struct ceph_client *cl = ceph_inode_to_client(parent); + struct inode *dir = parent; + char *p = buf; + u32 len; + int name_len = elen; + int ret; + u8 *cryptbuf = NULL; + + /* Handle the special case of snapshot names that start with '_' */ + if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') { + dir = parse_longname(parent, p, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + p++; /* skip initial '_' */ + } + + if (!fscrypt_has_encryption_key(dir)) + goto out; + + /* + * Convert cleartext d_name to ciphertext. If result is longer than + * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes + * + * See: fscrypt_setup_filename + */ + if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) { + elen = -ENAMETOOLONG; + goto out; + } + + /* Allocate a buffer appropriate to hold the result */ + cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? 
NAME_MAX : len, + GFP_KERNEL); + if (!cryptbuf) { + elen = -ENOMEM; + goto out; + } + + ret = fscrypt_fname_encrypt(dir, + &(struct qstr)QSTR_INIT(p, name_len), + cryptbuf, len); + if (ret) { + elen = ret; + goto out; + } + + /* hash the end if the name is long enough */ + if (len > CEPH_NOHASH_NAME_MAX) { + u8 hash[SHA256_DIGEST_SIZE]; + u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; + + /* + * hash the extra bytes and overwrite crypttext beyond that + * point with it + */ + sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); + memcpy(extra, hash, SHA256_DIGEST_SIZE); + len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; + } + + /* base64 encode the encrypted name */ + elen = base64_encode(cryptbuf, len, p, false, BASE64_IMAP); + doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p); + + /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ + WARN_ON(elen > 240); + if (dir != parent) // leading _ is already there; append _<inum> + elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino); + +out: + kfree(cryptbuf); + if (dir != parent) { + if ((inode_state_read_once(dir) & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return elen; +} + +/** + * ceph_fname_to_usr - convert a filename for userland presentation + * @fname: ceph_fname to be converted + * @tname: temporary name buffer to use for conversion (may be NULL) + * @oname: where converted name should be placed + * @is_nokey: set to true if key wasn't available during conversion (may be NULL) + * + * Given a filename (usually from the MDS), format it for presentation to + * userland. If @fname->dir is not encrypted, just pass it back as-is. + * + * Otherwise, base64 decode the string, and then ask fscrypt to format it + * for userland presentation. + * + * Returns 0 on success or negative error code on error. + */ +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + struct inode *dir = fname->dir; + struct fscrypt_str _tname = FSTR_INIT(NULL, 0); + struct fscrypt_str iname; + char *name = fname->name; + int name_len = fname->name_len; + int ret; + + /* Sanity check that the resulting name will fit in the buffer */ + if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX) + return -EIO; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (name[0] == '_')) { + dir = parse_longname(dir, name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + name++; /* skip initial '_' */ + } + + if (!IS_ENCRYPTED(dir)) { + oname->name = fname->name; + oname->len = fname->name_len; + ret = 0; + goto out_inode; + } + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret) + goto out_inode; + + /* + * Use the raw dentry name as sent by the MDS instead of + * generating a nokey name via fscrypt. 
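+ * (A nokey name generated locally by fscrypt could not be handed back + * to the MDS on a later lookup, while this raw form can be.) 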
+ */ + if (!fscrypt_has_encryption_key(dir)) { + if (fname->no_copy) + oname->name = fname->name; + else + memcpy(oname->name, fname->name, fname->name_len); + oname->len = fname->name_len; + if (is_nokey) + *is_nokey = true; + ret = 0; + goto out_inode; + } + + if (fname->ctext_len == 0) { + int declen; + + if (!tname) { + ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname); + if (ret) + goto out_inode; + tname = &_tname; + } + + declen = base64_decode(name, name_len, + tname->name, false, BASE64_IMAP); + if (declen <= 0) { + ret = -EIO; + goto out; + } + iname.name = tname->name; + iname.len = declen; + } else { + iname.name = fname->ctext; + iname.len = fname->ctext_len; + } + + ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); + if (!ret && (dir != fname->dir)) { + char tmp_buf[BASE64_CHARS(NAME_MAX)]; + + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + oname->len, oname->name, dir->i_ino); + memcpy(oname->name, tmp_buf, name_len); + oname->len = name_len; + } + +out: + fscrypt_fname_free_buffer(&_tname); +out_inode: + if (dir != fname->dir) { + if ((inode_state_read_once(dir) & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return ret; +} + +/** + * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper + * @dir: directory inode for readdir prep + * + * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as + * non-complete if this call results in having the directory unlocked. + * + * Returns: + * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked) + * 0 - if directory is still locked + * < 0 - if __fscrypt_prepare_readdir() fails + */ +int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + bool had_key = fscrypt_has_encryption_key(dir); + int err; + + if (!IS_ENCRYPTED(dir)) + return 0; + + err = __fscrypt_prepare_readdir(dir); + if (err) + return err; + if (!had_key && fscrypt_has_encryption_key(dir)) { + /* directory just got unlocked, mark it as not complete */ + ceph_dir_clear_complete(dir); + return 1; + } + return 0; +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, + ceph_vinop(inode), len, offs, lblk_num); + return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num); +} + +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, + ceph_vinop(inode), len, offs, lblk_num); + return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num); +} + +/** + * ceph_fscrypt_decrypt_pages - decrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the read data starts + * @len: max length to decrypt + * + * Decrypt an array of fscrypt'ed pages and return the amount of + * data decrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored (and should + * probably be zeroed by the caller). + * + * Returns the length of the decrypted data or a negative errno. 
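+ * + * For example (illustrative numbers): with 4K fscrypt blocks, a read of + * 9000 bytes at a block-aligned offset decrypts two whole blocks and + * returns 8192, leaving the 808-byte tail for the caller to zero. 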
+ */ +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Decrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} + +/** + * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer + * @inode: inode associated with pages being decrypted + * @page: pointer to page array + * @off: offset into the file that the data in page[0] starts + * @map: pointer to extent array + * @ext_cnt: length of extent array + * + * Given an extent map and a page array, decrypt the received data in-place, + * skipping holes. Returns the offset into buffer of end of last decrypted + * block. + */ +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + int i, ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Nothing to do for empty array */ + if (ext_cnt == 0) { + doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode, + ceph_vinop(inode)); + return 0; + } + + ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len, + &objno, &objoff, &xlen); + + for (i = 0; i < ext_cnt; ++i) { + struct ceph_sparse_extent *ext = &map[i]; + int pgsoff = ext->off - objoff; + int pgidx = pgsoff >> PAGE_SHIFT; + int fret; + + if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) { + pr_warn_client(cl, + "%p %llx.%llx bad encrypted sparse extent " + "idx %d off %llx len %llx\n", + inode, ceph_vinop(inode), i, ext->off, + ext->len); + return -EIO; + } + fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx], + off + pgsoff, ext->len); + doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode, + ceph_vinop(inode), i, ext->off, ext->len, fret); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret = pgsoff + fret; + } + doutc(cl, "ret %d\n", ret); + return ret; +} + +/** + * ceph_fscrypt_encrypt_pages - encrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the data starts + * @len: max length to encrypt + * + * Encrypt an array of cleartext pages and return the amount of + * data encrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored. + * + * Returns the length of the encrypted data or a negative errno. + */ +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. 
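+ * (i.e., round len down to a whole multiple of CEPH_FSCRYPT_BLOCK_SIZE). 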
+ */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Encrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 000000000000..b748e2060bc9 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ceph fscrypt functionality + */ + +#ifndef _CEPH_CRYPTO_H +#define _CEPH_CRYPTO_H + +#include <crypto/sha2.h> +#include <linux/fscrypt.h> +#include <linux/base64.h> + +#define CEPH_FSCRYPT_BLOCK_SHIFT 12 +#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT) +#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1)) + +struct ceph_fs_client; +struct ceph_acl_sec_ctx; +struct ceph_mds_request; + +struct ceph_fname { + struct inode *dir; + char *name; // b64 encoded, possibly hashed + unsigned char *ctext; // binary crypttext (if any) + u32 name_len; // length of name buffer + u32 ctext_len; // length of crypttext + bool no_copy; +}; + +/* + * Header for the encrypted file when truncating the size, this + * will be sent to MDS, and the MDS will update the encrypted + * last block and then truncate the size. + */ +struct ceph_fscrypt_truncate_size_header { + __u8 ver; + __u8 compat; + + /* + * It will be sizeof(assert_ver + file_offset + block_size) + * if the last block is empty when it's located in a file + * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE. + */ + __le32 data_len; + + __le64 change_attr; + __le64 file_offset; + __le32 block_size; +} __packed; + +struct ceph_fscrypt_auth { + __le32 cfa_version; + __le32 cfa_blob_len; + u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE]; +} __packed; + +#define CEPH_FSCRYPT_AUTH_VERSION 1 +static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) +{ + u32 ctxsize = le32_to_cpu(fa->cfa_blob_len); + + return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize; +} + +#ifdef CONFIG_FS_ENCRYPTION +/* + * We want to encrypt filenames when creating them, but the encrypted + * versions of those names may have illegal characters in them. To mitigate + * that, we base64 encode them, but that gives us a result that can exceed + * NAME_MAX. + * + * Follow a similar scheme to fscrypt itself, and cap the filename to a + * smaller size. If the ciphertext name is longer than the value below, then + * sha256 hash the remaining bytes. + * + * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph + * so the corresponding struct will be: + * + * struct fscrypt_ceph_nokey_name { + * u8 bytes[157]; + * u8 sha256[SHA256_DIGEST_SIZE]; + * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255) + * + * (240 bytes is the maximum size allowed for snapshot names to take into + * account the format: '_<SNAPSHOT-NAME>_<INODE-NUMBER>'.) + * + * Note that for long names that end up having their tail portion hashed, we + * must also store the full encrypted name (in the dentry's alternate_name + * field). 
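+ * + * With SHA256_DIGEST_SIZE of 32 this works out to 148 bytes: the first + * 148 bytes of ciphertext are kept verbatim and everything beyond them + * is folded into the 32-byte digest. 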
+ */ +#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE) + +void ceph_fscrypt_set_ops(struct super_block *sb); + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as); +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as); +int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len); + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (!IS_ENCRYPTED(parent)) + return 0; + return fscrypt_fname_alloc_buffer(NAME_MAX, fname); +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (IS_ENCRYPTED(parent)) + fscrypt_fname_free_buffer(fname); +} + +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey); +int ceph_fscrypt_prepare_readdir(struct inode *dir); + +static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len) +{ + /* crypto blocks cannot span more than one page */ + BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT); + + return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) - + (off >> CEPH_FSCRYPT_BLOCK_SHIFT); +} + +/* + * If we have an encrypted inode then we must adjust the offset and + * range of the on-the-wire read to cover an entire encryption block. + * The copy will be done using the original offset and length, after + * we've decrypted the result. + */ +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ + if (IS_ENCRYPTED(inode)) { + *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE; + *off &= CEPH_FSCRYPT_BLOCK_MASK; + } +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num); +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num); +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len); +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt); +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len); + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return fscrypt_is_bounce_page(page) ? 
fscrypt_pagecache_page(page) : page; +} + +#else /* CONFIG_FS_ENCRYPTION */ + +static inline void ceph_fscrypt_set_ops(struct super_block *sb) +{ +} + +static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ +} + +static inline int ceph_fscrypt_prepare_context(struct inode *dir, + struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + return 0; +} + +static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ +} + +static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf, + int len) +{ + return len; +} + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + return 0; +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ +} + +static inline int ceph_fname_to_usr(const struct ceph_fname *fname, + struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + oname->name = fname->name; + oname->len = fname->name_len; + return 0; +} + +static inline int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + return 0; +} + +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ +} + +static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_extents(struct inode *inode, + struct page **page, u64 off, + struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len) +{ + return 0; +} + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return page; +} +#endif /* CONFIG_FS_ENCRYPTION */ + +static inline loff_t ceph_fscrypt_page_offset(struct page *page) +{ + return page_offset(ceph_fscrypt_pagecache_page(page)); +} + +#endif /* _CEPH_CRYPTO_H */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6d59006bfa27..f3fe786b4143 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/device.h> @@ -6,6 +7,8 @@ #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/math64.h> +#include <linux/ktime.h> #include <linux/ceph/libceph.h> #include <linux/ceph/mon_client.h> @@ -17,27 +20,27 @@ #ifdef CONFIG_DEBUG_FS #include "mds_client.h" +#include "metric.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - 
&fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->possible_max_rank; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, - ceph_pr_addr(&addr->in_addr), + ceph_pr_addr(addr), ceph_mds_state_name(state)); } return 0; @@ -52,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -70,45 +71,46 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); - if (req->r_got_unsafe) - seq_printf(s, "\t(unsafe)"); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + seq_puts(s, "\t(unsafe)"); else - seq_printf(s, "\t"); + seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); - seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_dentry->d_parent->d_inode), - req->r_dentry->d_name.len, - req->r_dentry->d_name.name, + seq_printf(s, " #%llx/%pd (%s)", + ceph_ino(d_inode(req->r_dentry->d_parent)), + req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - kfree(path); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); - seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry_dir), - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, + seq_printf(s, " #%llx/%pd (%s)", + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, + req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - kfree(path); - } else if (req->r_path2) { + ceph_mdsc_free_path_info(&path_info); + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); @@ -116,49 +118,257 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, " %s", req->r_path2); } - seq_printf(s, "\n"); + seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); return 0; } +#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \ + s64 _total, _avg, _min, _max, _sq, _st; \ + _avg = ktime_to_us(avg); \ + _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ + _max = ktime_to_us(max); \ + _total = total - 1; \ + _sq = _total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ + _st = int_sqrt64(_sq); \ + _st = ktime_to_us(_st); \ + seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ + name, total, _avg, _min, _max, _st); \ +} + +#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \ + u64 _min = min == U64_MAX ? 0 : min; \ + seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \ + name, total, avg, _min, max, sum); \ +} + +static int metrics_file_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *m = &fsc->mdsc->metric; + + seq_printf(s, "item total\n"); + seq_printf(s, "------------------------------------------\n"); + seq_printf(s, "%-35s%lld\n", "total inodes", + percpu_counter_sum(&m->total_inodes)); + seq_printf(s, "%-35s%lld\n", "opened files", + atomic64_read(&m->opened_files)); + seq_printf(s, "%-35s%lld\n", "pinned i_caps", + atomic64_read(&m->total_caps)); + seq_printf(s, "%-35s%lld\n", "opened inodes", + percpu_counter_sum(&m->opened_inodes)); + return 0; +} + +static const char * const metric_str[] = { + "read", + "write", + "metadata", + "copyfrom" +}; +static int metrics_latency_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *cm = &fsc->mdsc->metric; + struct ceph_metric *m; + s64 total, avg, min, max, sq; + int i; + + seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); + seq_printf(s, "-----------------------------------------------------------------------------------\n"); + + for (i = 0; i < METRIC_MAX; i++) { + m = &cm->metric[i]; + spin_lock(&m->lock); + total = m->total; + avg = m->latency_avg; + min = m->latency_min; + max = m->latency_max; + sq = m->latency_sq_sum; + spin_unlock(&m->lock); + CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq); + } + + return 0; +} + +static int metrics_size_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *cm = &fsc->mdsc->metric; + struct ceph_metric *m; + s64 total; + u64 sum, avg, min, max; + int i; + + seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n"); + seq_printf(s, "----------------------------------------------------------------------------------------\n"); + + for (i = 0; i < METRIC_MAX; i++) { + /* skip 'metadata' as it doesn't use the size metric */ + if (i == METRIC_METADATA) + continue; + m = &cm->metric[i]; + spin_lock(&m->lock); + total = m->total; + sum = m->size_sum; + avg = total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->size_min; + max = m->size_max; + spin_unlock(&m->lock); + CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum); + } + + return 0; +} + +static int metrics_caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *m = &fsc->mdsc->metric; + int nr_caps = 0; + + seq_printf(s, "item total miss hit\n"); + seq_printf(s, "-------------------------------------------------\n"); + + seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease", + atomic64_read(&m->total_dentries), + percpu_counter_sum(&m->d_lease_mis), + percpu_counter_sum(&m->d_lease_hit)); + + nr_caps = atomic64_read(&m->total_caps); + seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, + percpu_counter_sum(&m->i_caps_mis), + percpu_counter_sum(&m->i_caps_hit)); + + return 0; +} + +static int caps_show_cb(struct inode *inode, int mds, void *p) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct seq_file *s = p; + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap) + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + spin_unlock(&ci->i_ceph_lock); + return 0; +} + static int caps_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; - int total, avail, used, reserved, min; + struct ceph_mds_client *mdsc = fsc->mdsc; + int total, avail, used, reserved, min, i; + struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" "reserved\t%d\n" - "min\t%d\n", + "min\t\t%d\n\n", total, avail, used, reserved, min); + seq_printf(s, "ino mds issued implemented\n"); + seq_printf(s, "--------------------------------------------------\n"); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; + mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); + ceph_iterate_session_caps(session, caps_show_cb, s); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + return 0; } -static int dentry_lru_show(struct seq_file *s, void *ptr) +static int mds_sessions_show(struct seq_file *s, void *ptr) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; - struct ceph_dentry_info *di; + struct ceph_auth_client *ac = fsc->client->monc.auth; + struct ceph_options *opt = fsc->client->options; + int mds; + + mutex_lock(&mdsc->mutex); + + /* The 'num' portion of an 'entity name' */ + seq_printf(s, "global_id %llu\n", ac->global_id); - spin_lock(&mdsc->dentry_lru_lock); - list_for_each_entry(di, &mdsc->dentry_lru, lru) { - struct dentry *dentry = di->dentry; - seq_printf(s, "%p %p\t%.*s\n", - di, dentry, dentry->d_name.len, dentry->d_name.name); + /* The -o name mount argument */ + seq_printf(s, "name \"%s\"\n", opt->name 
? opt->name : ""); + + /* The list of MDS session rank+state */ + for (mds = 0; mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = + __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + continue; + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "mds.%d %s\n", + session->s_mds, + ceph_session_state_name(session->s_state)); + + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); } - spin_unlock(&mdsc->dentry_lru_lock); + mutex_unlock(&mdsc->mutex); return 0; } -CEPH_DEFINE_SHOW_FUNC(mdsmap_show) -CEPH_DEFINE_SHOW_FUNC(mdsc_show) -CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) +static int status_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_entity_inst *inst = &fsc->client->msgr.inst; + struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client); + + seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), + ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); + seq_printf(s, "blocklisted: %s\n", str_true_false(fsc->blocklisted)); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mdsmap); +DEFINE_SHOW_ATTRIBUTE(mdsc); +DEFINE_SHOW_ATTRIBUTE(caps); +DEFINE_SHOW_ATTRIBUTE(mds_sessions); +DEFINE_SHOW_ATTRIBUTE(status); +DEFINE_SHOW_ATTRIBUTE(metrics_file); +DEFINE_SHOW_ATTRIBUTE(metrics_latency); +DEFINE_SHOW_ATTRIBUTE(metrics_size); +DEFINE_SHOW_ATTRIBUTE(metrics_caps); /* @@ -186,85 +396,86 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - dout("ceph_fs_debugfs_cleanup\n"); + doutc(fsc->client, "begin\n"); debugfs_remove(fsc->debugfs_bdi); debugfs_remove(fsc->debugfs_congestion_kb); debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_mdsc); - debugfs_remove(fsc->debugfs_dentry_lru); + debugfs_remove_recursive(fsc->debugfs_metrics_dir); + doutc(fsc->client, "done\n"); } -int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - char name[100]; - int err = -ENOMEM; + char name[NAME_MAX]; - dout("ceph_fs_debugfs_init\n"); - BUG_ON(!fsc->client->debugfs_dir); + doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, fsc->client->debugfs_dir, fsc, &congestion_kb_fops); - if (!fsc->debugfs_congestion_kb) - goto out; snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->backing_dev_info.dev)); + bdi_dev_name(fsc->sb->s_bdi)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, name); - if (!fsc->debugfs_bdi) - goto out; fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, + 0400, + fsc->client->debugfs_dir, + fsc, + &mdsmap_fops); + + fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", + 0400, fsc->client->debugfs_dir, fsc, - &mdsmap_show_fops); - if (!fsc->debugfs_mdsmap) - goto out; + &mds_sessions_fops); fsc->debugfs_mdsc = debugfs_create_file("mdsc", - 0600, + 0400, fsc->client->debugfs_dir, fsc, - &mdsc_show_fops); - if (!fsc->debugfs_mdsc) - goto out; + &mdsc_fops); fsc->debugfs_caps = debugfs_create_file("caps", - 0400, - fsc->client->debugfs_dir, - fsc, - &caps_show_fops); - if (!fsc->debugfs_caps) - goto out; - - fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0600, - fsc->client->debugfs_dir, - fsc, - &dentry_lru_show_fops); - if (!fsc->debugfs_dentry_lru) - goto out; - - return 0; - -out: 
- ceph_fs_debugfs_cleanup(fsc); - return err; + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_fops); + + fsc->debugfs_status = debugfs_create_file("status", + 0400, + fsc->client->debugfs_dir, + fsc, + &status_fops); + + fsc->debugfs_metrics_dir = debugfs_create_dir("metrics", + fsc->client->debugfs_dir); + + debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_file_fops); + debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_latency_fops); + debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_size_fops); + debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_caps_fops); + doutc(fsc->client, "done\n"); } #else /* CONFIG_DEBUG_FS */ -int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) +void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a40ceda47a32..86d7aa594ea9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,13 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> -#include <linux/fs_struct.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/xattr.h> #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -26,84 +28,154 @@ * point by name. */ -const struct inode_operations ceph_dir_iops; -const struct file_operations ceph_dir_fops; const struct dentry_operations ceph_dentry_ops; +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di); +static int __dir_lease_try_check(const struct dentry *dentry); + /* * Initialize ceph dentry state. */ -int ceph_init_dentry(struct dentry *dentry) +static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); - if (dentry->d_fsdata) - return 0; - - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) return -ENOMEM; /* oh well */ - spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) { - /* lost a race */ - kmem_cache_free(ceph_dentry_cachep, di); - goto out_unlock; - } - - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di->dentry = dentry; di->lease_session = NULL; - dentry->d_time = jiffies; - /* avoid reordering d_fsdata setup so that the check above is safe */ - smp_mb(); + di->time = jiffies; dentry->d_fsdata = di; - ceph_dentry_lru_add(dentry); -out_unlock: - spin_unlock(&dentry->d_lock); + INIT_LIST_HEAD(&di->lease_list); + + atomic64_inc(&mdsc->metric.total_dentries); + return 0; } -struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +/* + * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); + */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) { - struct inode *inode = NULL; + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return 
fpos; +} - if (!dentry) - return NULL; +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} - spin_lock(&dentry->d_lock); - if (!IS_ROOT(dentry)) { - inode = dentry->d_parent->d_inode; - ihold(inode); - } - spin_unlock(&dentry->d_lock); - return inode; +static unsigned fpos_frag(loff_t p) +{ + return p >> OFFSET_BITS; +} + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + +static unsigned fpos_off(loff_t p) +{ + return p & OFFSET_MASK; } +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. */ -static unsigned fpos_frag(loff_t p) +static int note_last_dentry(struct ceph_fs_client *fsc, + struct ceph_dir_file_info *dfi, + const char *name, + int len, unsigned next_offset) { - return p >> 32; + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(dfi->last_name); + dfi->last_name = buf; + memcpy(dfi->last_name, name, len); + dfi->last_name[len] = 0; + dfi->next_offset = next_offset; + doutc(fsc->client, "'%s'\n", dfi->last_name); + return 0; } -static unsigned fpos_off(loff_t p) + + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) { - return p & 0xffffffff; + struct inode *dir = d_inode(parent); + struct ceph_client *cl = ceph_inode_to_client(dir); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); + if (IS_ERR(cache_ctl->folio)) { + cache_ctl->folio = NULL; + doutc(cl, " folio %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_rwsem, no need to use folio lock */ + folio_unlock(cache_ctl->folio); + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_rwsem. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); } /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on - * d_u.d_child when we initially get results back from the MDS, and + * d_children when we initially get results back from the MDS, and * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * @@ -111,419 +183,563 @@ static unsigned fpos_off(loff_t p) * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). 
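* A revocation of Fs is thus also the signal that the cached ordering can * no longer be trusted, and readdir falls back to querying the MDS. 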
*/ -static int __dcache_readdir(struct file *file, struct dir_context *ctx) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + int shared_gen) { - struct ceph_file_info *fi = file->private_data; - struct dentry *parent = file->f_dentry; - struct inode *dir = parent->d_inode; - struct list_head *p; - struct dentry *dentry, *last; + struct ceph_dir_file_info *dfi = file->private_data; + struct dentry *parent = file->f_path.dentry; + struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir); + struct ceph_client *cl = ceph_inode_to_client(dir); + struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; + struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; int err = 0; - /* claim ref on last dentry we returned */ - last = fi->dentry; - fi->dentry = NULL; + doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir), + (unsigned)shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linear search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } - dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, - last); + doutc(cl, "%p %llx.%llx cache idx %llu\n", dir, + ceph_vinop(dir), idx); + } - spin_lock(&parent->d_lock); - /* start at beginning? */ - if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { - if (list_empty(&parent->d_subdirs)) - goto out_unlock; - p = parent->d_subdirs.prev; - dout(" initial p %p/%p\n", p->prev, p->next); - } else { - p = last->d_u.d_child.prev; - } + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { + dfi->file_info.flags |= CEPH_F_ATEND; + err = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } -more: - dentry = list_entry(p, struct dentry, d_u.d_child); - di = ceph_dentry(dentry); - while (1) { - dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, - d_unhashed(dentry) ? "!hashed" : "hashed", - parent->d_subdirs.prev, parent->d_subdirs.next); - if (p == &parent->d_subdirs) { - fi->flags |= CEPH_F_ATEND; - goto out_unlock; + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (d_unhashed(dentry) || + d_really_is_negative(dentry) || + di->lease_shared_gen != shared_gen || + ((dentry->d_flags & DCACHE_NOKEY_NAME) && + fscrypt_has_encryption_key(dir))) { + spin_unlock(&dentry->d_lock); + dput(dentry); + err = -EAGAIN; + goto out; + } + if (fpos_cmp(ctx->pos, di->offset) <= 0) { + __ceph_dentry_dir_lease_touch(di); + emit_dentry = true; } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!d_unhashed(dentry) && dentry->d_inode && - ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && - ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - ctx->pos <= di->offset) - break; - dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, - dentry->d_name.len, dentry->d_name.name, di->offset, - ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !dentry->d_inode ? 
" null" : ""); spin_unlock(&dentry->d_lock); - p = p->prev; - dentry = list_entry(p, struct dentry, d_u.d_child); - di = ceph_dentry(dentry); - } - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); + if (emit_dentry) { + doutc(cl, " %llx dentry %p %pd %p\n", di->offset, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, ceph_present_inode(d_inode(dentry)), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; - if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12)) { - if (last) { - /* remember our position */ - fi->dentry = last; - fi->next_offset = di->offset; + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); } - dput(dentry); - return 0; - } - - if (last) - dput(last); - last = dentry; - - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; } - - spin_lock(&parent->d_lock); - p = p->prev; /* advance to next dentry */ - goto more; - -out_unlock: - spin_unlock(&parent->d_lock); out: - if (last) + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(fsc, dfi, last->d_name.name, + last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; dput(last); + /* last_name no longer match cache index */ + if (dfi->readdir_cache_idx >= 0) { + dfi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; + } + } return err; } -/* - * make note of the last dentry we read, so we can - * continue at the same lexicographical point, - * regardless of what dir changes take place on the - * server. 
- */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, - int len) +static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos) { - kfree(fi->last_name); - fi->last_name = kmalloc(len+1, GFP_NOFS); - if (!fi->last_name) - return -ENOMEM; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - dout("note_last_dentry '%s'\n", fi->last_name); - return 0; + if (!dfi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos)); + else + return dfi->frag != fpos_frag(pos); } static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + struct ceph_client *cl = fsc->client; + int i; int err; - u32 ftype; + unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = fsc->mount_options->max_readdir; - const int max_bytes = fsc->mount_options->max_readdir_bytes; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); - if (fi->flags & CEPH_F_ATEND) + doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode, + ceph_vinop(inode), file, ctx->pos); + if (dfi->file_info.flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ if (ctx->pos == 0) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - - dout("readdir off 0 -> '.'\n"); - if (!dir_emit(ctx, ".", 1, - ceph_translate_ino(inode->i_sb, inode->i_ino), + doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode, + ceph_vinop(inode)); + if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode), inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { - ino_t ino = parent_ino(file->f_dentry); - dout("readdir off 1 -> '..'\n"); - if (!dir_emit(ctx, "..", 2, - ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12)) + u64 ino; + struct dentry *dentry = file->f_path.dentry; + + spin_lock(&dentry->d_lock); + ino = ceph_present_inode(dentry->d_parent->d_inode); + spin_unlock(&dentry->d_lock); + + doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode, + ceph_vinop(inode)); + if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } - /* can we use the dcache? */ + err = ceph_fscrypt_prepare_readdir(inode); + if (err < 0) + return err; + spin_lock(&ci->i_ceph_lock); - if ((ctx->pos == 2 || fi->dentry) && + /* request Fx cap. if have Fx, we don't need to release Fs cap + * for later create/unlink. */ + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR); + /* can we use the dcache? 
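(only while the dir is still complete and ordered and Fs is still issued) 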
*/ + if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - __ceph_dir_is_complete(ci) && - __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + __ceph_dir_is_complete_ordered(ci) && + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { + int shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(file, ctx); + err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; } else { spin_unlock(&ci->i_ceph_lock); } - if (fi->dentry) { - err = note_last_dentry(fi, fi->dentry->d_name.name, - fi->dentry->d_name.len); - if (err) - return err; - dput(fi->dentry); - fi->dentry = NULL; - } /* proceed with a normal readdir */ - more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(dfi, ctx->pos)) { struct ceph_mds_request *req; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; /* discard old result, if any */ - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); + if (is_hash_order(ctx->pos)) { + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", - ceph_vinop(inode), frag, fi->last_name); + doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n", + inode, ceph_vinop(inode), frag, dfi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = inode; - ihold(inode); - req->r_dentry = dget(file->f_dentry); + + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; - req->r_direct_hash = ceph_frag_value(frag); - req->r_direct_is_hash = true; - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); - req->r_readdir_offset = fi->next_offset; + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + req->r_inode_drop = CEPH_CAP_FILE_EXCL; + } + if (dfi->last_name) { + int len = strlen(dfi->last_name); + + req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + memcpy(req->r_path2, dfi->last_name, len); + + err = ceph_encode_encrypted_dname(inode, req->r_path2, len); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); + } + + req->r_dir_release_cnt = dfi->dir_release_count; + req->r_dir_ordered_cnt = dfi->dir_ordered_count; + req->r_readdir_cache_idx = dfi->readdir_cache_idx; + req->r_readdir_offset = dfi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; + req->r_args.readdir.flags = + 
cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, - (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); + doutc(cl, "%p %llx.%llx got and parsed readdir result=%d" + " on frag %x, end=%d, complete=%d, hash_order=%d\n", + inode, ceph_vinop(inode), err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); + + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (!rinfo->hash_order) { + dfi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + dfi->next_offset, + false); + } + } - if (!req->r_did_prepopulate) { - dout("readdir !did_prepopulate"); + dfi->frag = frag; + dfi->last_readdir = req; + + if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { + dfi->readdir_cache_idx = req->r_readdir_cache_idx; + if (dfi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + dfi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && + dfi->next_offset == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + dfi->dir_release_count = req->r_dir_release_cnt; + dfi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode, + ceph_vinop(inode)); + /* disable readdir cache */ + dfi->readdir_cache_idx = -1; /* preclude from marking dir complete */ - fi->dir_release_count--; + dfi->dir_release_count = 0; } /* note next offset and last dentry name */ - fi->offset = fi->next_offset; - fi->last_readdir = req; - - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - } else { - rinfo = &req->r_reply_info; - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1]); - if (err) + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? 
+ 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(fsc, dfi, rde->name, + rde->name_len, next_offset); + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; - fi->next_offset += rinfo->dir_nr; + } + } else if (req->r_reply_info.dir_end) { + dfi->next_offset = 2; + /* keep last name */ } } - rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); - - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; - struct ceph_vino vino; - ino_t ino; - - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); - ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { - dout("filldir stopping us...\n"); + rinfo = &dfi->last_readdir->r_reply_info; + doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n", + inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; + + if (rde->offset < ctx->pos) { + pr_warn_client(cl, + "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n", + inode, ceph_vinop(inode), rde->offset, ctx->pos); + return -EIO; + } + + if (WARN_ON_ONCE(!rde->inode.in)) + return -EIO; + + ctx->pos = rde->offset; + doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode, + ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), + le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ + doutc(cl, "filldir stopping us...\n"); return 0; } - off++; + + /* Reset the lengths to their original allocated vals */ ctx->pos++; } - if (fi->last_name) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + + if (dfi->next_offset > 2) { + frag = dfi->frag; goto more; } /* more frags? 
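(this chunk is exhausted; if the current frag was not the rightmost one, advance to the next frag and fetch another chunk) 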
*/ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - off = 0; - ctx->pos = ceph_make_fpos(frag, off); - dout("readdir next frag is %x\n", frag); + if (!ceph_frag_is_rightmost(dfi->frag)) { + frag = ceph_frag_next(dfi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + dfi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, dfi->next_offset, + false); + kfree(dfi->last_name); + dfi->last_name = NULL; + } + doutc(cl, "%p %llx.%llx next frag is %x\n", inode, + ceph_vinop(inode), frag); goto more; } - fi->flags |= CEPH_F_ATEND; + dfi->file_info.flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; + if (atomic64_read(&ci->i_release_count) == + dfi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (dfi->dir_ordered_count == + atomic64_read(&ci->i_ordered_count)) { + doutc(cl, " marking %p %llx.%llx complete and ordered\n", + inode, ceph_vinop(inode)); + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(dfi->readdir_cache_idx < 0); + i_size_write(inode, dfi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { + doutc(cl, " marking %llx.%llx complete\n", + ceph_vinop(inode)); + } + __ceph_dir_set_complete(ci, dfi->dir_release_count, + dfi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&ci->i_ceph_lock); - - dout("readdir %p file %p done.\n", inode, file); + doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode), + file); return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_dir_file_info *dfi) { - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; - } - kfree(fi->last_name); - fi->last_name = NULL; - fi->next_offset = 2; /* compensate for . and .. */ - if (fi->dentry) { - dput(fi->dentry); - fi->dentry = NULL; - } - fi->flags &= ~CEPH_F_ATEND; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; + } + kfree(dfi->last_name); + dfi->last_name = NULL; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + dfi->next_offset = 2; /* compensate for . and .. */ + dfi->file_info.flags &= ~CEPH_F_ATEND; +} + +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sorted in hash order */ + } else if (dfi->frag != fpos_frag(new_pos)) { + return true; + } + rinfo = dfi->last_readdir ? 
&dfi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + struct ceph_client *cl = ceph_inode_to_client(inode); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { - case SEEK_END: - offset += inode->i_size + 2; /* FIXME */ - break; case SEEK_CUR: offset += file->f_pos; + break; case SEEK_SET: break; + case SEEK_END: + retval = -EOPNOTSUPP; + goto out; default: goto out; } - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0) { + if (need_reset_readdir(dfi, offset)) { + doutc(cl, "%p %llx.%llx dropping %p content\n", + inode, ceph_vinop(inode), file); + reset_readdir(dfi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; - fi->flags &= ~CEPH_F_ATEND; + dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; - - /* - * discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk. - */ - if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || - fpos_off(offset) < fi->offset) { - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); - } - - /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) - fi->dir_release_count--; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } /* * Handle lookups for the hidden .snap directory. */ -int ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) +struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ + struct ceph_client *cl = ceph_inode_to_client(parent); /* .snap dir? */ - if (err == -ENOENT && - ceph_snap(parent) == CEPH_NOSNAP && - strcmp(dentry->d_name.name, - fsc->mount_options->snapdir_name) == 0) { + if (ceph_snap(parent) == CEPH_NOSNAP && + strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { + struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); - dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", - dentry, dentry->d_name.len, dentry->d_name.name, inode); - BUG_ON(!d_unhashed(dentry)); - d_add(dentry, inode); - err = 0; + + res = d_splice_alias(inode, dentry); + doutc(cl, "ENOENT on snapdir %p '%pd', linking to " + "snapdir %p %llx.%llx. 
Spliced dentry %p\n", + dentry, dentry, inode, ceph_vinop(inode), res); + if (res) + dentry = res; } - return err; + return dentry; } /* @@ -540,13 +756,16 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { + struct ceph_client *cl = req->r_mdsc->fsc->client; + if (err == -ENOENT) { /* no trace? */ err = 0; if (!req->r_reply_info.head->is_dentry) { - dout("ENOENT and no trace, dentry %p inode %p\n", - dentry, dentry->d_inode); - if (dentry->d_inode) { + doutc(cl, + "ENOENT and no trace, dentry %p inode %llx.%llx\n", + dentry, ceph_vinop(d_inode(dentry))); + if (d_really_is_positive(dentry)) { d_drop(dentry); err = -ENOENT; } else { @@ -563,7 +782,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; @@ -576,39 +795,53 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_request *req; int op; + int mask; int err; - dout("lookup %p dentry %p '%.*s'\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name); + doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir), + dentry, dentry); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - err = ceph_init_dentry(dentry); - if (err < 0) - return ERR_PTR(err); + if (IS_ENCRYPTED(dir)) { + bool had_key = fscrypt_has_encryption_key(dir); + + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + return ERR_PTR(err); + + /* mark directory as incomplete if it has been unlocked */ + if (!had_key && fscrypt_has_encryption_key(dir)) + ceph_dir_clear_complete(dir); + } /* can we conclude ENOENT locally? 
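(a negative dentry can be returned without asking the MDS only when our cache is authoritative for the parent directory) 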
*/ - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&ci->i_ceph_lock); - dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + doutc(cl, " dir %llx.%llx flags are 0x%lx\n", + ceph_vinop(dir), ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && - (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); - dout(" dir %p complete, -ENOENT\n", dir); + doutc(cl, " dir %llx.%llx complete, -ENOENT\n", + ceph_vinop(dir)); d_add(dentry, NULL); - di->lease_shared_gen = ci->i_shared_gen; + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); return NULL; } spin_unlock(&ci->i_ceph_lock); @@ -621,14 +854,30 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; - /* we only need inode linkage */ - req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); - req->r_locked_dir = dir; + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + ihold(dir); + req->r_parent = dir; + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - err = ceph_handle_snapdir(req, dentry, err); + if (err == -ENOENT) { + struct dentry *res; + + res = ceph_handle_snapdir(req, dentry); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } + } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ - dout("lookup result=%p\n", dentry); + doutc(cl, "result=%p\n", dentry); return dentry; } @@ -644,143 +893,322 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) /* * We created the item, then did a lookup, and found * it was already linked to another inode we already - * had in our cache (and thus got spliced). Link our - * dentry to that inode, but don't hash it, just in - * case the VFS wants to dereference it. + * had in our cache (and thus got spliced). To not + * confuse VFS (especially when inode is a directory), + * we don't link our dentry to that inode, return an + * error instead. + * + * This event should be rare and it happens only when + * we talk to old MDS. Recent MDS does not send traceless + * reply for request that creates new inode. 
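(so the d_drop() and -ESTALE below act as a compatibility path for such legacy servers) 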
*/ - BUG_ON(!result->d_inode); - d_instantiate(dentry, result->d_inode); - return 0; + d_drop(result); + return -ESTALE; } return PTR_ERR(result); } -static int ceph_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; int err; if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", - dir, dentry, mode, rdev); + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } + + doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n", + dir, ceph_vinop(dir), dentry, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { - d_drop(dentry); - return PTR_ERR(req); + err = PTR_ERR(req); + goto out; + } + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; } + + if (S_ISREG(mode) && IS_ENCRYPTED(dir)) + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); - if (err) +out: + if (!err) + ceph_init_inode_acls(d_inode(dentry), &as_ctx); + else d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(dir, dentry, mode, 0); + return ceph_mknod(idmap, dir, dentry, mode, 0); } -static int ceph_symlink(struct inode *dir, struct dentry *dentry, - const char *dest) +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + int err; + int len = strlen(dest); + struct fscrypt_str osd_link = FSTR_INIT(NULL, 0); + + err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX, + &osd_link); + if (err) + goto out; + + err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link); + if (err) + goto out; + + req->r_path2 = kmalloc(BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out; + } + + len = base64_encode(osd_link.name, osd_link.len, + req->r_path2, false, BASE64_IMAP); + req->r_path2[len] = '\0'; +out: + 
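/* the encrypted target is binary, hence stored base64-encoded above; free the fscrypt buffer on both the success and error paths */ 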
fscrypt_fname_free_buffer(&osd_link); + return err; +} +#else +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + return -EOPNOTSUPP; +} +#endif + +static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *dest) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; + struct ceph_acl_sec_ctx as_ctx = {}; + umode_t mode = S_IFLNK | 0777; int err; if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } + + doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry, + dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { - d_drop(dentry); - return PTR_ERR(req); + err = PTR_ERR(req); + goto out; + } + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + + req->r_parent = dir; + ihold(dir); + + if (IS_ENCRYPTED(req->r_new_inode)) { + err = prep_encrypted_symlink_target(req, dest); + if (err) + goto out_req; + } else { + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out_req; + } } + + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_path2 = kstrdup(dest, GFP_NOFS); - req->r_locked_dir = dir; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); +out: if (err) d_drop(dentry); + ceph_release_acl_sec_ctx(&as_ctx); return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; - int err = -EROFS; + struct ceph_acl_sec_ctx as_ctx = {}; + struct dentry *ret; + int err; int op; + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return ERR_PTR(err); + if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; - dout("mksnap dir %p snap '%.*s' dn %p\n", dir, - dentry->d_name.len, dentry->d_name.name, dentry); + doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n", + ceph_vinop(dir), dentry, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); + doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n", + ceph_vinop(dir), dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { + ret = ERR_PTR(-EROFS); + goto out; + } + + if (op == CEPH_MDS_OP_MKDIR && + ceph_quota_is_max_files_exceeded(dir)) { + ret = ERR_PTR(-EDQUOT); + goto out; + } + if ((op == CEPH_MDS_OP_MKSNAP) && 
IS_ENCRYPTED(dir) && + !fscrypt_has_encryption_key(dir)) { + ret = ERR_PTR(-ENOKEY); goto out; } + + + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { - err = PTR_ERR(req); + ret = ERR_CAST(req); goto out; } + mode |= S_IFDIR; + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + ret = ERR_CAST(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; + req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + if (op == CEPH_MDS_OP_MKDIR) + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_args.mkdir.mode = cpu_to_le32(mode); - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + ret = ERR_PTR(err); +out_req: + if (!IS_ERR(ret) && req->r_dentry != dentry) + /* Some other dentry was spliced in */ + ret = dget(req->r_dentry); ceph_mdsc_put_request(req); out: - if (err < 0) + if (!IS_ERR(ret)) { + if (ret) + dentry = ret; + ceph_init_inode_acls(d_inode(dentry), &as_ctx); + } else { d_drop(dentry); - return err; + } + ceph_release_acl_sec_ctx(&as_ctx); + return ret; } static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; int err; + if (dentry->d_flags & DCACHE_DISCONNECTED) + return -EINVAL; + + err = ceph_wait_on_conflict_unlink(dentry); + if (err) + return err; + if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("link in dir %p old_dentry %p dentry %p\n", dir, - old_dentry, dentry); + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir), + old_dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); if (IS_ERR(req)) { d_drop(dentry); @@ -788,40 +1216,118 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); - req->r_locked_dir = dir; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry = dget(old_dentry); + /* + * The old_dentry may be a DCACHE_DISCONNECTED dentry, in which case we + * will just pass the ino# to MDSs. 
+ */ + if (old_dentry->d_flags & DCACHE_DISCONNECTED) + req->r_ino2 = ceph_vino(d_inode(old_dentry)); + req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); if (err) { d_drop(dentry); } else if (!req->r_reply_info.head->is_dentry) { - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); } ceph_mdsc_put_request(req); return err; } -/* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it - * looks like the link count will hit 0, drop any other caps (other - * than PIN) we don't specifically want (due to the file still being - * open). - */ -static int drop_caps_for_unlink(struct inode *inode) +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) { - struct ceph_inode_info *ci = ceph_inode(inode); - int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + struct dentry *dentry = req->r_dentry; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); + struct ceph_client *cl = fsc->client; + struct ceph_dentry_info *di = ceph_dentry(dentry); + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn_client(cl, + "dentry %p:%pd async unlink bit is not set\n", + dentry, dentry); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags); + spin_unlock(&dentry->d_lock); + + synchronize_rcu(); + + if (result == -EJUKEBOX) + goto out; + + /* If op failed, mark everyone involved for errors */ + if (result) { + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); + + /* mark error on parent + clear complete */ + mapping_set_error(req->r_parent->i_mapping, result); + ceph_dir_clear_complete(req->r_parent); + + /* drop the dentry -- we don't know its status */ + if (!d_unhashed(dentry)) + d_drop(dentry); + + /* mark inode itself for an error (since metadata is bogus) */ + mapping_set_error(req->r_old_inode->i_mapping, result); + + pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); + } +out: + iput(req->r_old_inode); + ceph_mdsc_release_dir_caps(req); +} + +static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di; + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK; spin_lock(&ci->i_ceph_lock); - if (inode->i_nlink == 1) { - drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; + if ((__ceph_caps_issued(ci, NULL) & want) == want) { + ceph_take_cap_refs(ci, want, false); + got = want; } spin_unlock(&ci->i_ceph_lock); - return drop; + + /* If we didn't get anything, return 0 */ + if (!got) + return 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + /* + * - We are holding Fx, which implies Fs caps. 
+ * - Only support async unlink for primary linkage + */ + if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen || + !(di->flags & CEPH_DENTRY_PRIMARY_LINK)) + want = 0; + spin_unlock(&dentry->d_lock); + + /* Do we still want what we've got? */ + if (want == got) + return got; + + ceph_put_cap_refs(ci, got); + return 0; } /* @@ -829,25 +1335,55 @@ static int drop_caps_for_unlink(struct inode *inode) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); + struct dentry *dn; int err = -EROFS; int op; + char *path; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ - dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, - dentry->d_name.name, dentry); + doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir), + dentry); op = CEPH_MDS_OP_RMSNAP; } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("unlink/rmdir dir %p dn %p inode %p\n", - dir, dentry, inode); - op = S_ISDIR(dentry->d_inode->i_mode) ? + doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n", + ceph_vinop(dir), dentry, ceph_vinop(inode)); + op = d_is_dir(dentry) ? + CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; + + dn = d_find_alias(dir); + if (!dn) { + try_async = false; + } else { + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); + if (IS_ERR(path)) { + try_async = false; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, MAY_WRITE); + } + ceph_mdsc_free_path_info(&path_info); + dput(dn); + + /* For errors other than EACCES, let the MDS do the auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + try_async = false; + err = 0; + } + } + +retry: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -855,49 +1391,127 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_locked_dir = dir; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_parent = dir; + ihold(dir); + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_inode_drop = drop_caps_for_unlink(inode); - err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) - d_delete(dentry); + req->r_inode_drop = ceph_drop_caps_for_unlink(inode); + + if (try_async && op == CEPH_MDS_OP_UNLINK && + (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + struct ceph_dentry_info *di = ceph_dentry(dentry); + + doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s", + ceph_vinop(dir), dentry, + ceph_cap_string(req->r_dir_caps)); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_callback = ceph_async_unlink_cb; + req->r_old_inode = d_inode(dentry); + ihold(req->r_old_inode); + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + spin_lock(&fsc->async_unlink_conflict_lock); + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, + dentry->d_name.hash); + spin_unlock(&fsc->async_unlink_conflict_lock); + + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + /* + * We have enough caps, so we 
assume that the unlink + * will succeed. Fix up the target inode and dcache. + */ + drop_nlink(inode); + d_delete(dentry); + } else { + spin_lock(&fsc->async_unlink_conflict_lock); + hash_del_rcu(&di->hnode); + spin_unlock(&fsc->async_unlink_conflict_lock); + + spin_lock(&dentry->d_lock); + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; + spin_unlock(&dentry->d_lock); + + if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } + } + } else { + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + } + ceph_mdsc_put_request(req); out: return err; } -static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; int err; + if (flags) + return -EINVAL; + if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; - if (ceph_snap(old_dir) != CEPH_NOSNAP || - ceph_snap(new_dir) != CEPH_NOSNAP) - return -EROFS; - dout("rename dir %p dentry %p to dir %p dentry %p\n", - old_dir, old_dentry, new_dir, new_dentry); - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } + /* don't allow cross-quota renames */ + if ((old_dir != new_dir) && + (!ceph_quota_is_same_realm(old_dir, new_dir))) + return -EXDEV; + + err = ceph_wait_on_conflict_unlink(new_dentry); + if (err) + return err; + + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n", + ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir), + new_dentry); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); - req->r_locked_dir = new_dir; - req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_old_dentry_dir = old_dir; + req->r_parent = new_dir; + ihold(new_dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ - req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; - if (new_dentry->d_inode) - req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + if (d_really_is_positive(new_dentry)) { + req->r_inode_drop = + ceph_drop_caps_for_unlink(d_inode(new_dentry)); + } err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* @@ -905,28 +1519,289 @@ 
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * do_request, above). If there is no trace, we need * to do it here. */ - - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); - - /* ensure target dentry is invalidated, despite - rehashing bug in vfs_rename_dir */ - ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; } /* + * Move dentry to tail of mdsc->dentry_leases list when lease is updated. + * Leases at front of the list will expire first. (Assume all leases have + * similar duration) + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "%p %p '%pd'\n", di, dn, dn); + + di->flags |= CEPH_DENTRY_LEASE_LIST; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + + spin_lock(&mdsc->dentry_list_lock); + list_move_tail(&di->lease_list, &mdsc->dentry_leases); + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc, + struct ceph_dentry_info *di) +{ + di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED); + di->lease_gen = 0; + di->time = jiffies; + list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases); +} + +/* + * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases + * list if it's not in the list, otherwise set 'referenced' flag. + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset); + + if (!list_empty(&di->lease_list)) { + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + /* don't remove dentry from dentry lease list + * if its lease is valid */ + if (__dentry_lease_is_valid(di)) + return; + } else { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + } + + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + di->flags &= ~CEPH_DENTRY_LEASE_LIST; + return; + } + + spin_lock(&mdsc->dentry_list_lock); + __dentry_dir_lease_touch(mdsc, di); + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_lease_unlist(struct ceph_dentry_info *di) +{ + struct ceph_mds_client *mdsc; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) + return; + if (list_empty(&di->lease_list)) + return; + + mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_del_init(&di->lease_list); + spin_unlock(&mdsc->dentry_list_lock); +} + +enum { + KEEP = 0, + DELETE = 1, + TOUCH = 2, + STOP = 4, +}; + +struct ceph_lease_walk_control { + bool dir_lease; + bool expire_dir_lease; + unsigned long nr_to_scan; + unsigned long dir_lease_ttl; +}; + +static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *); +static int __dentry_lease_check(const struct dentry *); + +static unsigned long +__dentry_leases_walk(struct ceph_mds_client *mdsc, + struct ceph_lease_walk_control *lwc) +{ + struct ceph_dentry_info *di, *tmp; + struct dentry *dentry, *last = NULL; + struct list_head* list; + LIST_HEAD(dispose); + unsigned long freed = 0; + int ret = 0; + + list = lwc->dir_lease ? 
&mdsc->dentry_dir_leases : &mdsc->dentry_leases; + spin_lock(&mdsc->dentry_list_lock); + list_for_each_entry_safe(di, tmp, list, lease_list) { + if (!lwc->nr_to_scan) + break; + --lwc->nr_to_scan; + + dentry = di->dentry; + if (last == dentry) + break; + + if (!spin_trylock(&dentry->d_lock)) + continue; + + if (__lockref_is_dead(&dentry->d_lockref)) { + list_del_init(&di->lease_list); + goto next; + } + + if (lwc->dir_lease) + ret = __dir_lease_check(dentry, lwc); + else + ret = __dentry_lease_check(dentry); + if (ret & TOUCH) { + /* move it into tail of dir lease list */ + __dentry_dir_lease_touch(mdsc, di); + if (!last) + last = dentry; + } + if (ret & DELETE) { + /* stale lease */ + di->flags &= ~CEPH_DENTRY_REFERENCED; + if (dentry->d_lockref.count > 0) { + /* update_dentry_lease() will re-add + * it to lease list, or + * ceph_d_delete() will return 1 when + * last reference is dropped */ + list_del_init(&di->lease_list); + } else { + di->flags |= CEPH_DENTRY_SHRINK_LIST; + list_move_tail(&di->lease_list, &dispose); + dget_dlock(dentry); + } + } +next: + spin_unlock(&dentry->d_lock); + if (ret & STOP) + break; + } + spin_unlock(&mdsc->dentry_list_lock); + + while (!list_empty(&dispose)) { + di = list_first_entry(&dispose, struct ceph_dentry_info, + lease_list); + dentry = di->dentry; + spin_lock(&dentry->d_lock); + + list_del_init(&di->lease_list); + di->flags &= ~CEPH_DENTRY_SHRINK_LIST; + if (di->flags & CEPH_DENTRY_REFERENCED) { + spin_lock(&mdsc->dentry_list_lock); + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + list_add_tail(&di->lease_list, + &mdsc->dentry_leases); + } else { + __dentry_dir_lease_touch(mdsc, di); + } + spin_unlock(&mdsc->dentry_list_lock); + } else { + freed++; + } + + spin_unlock(&dentry->d_lock); + /* ceph_d_delete() does the trick */ + dput(dentry); + } + return freed; +} + +static int __dentry_lease_check(const struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + int ret; + + if (__dentry_lease_is_valid(di)) + return STOP; + ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) + return TOUCH; + return DELETE; +} + +static int __dir_lease_check(const struct dentry *dentry, + struct ceph_lease_walk_control *lwc) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + + int ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) { + if (time_before(jiffies, di->time + lwc->dir_lease_ttl)) + return STOP; + /* Move dentry to tail of dir lease list if we don't want + * to delete it. 
So dentries in the list are checked in a + * round-robin manner */ + if (!lwc->expire_dir_lease) + return TOUCH; + if (dentry->d_lockref.count > 0 || + (di->flags & CEPH_DENTRY_REFERENCED)) + return TOUCH; + /* invalidate dir lease */ + di->lease_shared_gen = 0; + } + return DELETE; +} + +int ceph_trim_dentries(struct ceph_mds_client *mdsc) +{ + struct ceph_lease_walk_control lwc; + unsigned long count; + unsigned long freed; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + count = mdsc->caps_use_count - mdsc->caps_use_max; + else + count = 0; + spin_unlock(&mdsc->caps_list_lock); + + lwc.dir_lease = false; + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; + freed = __dentry_leases_walk(mdsc, &lwc); + if (!lwc.nr_to_scan) /* more invalid leases */ + return -EAGAIN; + + if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE) + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE; + + lwc.dir_lease = true; + lwc.expire_dir_lease = freed < count; + lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; + freed += __dentry_leases_walk(mdsc, &lwc); + if (!lwc.nr_to_scan) /* more to check */ + return -EAGAIN; + + return freed > 0 ? 1 : 0; +} + +/* * Ensure a dentry lease will no longer revalidate. */ void ceph_invalidate_dentry_lease(struct dentry *dentry) { + struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); - dentry->d_time = jiffies; - ceph_dentry(dentry)->lease_shared_gen = 0; + di->time = jiffies; + di->lease_shared_gen = 0; + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; + __dentry_lease_unlist(di); spin_unlock(&dentry->d_lock); } @@ -934,35 +1809,55 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) * Check if dentry lease is valid. If not, delete the lease. Try to * renew if the lease is more than half up. */ -static int dentry_lease_is_valid(struct dentry *dentry) +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) +{ + struct ceph_mds_session *session; + + if (!di->lease_gen) + return false; + + session = di->lease_session; + if (session) { + u32 gen; + unsigned long ttl; + + gen = atomic_read(&session->s_cap_gen); + ttl = session->s_cap_ttl; + + if (di->lease_gen == gen && + time_before(jiffies, ttl) && + time_before(jiffies, di->time)) + return true; + } + di->lease_gen = 0; + return false; +} + +static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags) { struct ceph_dentry_info *di; - struct ceph_mds_session *s; - int valid = 0; - u32 gen; - unsigned long ttl; struct ceph_mds_session *session = NULL; - struct inode *dir = NULL; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; u32 seq = 0; + int valid = 0; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di->lease_session) { - s = di->lease_session; - spin_lock(&s->s_gen_ttl_lock); - gen = s->s_cap_gen; - ttl = s->s_cap_ttl; - spin_unlock(&s->s_gen_ttl_lock); + if (di && __dentry_lease_is_valid(di)) { + valid = 1; - if (di->lease_gen == gen && - time_before(jiffies, dentry->d_time) && - time_before(jiffies, ttl)) { - valid = 1; - if (di->lease_renew_after && - time_after(jiffies, di->lease_renew_after)) { - /* we should renew */ - dir = dentry->d_parent->d_inode; - session = ceph_get_mds_session(s); + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. 
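The VFS then retries the lookup in ref-walk mode, where the renew message can be sent safely. 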
+ */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(di->lease_session); seq = di->lease_seq; di->lease_renew_after = 0; di->lease_renew_from = jiffies; @@ -972,95 +1867,221 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_unlock(&dentry->d_lock); if (session) { - ceph_mdsc_lease_send_msg(session, dir, dentry, + ceph_mdsc_lease_send_msg(session, dentry, CEPH_MDS_LEASE_RENEW, seq); ceph_put_mds_session(session); } - dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + doutc(cl, "dentry %p = %d\n", dentry, valid); return valid; } /* - * Check if directory-wide content lease/cap is valid. + * Called under dentry->d_lock. */ -static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +static int __dir_lease_try_check(const struct dentry *dentry) { - struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); + struct inode *dir; + struct ceph_inode_info *ci; int valid = 0; + if (!di->lease_shared_gen) + return 0; + if (IS_ROOT(dentry)) + return 0; + + dir = d_inode(dentry->d_parent); + ci = ceph_inode(dir); + + if (spin_trylock(&ci->i_ceph_lock)) { + if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0)) + valid = 1; + spin_unlock(&ci->i_ceph_lock); + } else { + valid = -EBUSY; + } + + if (!valid) + di->lease_shared_gen = 0; + return valid; +} + +/* + * Check if directory-wide content lease/cap is valid. + */ +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, + struct ceph_mds_client *mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_client *cl = mdsc->fsc->client; + int valid; + int shared_gen; + spin_lock(&ci->i_ceph_lock); - if (ci->i_shared_gen == di->lease_shared_gen) - valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); + if (valid) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); + shared_gen = atomic_read(&ci->i_shared_gen); + } spin_unlock(&ci->i_ceph_lock); - dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", - dir, (unsigned)ci->i_shared_gen, dentry, - (unsigned)di->lease_shared_gen, valid); + if (valid) { + struct ceph_dentry_info *di; + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + if (dir == d_inode(dentry->d_parent) && + di && di->lease_shared_gen == shared_gen) + __ceph_dentry_dir_lease_touch(di); + else + valid = 0; + spin_unlock(&dentry->d_lock); + } + doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir, + ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen), + dentry, dentry, valid); return valid; } /* * Check if cached dentry can be trusted. 
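A dentry is trusted when its own lease is still valid, or when the parent directory's shared lease/cap covers it and the inode retains caps; failing that, fall back to a LOOKUP/LOOKUPSNAP to the MDS and compare the result. 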
*/ -static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) +static int ceph_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; int valid = 0; - struct inode *dir; + struct inode *inode; - if (flags & LOOKUP_RCU) - return -ECHILD; + valid = fscrypt_d_revalidate(dir, name, dentry, flags); + if (valid <= 0) + return valid; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode, - ceph_dentry(dentry)->offset); + inode = d_inode_rcu(dentry); - dir = ceph_get_dentry_parent_inode(dentry); + doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", + dentry, dentry, inode, ceph_dentry(dentry)->offset, + !!(dentry->d_flags & DCACHE_NOKEY_NAME)); + + mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc; /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { - dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - valid = 1; - } else if (dentry->d_inode && - ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry, + dentry, inode); valid = 1; - } else if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) { + } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { valid = 1; + } else { + valid = dentry_lease_is_valid(dentry, flags); + if (valid == -ECHILD) + return valid; + if (valid || dir_lease_is_valid(dir, dentry, mdsc)) { + if (inode) + valid = ceph_is_any_caps(inode); + else + valid = 1; + } } - dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) - ceph_dentry_lru_touch(dentry); - else - d_drop(dentry); - iput(dir); + if (!valid) { + struct ceph_mds_request *req; + int op, err; + u32 mask; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + percpu_counter_inc(&mdsc->metric.d_lease_mis); + + op = ceph_snap(dir) == CEPH_SNAPDIR ? + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); + if (!IS_ERR(req)) { + req->r_dentry = dget(dentry); + req->r_num_caps = 2; + req->r_parent = dir; + ihold(dir); + + req->r_dname = name; + + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + err = ceph_mdsc_do_request(mdsc, NULL, req); + switch (err) { + case 0: + if (d_really_is_positive(dentry) && + d_inode(dentry) == req->r_target_inode) + valid = 1; + break; + case -ENOENT: + if (d_really_is_negative(dentry)) + valid = 1; + fallthrough; + default: + break; + } + ceph_mdsc_put_request(req); + doutc(cl, "%p '%pd', lookup result=%d\n", dentry, + dentry, err); + } + } else { + percpu_counter_inc(&mdsc->metric.d_lease_hit); + } + + doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid"); + if (!valid) + ceph_dir_clear_complete(dir); return valid; } /* + * Delete unused dentry that doesn't have valid lease + * + * Called under dentry->d_lock. + */ +static int ceph_d_delete(const struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + /* won't release caps */ + if (d_really_is_negative(dentry)) + return 0; + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return 0; + /* valid lease? 
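(a dentry still covered by its own lease or by the parent directory's lease is kept) 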
*/ + di = ceph_dentry(dentry); + if (di) { + if (__dentry_lease_is_valid(di)) + return 0; + if (__dir_lease_try_check(dentry)) + return 0; + } + return 1; +} + +/* * Release our ceph_dentry_info. */ static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); - dout("d_release %p\n", dentry); - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - kmem_cache_free(ceph_dentry_cachep, di); + doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry); + + atomic64_dec(&fsc->mdsc->metric.total_dentries); + + spin_lock(&dentry->d_lock); + __dentry_lease_unlist(di); dentry->d_fsdata = NULL; -} + spin_unlock(&dentry->d_lock); -static int ceph_snapdir_d_revalidate(struct dentry *dentry, - unsigned int flags) -{ - /* - * Eventually, we'll want to revalidate snapped metadata - * too... probably... - */ - return 1; + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); } /* @@ -1071,21 +2092,39 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, */ static void ceph_d_prune(struct dentry *dentry) { - dout("ceph_d_prune %p\n", dentry); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_inode_info *dir_ci; + struct ceph_dentry_info *di; + + doutc(cl, "dentry %p '%pd'\n", dentry, dentry); /* do we have a valid parent? */ if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect dir's completeness */ - if (d_unhashed(dentry)) + /* we hold d_lock, so d_parent is stable */ + dir_ci = ceph_inode(d_inode(dentry->d_parent)); + if (dir_ci->i_vino.snap == CEPH_SNAPDIR) return; - /* - * we hold d_lock, so d_parent is stable, and d_fsdata is never - * cleared until d_release - */ - ceph_dir_clear_complete(dentry->d_parent->d_inode); + /* who calls d_delete() should also disable dcache readdir */ + if (d_really_is_negative(dentry)) + return; + + /* d_fsdata does not get cleared until d_release */ + if (!d_unhashed(dentry)) { + __ceph_dir_clear_complete(dir_ci); + return; + } + + /* Disable dcache readdir just in case that someone called d_drop() + * or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED + * properly (dcache readdir is still enabled) */ + di = ceph_dentry(dentry); + if (di->offset > 0 && + di->lease_shared_gen == atomic_read(&dir_ci->i_shared_gen)) + __ceph_dir_clear_ordered(dir_ci); } /* @@ -1095,21 +2134,21 @@ static void ceph_d_prune(struct dentry *dentry) static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct ceph_file_info *cf = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; const int bufsize = 1024; - if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT)) return -EISDIR; - if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_NOFS); - if (!cf->dir_info) + if (!dfi->dir_info) { + dfi->dir_info = kmalloc(bufsize, GFP_KERNEL); + if (!dfi->dir_info) return -ENOMEM; - cf->dir_info_len = - snprintf(cf->dir_info, bufsize, + dfi->dir_info_len = + snprintf(dfi->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" @@ -1117,7 +2156,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, " rfiles: %20lld\n" " 
rsubdirs: %20lld\n" "rbytes: %20lld\n" - "rctime: %10ld.%09ld\n", + "rctime: %ptSp\n", ci->i_files + ci->i_subdirs, ci->i_files, ci->i_subdirs, @@ -1125,124 +2164,20 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, ci->i_rfiles, ci->i_rsubdirs, ci->i_rbytes, - (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + &ci->i_rctime); } - if (*ppos >= cf->dir_info_len) + if (*ppos >= dfi->dir_info_len) return 0; - size = min_t(unsigned, size, cf->dir_info_len-*ppos); - left = copy_to_user(buf, cf->dir_info + *ppos, size); + size = min_t(unsigned, size, dfi->dir_info_len-*ppos); + left = copy_to_user(buf, dfi->dir_info + *ppos, size); if (left == size) return -EFAULT; *ppos += (size - left); return size - left; } -/* - * an fsync() on a dir will wait for any uncommitted directory - * operations to commit. - */ -static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - dout("dir_fsync %p\n", inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_entry(head->prev, - struct ceph_mds_request, r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("dir_fsync %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - if (req->r_timeout) { - ret = wait_for_completion_timeout( - &req->r_safe_completion, req->r_timeout); - if (ret > 0) - ret = 0; - else if (ret == 0) - ret = -EIO; /* timed out */ - } else { - wait_for_completion(&req->r_safe_completion); - } - ceph_mdsc_put_request(req); - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_entry(head->next, - struct ceph_mds_request, r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* - * We maintain a private dentry LRU. - * - * FIXME: this needs to be changed to a per-mds lru to be useful. 
- */ -void ceph_dentry_lru_add(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_add %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); -} - -void ceph_dentry_lru_touch(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, - dn->d_name.len, dn->d_name.name, di->offset); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); -} - -void ceph_dentry_lru_del(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_del %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); -} /* * Return name hash for a given dentry. This is dependent on @@ -1251,6 +2186,7 @@ void ceph_dentry_lru_del(struct dentry *dn) unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1258,19 +2194,33 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } +WRAP_DIR_ITER(ceph_readdir) // FIXME! 
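
[Note the new locking in ceph_dentry_hash() above: d_name can be swapped by a concurrent rename, so the name bytes are now sampled under d_lock. The resulting hash feeds directory-fragment selection; a sketch of that use, assuming the existing ceph_choose_frag() helper from fs/ceph/inode.c — the wrapper itself is hypothetical:

	// Map a dentry to the dirfrag that covers its name hash.
	static u32 dentry_frag(struct inode *dir, struct dentry *dn)
	{
		unsigned hash = ceph_dentry_hash(dir, dn);

		return ceph_choose_frag(ceph_inode(dir), hash, NULL, NULL);
	}
]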
const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, - .fsync = ceph_dir_fsync, + .compat_ioctl = compat_ptr_ioctl, + .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, +}; + +const struct file_operations ceph_snapdir_fops = { + .iterate_shared = shared_ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, }; const struct inode_operations ceph_dir_iops = { @@ -1278,10 +2228,9 @@ const struct inode_operations ceph_dir_iops = { .permission = ceph_permission, .getattr = ceph_getattr, .setattr = ceph_setattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, + .get_inode_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, @@ -1293,18 +2242,19 @@ const struct inode_operations ceph_dir_iops = { .atomic_open = ceph_atomic_open, }; -const struct dentry_operations ceph_dentry_ops = { - .d_revalidate = ceph_d_revalidate, - .d_release = ceph_d_release, - .d_prune = ceph_d_prune, -}; - -const struct dentry_operations ceph_snapdir_dentry_ops = { - .d_revalidate = ceph_snapdir_d_revalidate, - .d_release = ceph_d_release, +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, + .rename = ceph_rename, }; -const struct dentry_operations ceph_snap_dentry_ops = { +const struct dentry_operations ceph_dentry_ops = { + .d_revalidate = ceph_d_revalidate, + .d_delete = ceph_d_delete, .d_release = ceph_d_release, .d_prune = ceph_d_prune, + .d_init = ceph_d_init, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 16796be53ca5..b2f2af104679 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,28 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/exportfs.h> #include <linux/slab.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include "super.h" #include "mds_client.h" - -/* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ +#include "crypto.h" /* * Basic fh @@ -32,100 +17,145 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); /* - * The presence of @parent_inode here tells us whether NFS wants a - * connectable file handle. However, we want to make a connectionable - * file handle unconditionally so that the MDS gets as much of a hint - * as possible. 
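
[exportfs measures file handles in 32-bit words, which is what the CEPH_FH_* macros defined just below express. As a cross-check of the packed layouts — a sketch, assuming the basic handle holds a single u64 ino (its body is elided by the diff context); static_assert() is usable at file scope via <linux/build_bug.h>:

	static_assert(sizeof(struct ceph_nfs_fh)     == 2 * sizeof(u32));  // ino
	static_assert(sizeof(struct ceph_nfs_confh)  == 4 * sizeof(u32));  // ino + parent_ino
	static_assert(sizeof(struct ceph_nfs_snapfh) == 7 * sizeof(u32));  // + snapid, hash
]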
That means we only use @parent_dentry to indicate - * whether nfsd wants a connectable fh, and whether we should indicate - * failure from a too-small @max_len. + * fh for snapped inode */ +struct ceph_nfs_snapfh { + u64 ino; + u64 snapid; + u64 parent_ino; + u32 hash; +} __attribute__ ((packed)); + +#define BYTES_PER_U32 (sizeof(u32)) +#define CEPH_FH_BASIC_SIZE \ + (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32) +#define CEPH_FH_WITH_PARENT_SIZE \ + (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32) +#define CEPH_FH_SNAPPED_INODE_SIZE \ + (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32) + +static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE; + struct ceph_nfs_snapfh *sfh = (void *)rawfh; + u64 snapid = ceph_snap(inode); + int ret; + bool no_parent = true; + + if (*max_len < snap_handle_length) { + *max_len = snap_handle_length; + ret = FILEID_INVALID; + goto out; + } + + ret = -EINVAL; + if (snapid != CEPH_SNAPDIR) { + struct inode *dir; + struct dentry *dentry = d_find_alias(inode); + if (!dentry) + goto out; + + rcu_read_lock(); + dir = d_inode_rcu(dentry->d_parent); + if (ceph_snap(dir) != CEPH_SNAPDIR) { + sfh->parent_ino = ceph_ino(dir); + sfh->hash = ceph_dentry_hash(dir, dentry); + no_parent = false; + } + rcu_read_unlock(); + dput(dentry); + } + + if (no_parent) { + if (!S_ISDIR(inode->i_mode)) + goto out; + sfh->parent_ino = sfh->ino; + sfh->hash = 0; + } + sfh->ino = ceph_ino(inode); + sfh->snapid = snapid; + + *max_len = snap_handle_length; + ret = FILEID_BTRFS_WITH_PARENT; +out: + doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret); + return ret; +} + static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); + static const int handle_length = CEPH_FH_BASIC_SIZE; + static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE; int type; - struct ceph_nfs_fh *fh = (void *)rawfh; - struct ceph_nfs_confh *cfh = (void *)rawfh; - int connected_handle_length = sizeof(*cfh)/4; - int handle_length = sizeof(*fh)/4; - struct dentry *dentry; - struct dentry *parent; - /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) - return -EINVAL; + return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode); - dentry = d_find_alias(inode); + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } - /* if we found an alias, generate a connectable fh */ - if (*max_len >= connected_handle_length && dentry) { - dout("encode_fh %p connectable\n", dentry); - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + if (parent_inode) { + struct ceph_nfs_confh *cfh = (void *)rawfh; + doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode, + ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode)); cfh->ino = ceph_ino(inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, - dentry); + cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; - type = 2; - spin_unlock(&dentry->d_lock); - } else if (*max_len >= handle_length) { - if (parent_inode) { - /* nfsd wants connectable */ - *max_len = connected_handle_length; - type = FILEID_INVALID; - } else { - 
dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(inode); - *max_len = handle_length; - type = 1; - } + type = FILEID_INO32_GEN_PARENT; } else { + struct ceph_nfs_fh *fh = (void *)rawfh; + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); + fh->ino = ceph_ino(inode); *max_len = handle_length; - type = FILEID_INVALID; + type = FILEID_INO32_GEN; } - if (dentry) - dput(dentry); return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh, int fh_len) +static struct inode *__lookup_inode(struct super_block *sb, u64 ino) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; struct inode *inode; - struct dentry *dentry; struct ceph_vino vino; int err; - if (fh_len < sizeof(*fh) / 4) + vino.ino = ino; + vino.snap = CEPH_NOSNAP; + + if (ceph_vino_is_reserved(vino)) return ERR_PTR(-ESTALE); - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; - vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { struct ceph_mds_request *req; + int mask; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, USE_ANY_MDS); if (IS_ERR(req)) return ERR_CAST(req); + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + req->r_ino1 = vino; req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -134,144 +164,455 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, ihold(inode); ceph_mdsc_put_request(req); if (!inode) + return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); + } else { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); return ERR_PTR(-ESTALE); + } } + return inode; +} - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); +struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + if (inode->i_nlink == 0) { iput(inode); - return dentry; + return ERR_PTR(-ESTALE); } - err = ceph_init_dentry(dentry); - if (err < 0) { + return inode; +} + +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) +{ + struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); + int err; + + if (IS_ERR(inode)) + return ERR_CAST(inode); + /* We need LINK caps to reliably check i_nlink */ + err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); + if (err) { iput(inode); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); - return dentry; + /* -ESTALE if inode as been unlinked and no file is open */ + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); } -/* - * convert connectable fh to dentry - */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh, int fh_len) +static struct dentry *__snapfh_to_dentry(struct super_block *sb, + struct ceph_nfs_snapfh *sfh, + bool want_parent) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_request *req; struct inode *inode; - struct dentry *dentry; struct ceph_vino 
vino; + int mask; int err; + bool unlinked = false; - if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); + if (want_parent) { + vino.ino = sfh->parent_ino; + if (sfh->snapid == CEPH_SNAPDIR) + vino.snap = CEPH_NOSNAP; + else if (sfh->ino == sfh->parent_ino) + vino.snap = CEPH_SNAPDIR; + else + vino.snap = sfh->snapid; + } else { + vino.ino = sfh->ino; + vino.snap = sfh->snapid; + } - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; + if (inode) { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); + return ERR_PTR(-ESTALE); + } + return d_obtain_alias(inode); + } - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - inode = req->r_target_inode; - if (inode) + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.lookupino.mask = cpu_to_le32(mask); + if (vino.snap < CEPH_NOSNAP) { + req->r_args.lookupino.snapid = cpu_to_le64(vino.snap); + if (!want_parent && sfh->ino != sfh->parent_ino) { + req->r_args.lookupino.parent = + cpu_to_le64(sfh->parent_ino); + req->r_args.lookupino.hash = + cpu_to_le32(sfh->hash); + } + } + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) { + if (vino.snap == CEPH_SNAPDIR) { + if (inode->i_nlink == 0) + unlinked = true; + inode = ceph_get_snapdir(inode); + } else if (ceph_snap(inode) == vino.snap) { ihold(inode); - ceph_mdsc_put_request(req); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + } else { + /* mds does not support lookup snapped inode */ + inode = ERR_PTR(-EOPNOTSUPP); + } + } else { + inode = ERR_PTR(-ESTALE); } + ceph_mdsc_put_request(req); - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; + if (want_parent) { + doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err); + } else { + doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, + vino.snap, sfh->parent_ino, sfh->hash, err); } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); + /* see comments in ceph_get_parent() */ + return unlinked ? 
d_obtain_root(inode) : d_obtain_alias(inode); +} + +/* + * convert regular fh to dentry + */ +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, false); + } + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / BYTES_PER_U32) + return NULL; + + doutc(fsc->client, "%llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; + struct ceph_mds_request *req; + struct inode *inode; + int mask; + int err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + if (child) { + req->r_inode = d_inode(child); + ihold(d_inode(child)); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; + } + + mask = CEPH_STAT_CAP_INODE; + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.getattr.mask = cpu_to_le32(mask); + + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err) { + ceph_mdsc_put_request(req); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); + + return d_obtain_alias(inode); } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, - fh_len); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, - fh_len); + struct inode *inode = d_inode(child); + struct ceph_client *cl = ceph_inode_to_client(inode); + struct dentry *dn; + + if (ceph_snap(inode) != CEPH_NOSNAP) { + struct inode* dir; + bool unlinked = false; + /* do not support non-directory */ + if (!d_is_dir(child)) { + dn = ERR_PTR(-EINVAL); + goto out; + } + dir = __lookup_inode(inode->i_sb, ceph_ino(inode)); + if (IS_ERR(dir)) { + dn = ERR_CAST(dir); + goto out; + } + /* There can be multiple paths to access snapped inode. + * For simplicity, treat snapdir of head inode as parent */ + if (ceph_snap(inode) != CEPH_SNAPDIR) { + struct inode *snapdir = ceph_get_snapdir(dir); + if (dir->i_nlink == 0) + unlinked = true; + iput(dir); + if (IS_ERR(snapdir)) { + dn = ERR_CAST(snapdir); + goto out; + } + dir = snapdir; + } + /* If directory has already been deleted, further get_parent + * will fail. Do not mark snapdir dentry as disconnected, + * this prevents exportfs from doing further get_parent. */ + if (unlinked) + dn = d_obtain_root(dir); + else + dn = d_obtain_alias(dir); + } else { + dn = __get_parent(child->d_sb, child, 0); + } +out: + doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode, + ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn)); + return dn; } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. 
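
[ceph_get_parent() above backs the export_operations ->get_parent hook with a LOOKUPPARENT RPC (or, for snapped inodes, the .snap dir of the head inode). Conceptually, exportfs keeps calling it to walk a disconnected dentry up to a connected ancestor; the real reconnection logic lives in fs/exportfs/expfs.c, but the idea is roughly:

	// Rough sketch only -- ignores the locking, ->get_name and
	// lookup steps the real exportfs reconnection path performs.
	static struct dentry *walk_up(struct dentry *dn)
	{
		while (dn->d_flags & DCACHE_DISCONNECTED) {
			struct dentry *parent = ceph_export_ops.get_parent(dn);

			dput(dn);
			if (IS_ERR(parent))
				return parent;
			dn = parent;
		}
		return dn;
	}
]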
+ * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); - if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + struct ceph_nfs_snapfh *sfh = (void *)fid->raw; + return __snapfh_to_dentry(sb, sfh, true); + } - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*cfh) / BYTES_PER_U32) + return NULL; - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); + doutc(fsc->client, "%llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (unlikely(dentry == ERR_PTR(-ENOENT))) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; +static int __get_snap_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_request *req = NULL; + char *last_name = NULL; + unsigned next_offset = 2; + int err = -EINVAL; + + if (ceph_ino(inode) != ceph_ino(dir)) + goto out; + if (ceph_snap(inode) == CEPH_SNAPDIR) { + if (ceph_snap(dir) == CEPH_NOSNAP) { + /* + * .get_name() from struct export_operations + * assumes that its 'name' parameter is pointing + * to a NAME_MAX+1 sized buffer + */ + strscpy(name, fsc->mount_options->snapdir_name, + NAME_MAX + 1); + err = 0; + } + goto out; } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + if (ceph_snap(dir) != CEPH_SNAPDIR) + goto out; + + while (1) { + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_dir_entry *rde; + int i; + + req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP, + USE_AUTH_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) + goto out; + + req->r_direct_mode = USE_AUTH_MDS; + req->r_readdir_offset = next_offset; + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); + if (last_name) { + req->r_path2 = last_name; + last_name = NULL; + } + + req->r_inode = dir; + ihold(dir); + req->r_dentry = dget(parent); + + inode_lock(dir); + err = ceph_mdsc_do_request(fsc->mdsc, NULL, req); + inode_unlock(dir); + + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } + + if (rinfo->dir_end) + break; + + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } + + ceph_mdsc_put_request(req); + req = NULL; } - dout("fh_to_parent %llx %p dentry 
%p\n", cfh->ino, inode, dentry); - return dentry; + err = -ENOENT; +out: + if (req) + ceph_mdsc_put_request(req); + kfree(last_name); + doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child, + inode, ceph_vinop(inode), err); + return err; +} + +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); + struct ceph_mds_reply_info_parsed *rinfo; + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return __get_snap_name(parent, name, child); + + mdsc = ceph_inode_to_fs_client(inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + inode_lock(dir); + req->r_inode = inode; + ihold(inode); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode_unlock(dir); + + if (err) + goto out; + + rinfo = &req->r_reply_info; + if (!IS_ENCRYPTED(dir)) { + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + } else { + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; + + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) + goto out; + + err = ceph_fname_to_usr(&fname, NULL, &oname, NULL); + if (!err) { + memcpy(name, oname.name, oname.len); + name[oname.len] = 0; + } + ceph_fname_free_buffer(dir, &oname); + } +out: + doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n", + child, inode, ceph_vinop(inode), err, err ? "" : "name ", + err ? 
"" : name); + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2ddf061c1c4a..983390069f73 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,4 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/striper.h> #include <linux/module.h> #include <linux/sched.h> @@ -7,10 +9,51 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> -#include <linux/aio.h> +#include <linux/falloc.h> +#include <linux/iversion.h> +#include <linux/ktime.h> +#include <linux/splice.h> #include "super.h" #include "mds_client.h" +#include "cache.h" +#include "io.h" +#include "metric.h" + +static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags) +{ + struct ceph_client *cl = mdsc->fsc->client; + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + + flags &= ~O_ACCMODE; + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + doutc(cl, "unused open flags: %x\n", flags); + + return cpu_to_le32(wire_flags); +} /* * Ceph file operations @@ -33,6 +76,101 @@ * need to wait for MDS acknowledgement. */ +/* + * How many pages to get in one call to iov_iter_get_pages(). This + * determines the size of the on-stack array used as a buffer. + */ +#define ITER_GET_BVECS_PAGES 64 + +static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, + struct bio_vec *bvecs) +{ + size_t size = 0; + int bvec_idx = 0; + + if (maxsize > iov_iter_count(iter)) + maxsize = iov_iter_count(iter); + + while (size < maxsize) { + struct page *pages[ITER_GET_BVECS_PAGES]; + ssize_t bytes; + size_t start; + int idx = 0; + + bytes = iov_iter_get_pages2(iter, pages, maxsize - size, + ITER_GET_BVECS_PAGES, &start); + if (bytes < 0) + return size ?: bytes; + + size += bytes; + + for ( ; bytes; idx++, bvec_idx++) { + int len = min_t(int, bytes, PAGE_SIZE - start); + + bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start); + bytes -= len; + start = 0; + } + } + + return size; +} + +/* + * iov_iter_get_pages() only considers one iov_iter segment, no matter + * what maxsize or maxpages are given. For ITER_BVEC that is a single + * page. + * + * Attempt to get up to @maxsize bytes worth of pages from @iter. + * Return the number of bytes in the created bio_vec array, or an error. + */ +static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, + struct bio_vec **bvecs, int *num_bvecs) +{ + struct bio_vec *bv; + size_t orig_count = iov_iter_count(iter); + ssize_t bytes; + int npages; + + iov_iter_truncate(iter, maxsize); + npages = iov_iter_npages(iter, INT_MAX); + iov_iter_reexpand(iter, orig_count); + + /* + * __iter_get_bvecs() may populate only part of the array -- zero it + * out. + */ + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); + if (!bv) + return -ENOMEM; + + bytes = __iter_get_bvecs(iter, maxsize, bv); + if (bytes < 0) { + /* + * No pages were pinned -- just free the array. 
+ */ + kvfree(bv); + return bytes; + } + + *bvecs = bv; + *num_bvecs = npages; + return bytes; +} + +static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) +{ + int i; + + for (i = 0; i < num_bvecs; i++) { + if (bvecs[i].bv_page) { + if (should_dirty) + set_page_dirty_lock(bvecs[i].bv_page); + put_page(bvecs[i].bv_page); + } + } + kvfree(bvecs); +} /* * Prepare an open request. Preallocate ceph_cap to avoid an @@ -41,8 +179,7 @@ static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -54,51 +191,101 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; } +static int ceph_init_file_info(struct inode *inode, struct file *file, + int fmode, bool isdir) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_file_info *fi; + int ret; + + doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode), + file, inode->i_mode, isdir ? "dir" : "regular"); + BUG_ON(inode->i_fop->release != ceph_release); + + if (isdir) { + struct ceph_dir_file_info *dfi = + kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); + if (!dfi) + return -ENOMEM; + + file->private_data = dfi; + fi = &dfi->file_info; + dfi->next_offset = 2; + dfi->readdir_cache_idx = -1; + } else { + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); + if (!fi) + return -ENOMEM; + + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + + file->private_data = fi; + } + + ceph_get_fmode(ci, fmode, 1); + fi->fmode = fmode; + + spin_lock_init(&fi->rw_contexts_lock); + INIT_LIST_HEAD(&fi->rw_contexts); + fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen); + + if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { + ret = ceph_uninline_data(file); + if (ret < 0) + goto error; + } + + return 0; + +error: + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return ret; +} + /* * initialize private struct file data. 
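
[iter_get_bvecs_alloc() and put_bvecs() above bracket the direct-I/O paths later in this file: pin the user pages behind an iov_iter, describe them as bio_vecs for the OSD request, then unpin (and possibly dirty) them. A usage sketch for a direct read, with names and error handling abbreviated:

	struct bio_vec *bvecs;
	int num_bvecs;
	ssize_t bytes;

	bytes = iter_get_bvecs_alloc(iter, len, &bvecs, &num_bvecs);
	if (bytes < 0)
		return bytes;
	// ... attach bvecs to an OSD read request and wait for it ...
	put_bvecs(bvecs, num_bvecs, true);	// dirty: data was read into the pages
]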
* if we fail, clean up by dropping fmode reference on the ceph_inode */ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { - struct ceph_file_info *cf; + struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; switch (inode->i_mode & S_IFMT) { case S_IFREG: + ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE); + fallthrough; case S_IFDIR: - dout("init_file %p %p 0%o (regular)\n", inode, file, - inode->i_mode); - cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); - if (cf == NULL) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ - return -ENOMEM; - } - cf->fmode = fmode; - cf->next_offset = 2; - file->private_data = cf; - BUG_ON(inode->i_fop->release != ceph_release); + ret = ceph_init_file_info(inode, file, fmode, + S_ISDIR(inode->i_mode)); break; case S_IFLNK: - dout("init_file %p %p 0%o (symlink)\n", inode, file, - inode->i_mode); - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode, + ceph_vinop(inode), file, inode->i_mode); break; default: - dout("init_file %p %p 0%o (special)\n", inode, file, - inode->i_mode); + doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode, + ceph_vinop(inode), file, inode->i_mode); /* * we need to drop the open ref now, since we don't * have .release set to ceph_release. */ - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ BUG_ON(inode->i_fop->release == ceph_release); /* call the proper open fop */ @@ -108,6 +295,62 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode, int fmode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + __ceph_touch_fmode(ci, mdsc, fmode); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n", + inode, ceph_vinop(inode), ceph_cap_string(wanted), + ceph_cap_string(issued)); + ceph_check_caps(ci, 0); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode), + err); + return err < 0 ? err : 0; +} + +/* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). 
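
[ceph_renew_caps() above runs the usual open-time mapping in reverse, deriving O_* flags from the caps the inode still wants. The forward direction, used by ceph_open() below, is the pair of helpers from net/ceph/ceph_fs.c (the exact cap set per mode is defined there):

	int fmode  = ceph_flags_to_mode(flags);   // e.g. O_RDWR -> CEPH_FILE_MODE_RDWR
	int wanted = ceph_caps_for_mode(fmode);   // caps needed to honour that mode
]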
We do, however, need to inform the MDS (asynchronously) @@ -116,38 +359,70 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; - struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = NULL; + struct ceph_file_info *fi = file->private_data; int err; int flags, fmode, wanted; + struct dentry *dentry; + char *path; + bool do_sync = false; + int mask = MAY_READ; - if (cf) { - dout("open file %p is already opened\n", file); + if (fi) { + doutc(cl, "file %p is already opened\n", file); return 0; } /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ flags = file->f_flags & ~(O_CREAT|O_EXCL); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { flags = O_DIRECTORY; /* mds likes to know */ + } else if (S_ISREG(inode->i_mode)) { + err = fscrypt_file_open(inode, file); + if (err) + return err; + } - dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, - ceph_vinop(inode), file, flags, file->f_flags); + doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); fmode = ceph_flags_to_mode(flags); wanted = ceph_caps_for_mode(fmode); + if (fmode & CEPH_FILE_MODE_WR) + mask |= MAY_WRITE; + dentry = d_find_alias(inode); + if (!dentry) { + do_sync = true; + } else { + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); + if (IS_ERR(path)) { + do_sync = true; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, mask); + } + ceph_mdsc_free_path_info(&path_info); + dput(dentry); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + do_sync = true; + err = 0; + } + } + /* snapped files are read-only */ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) return -EROFS; /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&ci->i_ceph_lock); - __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -157,33 +432,34 @@ int ceph_open(struct inode *inode, struct file *file) * asynchronously. */ spin_lock(&ci->i_ceph_lock); - if (__ceph_is_any_real_caps(ci) && + if (!do_sync && __ceph_is_any_real_caps(ci) && (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { - int mds_wanted = __ceph_caps_mds_wanted(ci); + int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); - dout("open %p fmode %d want %s issued %s using existing\n", - inode, fmode, ceph_cap_string(wanted), - ceph_cap_string(issued)); - __ceph_get_fmode(ci, fmode); + doutc(cl, "open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); /* adjust wanted? 
*/ if ((issued & wanted) != wanted && (mds_wanted & wanted) != wanted && ceph_snap(inode) != CEPH_SNAPDIR) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); return ceph_init_file(inode, file, fmode); - } else if (ceph_snap(inode) != CEPH_NOSNAP && + } else if (!do_sync && ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { - __ceph_get_fmode(ci, fmode); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } + spin_unlock(&ci->i_ceph_lock); - dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -191,67 +467,486 @@ int ceph_open(struct inode *inode, struct file *file) } req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; - if (flags & (O_CREAT|O_TRUNC)) - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); - dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); + doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); out: return err; } +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ +static void +cache_file_layout(struct inode *dst, struct inode *src) +{ + struct ceph_inode_info *cdst = ceph_inode(dst); + struct ceph_inode_info *csrc = ceph_inode(src); + + spin_lock(&cdst->i_ceph_lock); + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && + !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { + memcpy(&cdst->i_cached_layout, &csrc->i_layout, + sizeof(cdst->i_cached_layout)); + rcu_assign_pointer(cdst->i_cached_layout.pool_ns, + ceph_try_get_string(csrc->i_layout.pool_ns)); + } + spin_unlock(&cdst->i_ceph_lock); +} + +/* + * Try to set up an async create. We need caps, a file layout, and inode number, + * and either a lease on the dentry or complete dir info. If any of those + * criteria are not satisfied, then return false and the caller can go + * synchronous. + */ +static int try_prep_async_create(struct inode *dir, struct dentry *dentry, + struct ceph_file_layout *lo, u64 *pino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; + u64 ino; + + spin_lock(&ci->i_ceph_lock); + /* No auth cap means no chance for Dc caps */ + if (!ci->i_auth_cap) + goto no_async; + + /* Any delegated inos? 
*/ + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) + goto no_async; + + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) + goto no_async; + + if ((__ceph_caps_issued(ci, NULL) & want) != want) + goto no_async; + + if (d_in_lookup(dentry)) { + if (!__ceph_dir_is_complete(ci)) + goto no_async; + spin_lock(&dentry->d_lock); + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&dentry->d_lock); + } else if (atomic_read(&ci->i_shared_gen) != + READ_ONCE(di->lease_shared_gen)) { + goto no_async; + } + + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); + if (!ino) + goto no_async; + + *pino = ino; + ceph_take_cap_refs(ci, want, false); + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); + rcu_assign_pointer(lo->pool_ns, + ceph_try_get_string(ci->i_cached_layout.pool_ns)); + got = want; +no_async: + spin_unlock(&ci->i_ceph_lock); + return got; +} + +static void restore_deleg_ino(struct inode *dir, u64 ino) +{ + struct ceph_client *cl = ceph_inode_to_client(dir); + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_session *s = NULL; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + s = ceph_get_mds_session(ci->i_auth_cap->session); + spin_unlock(&ci->i_ceph_lock); + if (s) { + int err = ceph_restore_deleg_ino(s, ino); + if (err) + pr_warn_client(cl, + "unable to restore delegated ino 0x%llx to session: %d\n", + ino, err); + ceph_put_mds_session(s); + } +} + +static void wake_async_create_waiters(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool check_cap = false; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + clear_and_wake_up_bit(CEPH_ASYNC_CREATE_BIT, &ci->i_ceph_flags); + + if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; + check_cap = true; + } + } + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); + + if (check_cap) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); +} + +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct dentry *dentry = req->r_dentry; + struct inode *dinode = d_inode(dentry); + struct inode *tinode = req->r_target_inode; + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + WARN_ON_ONCE(dinode && tinode && dinode != tinode); + + /* MDS changed -- caller must resubmit */ + if (result == -EJUKEBOX) + goto out; + + mapping_set_error(req->r_parent->i_mapping, result); + + if (result) { + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); + + pr_warn_client(cl, + "async create failure path=(%llx)%s result=%d!\n", + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path_info(&path_info); + + ceph_dir_clear_complete(req->r_parent); + if (!d_unhashed(dentry)) + d_drop(dentry); + + if (dinode) { + mapping_set_error(dinode->i_mapping, result); + ceph_inode_shutdown(dinode); + wake_async_create_waiters(dinode, req->r_session); + } + } + + if (tinode) { + u64 ino = ceph_vino(tinode).ino; + + if (req->r_deleg_ino != ino) + pr_warn_client(cl, + "inode number mismatch! 
err=%d deleg_ino=0x%llx target=0x%llx\n", + req->r_err, req->r_deleg_ino, ino); + + mapping_set_error(tinode->i_mapping, result); + wake_async_create_waiters(tinode, req->r_session); + } else if (!result) { + pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n", + req->r_deleg_ino); + } +out: + ceph_mdsc_release_dir_caps(req); +} + +static int ceph_finish_async_create(struct inode *dir, struct inode *inode, + struct dentry *dentry, + struct file *file, umode_t mode, + struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx, + struct ceph_file_layout *lo) +{ + int ret; + char xattr_buf[4]; + struct ceph_mds_reply_inode in = { }; + struct ceph_mds_reply_info_in iinfo = { .in = &in }; + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct timespec64 now; + struct ceph_string *pool_ns; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + + ktime_get_real_ts64(&now); + + iinfo.inline_version = CEPH_INLINE_NONE; + iinfo.change_attr = 1; + ceph_encode_timespec64(&iinfo.btime, &now); + + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } + + in.ino = cpu_to_le64(vino.ino); + in.snapid = cpu_to_le64(CEPH_NOSNAP); + in.version = cpu_to_le64(1); // ??? + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); + in.cap.cap_id = cpu_to_le64(1); + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); + in.cap.flags = CEPH_CAP_FLAG_AUTH; + in.ctime = in.mtime = in.atime = iinfo.btime; + in.truncate_seq = cpu_to_le32(1); + in.truncate_size = cpu_to_le64(-1ULL); + in.xattr_version = cpu_to_le64(1); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, + mapped_fsuid(req->r_mnt_idmap, + &init_user_ns))); + if (dir->i_mode & S_ISGID) { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); + + /* Directories always inherit the setgid bit. */ + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else { + in.gid = cpu_to_le32(from_kgid(&init_user_ns, + mapped_fsgid(req->r_mnt_idmap, + &init_user_ns))); + } + in.mode = cpu_to_le32((u32)mode); + + in.nlink = cpu_to_le32(1); + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); + /* lo is private, so pool_ns can't change */ + pool_ns = rcu_dereference_raw(lo->pool_ns); + if (pool_ns) { + iinfo.pool_ns_len = pool_ns->len; + iinfo.pool_ns_data = pool_ns->str; + } + + down_read(&mdsc->snap_rwsem); + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, + req->r_fmode, NULL); + up_read(&mdsc->snap_rwsem); + if (ret) { + doutc(cl, "failed to fill inode: %d\n", ret); + ceph_dir_clear_complete(dir); + if (!d_unhashed(dentry)) + d_drop(dentry); + discard_new_inode(inode); + } else { + struct dentry *dn; + + doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n", + vino.ino, ceph_ino(dir), dentry->d_name.name); + ceph_dir_clear_ordered(dir); + ceph_init_inode_acls(inode, as_ctx); + if (inode_state_read_once(inode) & I_NEW) { + /* + * If it's not I_NEW, then someone created this before + * we got here. Assume the server is aware of it at + * that point and don't worry about setting + * CEPH_I_ASYNC_CREATE. 
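
[Stepping back, the heart of ceph_finish_async_create() is that it fabricates the ceph_mds_reply_inode an ordinary synchronous create would have received from the MDS, so the regular reply-fill path can run unchanged. Reduced to its essentials — values follow the code above, not the complete field set:

	struct ceph_mds_reply_inode in = { };
	struct ceph_mds_reply_info_in iinfo = { .in = &in };

	in.ino = cpu_to_le64(req->r_deleg_ino);        // ino pre-delegated by the MDS
	in.cap.caps = cpu_to_le32(CEPH_CAP_ALL_FILE);  // as if the MDS granted all file caps
	ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
			      req->r_fmode, NULL);     // normal reply-fill path
]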
+ */ + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; + unlock_new_inode(inode); + } + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { + if (!d_unhashed(dentry)) + d_drop(dentry); + dn = d_splice_alias(inode, dentry); + WARN_ON_ONCE(dn && dn != dentry); + } + file->f_mode |= FMODE_CREATED; + ret = finish_open(file, dentry, ceph_open); + } + + spin_lock(&dentry->d_lock); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_CREATE_BIT, &di->flags); + spin_unlock(&dentry->d_lock); + + return ret; +} /* * Do a lookup + open with a single request. If we get a non-existent * file or symlink, return 1 so the VFS can retry. */ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened) + struct file *file, unsigned flags, umode_t mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct inode *new_inode = NULL; struct dentry *dn; + struct ceph_acl_sec_ctx as_ctx = {}; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); + int mask; int err; + char *path; - dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name, - d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", + dir, ceph_vinop(dir), dentry, dentry, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; - err = ceph_init_dentry(dentry); - if (err < 0) + err = ceph_wait_on_conflict_unlink(dentry); + if (err) return err; + /* + * Do not truncate the file, since atomic_open is called before the + * permission check. The caller will do the truncation afterward. 
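
[Just below, ceph_atomic_open() repeats the MDS access pre-check already seen in ceph_open(): build a path for an alias of the directory, ask ceph_mds_check_access(), and degrade gracefully. The resulting decision table, in sketch form:

	err = ceph_mds_check_access(mdsc, path, mask);
	if (err == -EACCES)
		return err;		// hard denial: fail the open immediately
	if (err < 0)
		try_async = false;	// soft failure: fall back to a synchronous open
	// err == 0: the open (and async create) may proceed
]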
+ */ + flags &= ~O_TRUNC; + + dn = d_find_alias(dir); + if (!dn) { + try_async = false; + } else { + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); + if (IS_ERR(path)) { + try_async = false; + err = 0; + } else { + int fmode = ceph_flags_to_mode(flags); + + mask = MAY_READ; + if (fmode & CEPH_FILE_MODE_WR) + mask |= MAY_WRITE; + err = ceph_mds_check_access(mdsc, path, mask); + } + ceph_mdsc_free_path_info(&path_info); + dput(dn); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + try_async = false; + err = 0; + } + } + +retry: + if (flags & O_CREAT) { + if (ceph_quota_is_max_files_exceeded(dir)) + return -EDQUOT; + + new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); + goto out_ctx; + } + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; + } else if (!d_in_lookup(dentry)) { + /* If it's not being looked up, it's negative */ + return -ENOENT; + } /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_ctx; + } req->r_dentry = dget(dentry); req->r_num_caps = 2; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_parent = dir; + if (req->r_op == CEPH_MDS_OP_CREATE) + req->r_mnt_idmap = mnt_idmap_get(idmap); + ihold(dir); + if (IS_ENCRYPTED(dir)) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + goto out_req; + } + if (flags & O_CREAT) { - req->r_dentry_drop = CEPH_CAP_FILE_SHARED; + struct ceph_file_layout lo; + + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + + ceph_as_ctx_to_req(req, &as_ctx); + + if (try_async && (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + struct ceph_dentry_info *di = ceph_dentry(dentry); + + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); + req->r_callback = ceph_async_create_cb; + + /* Hash inode before RPC */ + new_inode = ceph_get_inode(dir->i_sb, vino, new_inode); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); + new_inode = NULL; + goto out_req; + } + WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW)); + + spin_lock(&dentry->d_lock); + di->flags |= CEPH_DENTRY_ASYNC_CREATE; + spin_unlock(&dentry->d_lock); + + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + err = ceph_finish_async_create(dir, new_inode, + dentry, file, + mode, req, + &as_ctx, &lo); + new_inode = NULL; + } else if (err == -EJUKEBOX) { + restore_deleg_ino(dir, req->r_deleg_ino); + ceph_mdsc_put_request(req); + discard_new_inode(new_inode); + ceph_release_acl_sec_ctx(&as_ctx); + memset(&as_ctx, 0, sizeof(as_ctx)); + new_inode = NULL; + try_async = false; + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto retry; + } + ceph_put_string(rcu_dereference_raw(lo.pool_ns)); + goto out_req; + } } - req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, - (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, - req); - if (err) - goto out_err; - err = ceph_handle_snapdir(req, dentry, err); - if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_new_inode = new_inode; + new_inode = NULL; + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); + if (err == -ENOENT) { + dentry = ceph_handle_snapdir(req, dentry); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_req; + } + err = 0; + } + + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (d_unhashed(dentry)) { + if (d_in_lookup(dentry)) { dn = ceph_finish_lookup(req, dentry, err); if (IS_ERR(dn)) err = PTR_ERR(dn); @@ -260,365 +955,1140 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, dn = NULL; } if (err) - goto out_err; - if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + goto out_req; + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ - dout("atomic_open finish_no_open on dn %p\n", dn); + doutc(cl, "finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { - dout("atomic_open finish_open on dn %p\n", dn); + if (IS_ENCRYPTED(dir) && + !fscrypt_has_permitted_context(dir, d_inode(dentry))) { + pr_warn_client(cl, + "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n", + ceph_vinop(dir), ceph_vinop(d_inode(dentry))); + goto out_req; + } + + doutc(cl, "finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - *opened |= FILE_CREATED; + struct inode *newino = d_inode(dentry); + + cache_file_layout(dir, newino); + ceph_init_inode_acls(newino, &as_ctx); + file->f_mode |= FMODE_CREATED; } - err = finish_open(file, dentry, ceph_open, opened); + err = finish_open(file, dentry, ceph_open); } - -out_err: +out_req: ceph_mdsc_put_request(req); - dout("atomic_open result=%d\n", err); + iput(new_inode); +out_ctx: + ceph_release_acl_sec_ctx(&as_ctx); + doutc(cl, "result=%d\n", err); return err; } int ceph_release(struct inode *inode, struct file *file) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *cf = file->private_data; - dout("release inode %p file %p\n", inode, file); - ceph_put_fmode(ci, cf->fmode); - if (cf->last_readdir) - ceph_mdsc_put_request(cf->last_readdir); - kfree(cf->last_name); - kfree(cf->dir_info); - dput(cf->dentry); - kmem_cache_free(ceph_file_cachep, cf); + if (S_ISDIR(inode->i_mode)) { + struct ceph_dir_file_info *dfi = file->private_data; + doutc(cl, "%p %llx.%llx dir file %p\n", inode, + ceph_vinop(inode), file); + WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); + + ceph_put_fmode(ci, dfi->file_info.fmode, 1); + + if (dfi->last_readdir) + ceph_mdsc_put_request(dfi->last_readdir); + kfree(dfi->last_name); + kfree(dfi->dir_info); + kmem_cache_free(ceph_dir_file_cachep, dfi); + } else { + struct ceph_file_info *fi = file->private_data; + doutc(cl, "%p %llx.%llx regular file %p\n", inode, + ceph_vinop(inode), file); + WARN_ON(!list_empty(&fi->rw_contexts)); + + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + + kmem_cache_free(ceph_file_cachep, fi); + } /* wake up anyone waiting for caps on this inode */ wake_up_all(&ci->i_cap_wq); return 0; } +enum { + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, +}; + /* - * Read a 
range of bytes striped over one or more objects. Iterate over - * objects we stripe over. (That's not atomic, but good enough for now.) + * Completely synchronous read and write methods. Direct from __user + * buffer to osd, or directly to user pages (if O_DIRECT). + * + * If the read spans object boundary, just do multiple reads. (That's not + * atomic, but good enough for now.) * * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ -static int striped_read(struct inode *inode, - u64 off, u64 len, - struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) +ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver) { - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; - int io_align, page_align; - int left, pages_left; - int read; - struct page **page_pos; - int ret; - bool hit_stripe, was_short; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_osd_client *osdc = &fsc->client->osdc; + ssize_t ret; + u64 off = *ki_pos; + u64 len = iov_iter_count(to); + u64 i_size = i_size_read(inode); + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 objver = 0; + + doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode, + ceph_vinop(inode), *ki_pos, len); + if (ceph_inode_is_shutdown(inode)) + return -EIO; + + if (!len || !i_size) + return 0; /* - * we may need to do multiple reads. not atomic, unfortunately. + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. */ - pos = off; - left = len; - page_pos = pages; - pages_left = num_pages; - read = 0; - io_align = off & ~PAGE_MASK; - -more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; - this_len = left; - ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), - &ci->i_layout, pos, &this_len, - ci->i_truncate_seq, - ci->i_truncate_size, - page_pos, pages_left, page_align); - if (ret == -ENOENT) - ret = 0; - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, - ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + ret = filemap_write_and_wait_range(inode->i_mapping, + off, off + len - 1); + if (ret < 0) + return ret; - if (ret > 0) { - int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; + ret = 0; + while ((len = iov_iter_count(to)) > 0) { + struct ceph_osd_request *req; + struct page **pages; + int num_pages; + size_t page_off; + bool more; + int idx = 0; + size_t left; + struct ceph_osd_req_op *op; + u64 read_off = off; + u64 read_len = len; + int extent_cnt; + + /* determine new offset/length if encrypted */ + ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); + + doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len, + read_off, read_len); + + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, read_off, &read_len, 0, 1, + sparse ? 
CEPH_OSD_OP_SPARSE_READ : + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_align + read, - pos - off - read, pages); + /* adjust len downward if the request truncated the len */ + if (off + len > read_off + read_len) + len = read_off + read_len - off; + more = len < iov_iter_count(to); + + op = &req->r_ops[0]; + if (sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, read_len); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } } - pos += ret; - read = pos - off; - left -= ret; - page_pos += didpages; - pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) - goto more; - } + num_pages = calc_pages_for(read_off, read_len); + page_off = offset_in_page(off); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, + offset_in_page(read_off), + false, true); + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + read_len, ret); + + if (ret > 0) + objver = req->r_version; + + i_size = i_size_read(inode); + doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len, + ret, i_size, (more ? " MORE" : "")); + + /* Fix it to go to end of extent map */ + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) + ret = 0; + + if (ret < 0) { + ceph_osdc_put_request(req); + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; + break; + } + + if (IS_ENCRYPTED(inode)) { + int fret; + + fret = ceph_fscrypt_decrypt_extents(inode, pages, + read_off, op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (fret < 0) { + ret = fret; + ceph_osdc_put_request(req); + break; + } + + /* account for any partial block at the beginning */ + fret -= (off - read_off); + + /* + * Short read after big offset adjustment? + * Nothing is usable, just call it a zero + * len read. + */ + fret = max(fret, 0); + + /* account for partial block at the end */ + ret = min_t(ssize_t, fret, len); + } - if (was_short) { - /* did we bounce off eof? */ - if (pos + left > inode->i_size) - *checkeof = 1; + /* Short read but not EOF? Zero out the remainder. */ + if (ret < len && (off + ret < i_size)) { + int zlen = min(len - ret, i_size - off - ret); + int zoff = page_off + ret; - /* zero trailing bytes (inside i_size) */ - if (left > 0 && pos < inode->i_size) { - if (pos + left > inode->i_size) - left = inode->i_size - pos; + doutc(cl, "zero gap %llu~%llu\n", off + ret, + off + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; + } - dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_align + read, left, - pages); - read += left; + if (off + ret > i_size) + left = (i_size > off) ? 
i_size - off : 0; + else + left = ret; + + while (left > 0) { + size_t plen, copied; + + plen = min_t(size_t, left, PAGE_SIZE - page_off); + SetPageUptodate(pages[idx]); + copied = copy_page_to_iter(pages[idx++], + page_off, plen, to); + off += copied; + left -= copied; + page_off = 0; + if (copied < plen) { + ret = -EFAULT; + break; + } } + + ceph_osdc_put_request(req); + + if (off >= i_size || !more) + break; } - if (ret >= 0) - ret = read; - dout("striped_read returns %d\n", ret); + if (ret > 0) { + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; + } + + if (last_objver) + *last_objver = objver; + } + doutc(cl, "result %zd retry_op %d\n", ret, *retry_op); return ret; } -/* - * Completely synchronous read and write methods. Direct from __user - * buffer to osd, or directly to user pages (if O_DIRECT). - * - * If the read spans object boundary, just do multiple reads. - */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *retry_op) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct page **pages; - u64 off = *poff; - int num_pages, ret; + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos, + iov_iter_count(to), + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL); +} + +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + bool write; + bool should_dirty; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec64 mtime; + struct ceph_cap_flush *prealloc_cf; +}; - dout("sync_read on file %p %llu~%u %s\n", file, off, len, - (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); + } + + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? 
CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); + aio_req->iocb->ki_complete(aio_req->iocb, ret); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_osd_req_op *op = &req->r_ops[0]; + struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; + unsigned int len = osd_data->bvec_pos.iter.bi_size; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); + struct ceph_client *cl = ceph_inode_to_client(inode); + + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); + BUG_ON(!osd_data->num_bvecs); + + doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req, + inode, ceph_vinop(inode), rc, len); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_fs_client(inode)->inode_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (sparse && rc >= 0) + rc = ceph_sparse_ext_map_end(op); + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && len > rc) { + struct iov_iter i; + int zlen = len - rc; + + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, + osd_data->num_bvecs, len); + iov_iter_advance(&i, rc); + iov_iter_zero(zlen, &i); + } + } + + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_metrics(metric, req->r_start_latency, + req->r_end_latency, len, rc); + else + ceph_update_read_metrics(metric, req->r_start_latency, + req->r_end_latency, len, rc); + } + + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, + aio_req->should_dirty); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); } - if (IS_ERR(pages)) - return PTR_ERR(pages); + spin_unlock(&ci->i_ceph_lock); - /* - * flush any page 
cache pages in this range. this - * will make concurrent normal and sync io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - ret = filemap_write_and_wait(inode->i_mapping); - if (ret < 0) - goto done; + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, + false, GFP_NOFS); + if (!req) { + ret = -ENOMEM; + req = orig_req; + goto out; + } - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + req->r_ops[0] = orig_req->r_ops[0]; -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else - ceph_release_page_vector(pages, num_pages); - dout("sync_read result %d\n", ret); - return ret; + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } + + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ceph_osdc_start_request(req->r_osdc, req); +out: + if (ret < 0) { + req->r_result = ret; + ceph_aio_complete_req(req); + } + + ceph_put_snap_context(snapc); + kfree(aio_work); } -/* - * Write commit request unsafe callback, called to tell us when a - * request is unsafe (that is, in flight--has been handed to the - * messenger to send to its target osd). It is called again when - * we've received a response message indicating the request is - * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request - * is completed early (and unsuccessfully) due to a timeout or - * interrupt. - * - * This is used if we requested both an ACK and ONDISK commit reply - * from the OSD. - */ -static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) -{ - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - - dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, - unsafe ? 
"un" : ""); - if (unsafe) { - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); +static ssize_t +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_client_metric *metric = &fsc->mdsc->metric; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct bio_vec *bvecs; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; + int flags; + int ret = 0; + struct timespec64 mtime = current_time(inode); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; + bool should_dirty = !write && user_backed_iter(iter); + bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD); + + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? "write" : "read"), file, pos, (unsigned)count, + snapc, snapc ? snapc->seq : 0); + + if (write) { + int ret2; + + ceph_fscache_invalidate(inode, true); + + ret2 = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + count - 1) >> PAGE_SHIFT); + if (ret2 < 0) + doutc(cl, "invalidate_inode_pages2_range returned %d\n", + ret2); + + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + flags = CEPH_OSD_FLAG_READ; } + + while (iov_iter_count(iter) > 0) { + u64 size = iov_iter_count(iter); + ssize_t len; + struct ceph_osd_req_op *op; + int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; + int extent_cnt; + + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &size, 0, + 1, + write ? CEPH_OSD_OP_WRITE : readop, + flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + op = &req->r_ops[0]; + if (!write && sparse) { + extent_cnt = __ceph_sparse_read_ext_count(inode, size); + ret = ceph_alloc_sparse_ext_map(op, extent_cnt); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); + if (len < 0) { + ceph_osdc_put_request(req); + ret = len; + break; + } + if (len != size) + osd_req_op_extent_update(req, 0, len); + + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + + /* + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. 
+ */ + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + aio_req->should_dirty = should_dirty; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + PAGE_ALIGN(pos + len) - 1); + + req->r_mtime = mtime; + } + + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_private_item, &aio_req->osd_reqs); + + pos += len; + continue; + } + + ceph_osdc_start_request(req->r_osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + if (write) + ceph_update_write_metrics(metric, req->r_start_latency, + req->r_end_latency, len, ret); + else + ceph_update_read_metrics(metric, req->r_start_latency, + req->r_end_latency, len, ret); + + size = i_size_read(inode); + if (!write) { + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) + ret = 0; + + if (ret >= 0 && ret < len && pos + ret < size) { + struct iov_iter i; + int zlen = min_t(size_t, len - ret, + size - pos - ret); + + iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); + iov_iter_advance(&i, ret); + iov_iter_zero(zlen, &i); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + + put_bvecs(bvecs, num_pages, should_dirty); + ceph_osdc_put_request(req); + if (ret < 0) + break; + + pos += len; + if (!write && pos >= size) + break; + + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY); + } + } + + if (aio_req) { + LIST_HEAD(osd_reqs); + + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); + while (!list_empty(&osd_reqs)) { + req = list_first_entry(&osd_reqs, + struct ceph_osd_request, + r_private_item); + list_del_init(&req->r_private_item); + if (ret >= 0) + ceph_osdc_start_request(req->r_osdc, req); + if (ret < 0) { + req->r_result = ret; + ceph_aio_complete_req(req); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; + iocb->ki_pos = pos; + } + return ret; } /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) 
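
The object-boundary splitting described above is plain modular arithmetic in the non-striped case (stripe_count == 1); ceph_calc_file_object_mapping() reports the per-object room back via xlen. A self-contained sketch, with a 4 MiB object size assumed for illustration:

#include <stdio.h>
#include <stdint.h>

/* Userspace model of the per-object clamping a sync write performs, for
 * the simple stripe_count == 1 layout: object N covers the byte range
 * [N * object_size, (N + 1) * object_size). */
int main(void)
{
    const uint64_t object_size = 4ULL << 20;    /* assumed layout */
    uint64_t pos = (4ULL << 20) - 4096;         /* straddles objects 0 and 1 */
    uint64_t left = 12345;

    while (left) {
        uint64_t objnum = pos / object_size;
        uint64_t objoff = pos % object_size;
        uint64_t xlen = object_size - objoff;   /* room left in this object */

        if (xlen > left)
            xlen = left;
        printf("op: object %llu, offset %llu, len %llu\n",
               (unsigned long long)objnum, (unsigned long long)objoff,
               (unsigned long long)xlen);
        pos += xlen;
        left -= xlen;
    }
    return 0;
}
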
*/ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; - struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; - int num_ops = 1; struct page **pages; - int num_pages; u64 len; + int num_pages; int written = 0; - int flags; - int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; int ret; - struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + bool check_caps = false; + struct timespec64 mtime = current_time(inode); + size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, pos, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos, + (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); - - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ + ceph_fscache_invalidate(inode, false); + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + u64 write_pos = pos; + u64 write_len = len; + u64 objnum, objoff; + u32 xlen; + u64 assert_ver = 0; + bool rmw; + bool first, last; + struct iov_iter saved_iter = *from; + size_t off; + + ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len); + + /* clamp the length to the end of first object */ + ceph_calc_file_object_mapping(&ci->i_layout, write_pos, + write_len, &objnum, &objoff, + &xlen); + write_len = xlen; + + /* adjust len downward if it goes beyond current object */ + if (pos + len > write_pos + write_len) + len = write_pos + write_len - pos; - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; + /* + * If we had to adjust the length or position to align with a + * crypto block, then we must do a read/modify/write cycle. We + * use a version assertion to redrive the thing if something + * changes in between. 
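
The adjustment amounts to rounding the range out to whole crypto blocks; comparing the rounded range with the caller's range is what decides whether an RMW pass is needed. A standalone sketch, assuming the 4 KiB CEPH_FSCRYPT_BLOCK_SIZE:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define FSCRYPT_BLOCK 4096ULL   /* assumed CEPH_FSCRYPT_BLOCK_SIZE */

int main(void)
{
    uint64_t pos = 10000, len = 5000;   /* unaligned user write */

    /* round the start down and the end up to block boundaries, as
     * ceph_fscrypt_adjust_off_and_len() does for encrypted inodes */
    uint64_t write_pos = pos & ~(FSCRYPT_BLOCK - 1);
    uint64_t write_end = (pos + len + FSCRYPT_BLOCK - 1) & ~(FSCRYPT_BLOCK - 1);
    uint64_t write_len = write_end - write_pos;

    /* a partial first or last block is what forces the RMW cycle */
    bool first = pos != write_pos;
    bool last = (pos + len) != (write_pos + write_len);

    printf("%llu~%llu adjusted to %llu~%llu, %srmw\n",
           (unsigned long long)pos, (unsigned long long)len,
           (unsigned long long)write_pos, (unsigned long long)write_len,
           (first || last) ? "" : "no ");
    return 0;
}
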
+ */ + first = pos != write_pos; + last = (pos + len) != (write_pos + write_len); + rmw = first || last; - snapc = ci->i_snap_realm->cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); - - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, false); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto out; - } + doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n", + ci->i_vino.ino, pos, len, write_pos, write_len, + rmw ? "" : "no "); /* - * throw out any page cache pages in this range. this - * may block. + * The data is emplaced into the page as it would be if it were + * in an array of pagecache pages. */ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + num_pages = calc_pages_for(write_pos, write_len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - goto out; + break; + } + + /* Do we need to preload the pages? */ + if (rmw) { + u64 first_pos = write_pos; + u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE; + u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE; + struct ceph_osd_req_op *op; + + /* We should only need to do this for encrypted inodes */ + WARN_ON_ONCE(!IS_ENCRYPTED(inode)); + + /* No need to do two reads if first and last blocks are same */ + if (first && last_pos == first_pos) + last = false; + + /* + * Allocate a read request for one or two extents, + * depending on how the request was aligned. + */ + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, first ? first_pos : last_pos, + &read_len, 0, (first && last) ? 2 : 1, + CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ceph_release_page_vector(pages, num_pages); + ret = PTR_ERR(req); + break; + } + + /* Something is misaligned! */ + if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + ret = -EIO; + break; + } + + /* Add extent for first block? */ + op = &req->r_ops[0]; + + if (first) { + osd_req_op_extent_osd_data_pages(req, 0, pages, + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + false, false); + /* We only expect a single extent here */ + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Add extent for last block */ + if (last) { + /* Init the other extent if first extent has been used */ + if (first) { + op = &req->r_ops[1]; + osd_req_op_extent_init(req, 1, + CEPH_OSD_OP_SPARSE_READ, + last_pos, CEPH_FSCRYPT_BLOCK_SIZE, + ci->i_truncate_size, + ci->i_truncate_seq); + } + + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, first ? 
1 : 0, + &pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + false, false); + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + /* FIXME: length field is wrong if there are 2 extents */ + ceph_update_read_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + read_len, ret); + + /* Ok if object is not already present */ + if (ret == -ENOENT) { + /* + * If there is no object, then we can't assert + * on its version. Set it to 0, and we'll use an + * exclusive create instead. + */ + ceph_osdc_put_request(req); + ret = 0; + + /* + * zero out the soon-to-be uncopied parts of the + * first and last pages. + */ + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + if (last) + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else { + if (ret < 0) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + op = &req->r_ops[0]; + if (op->extent.sparse_ext_cnt == 0) { + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + else + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + if (first && last) { + op = &req->r_ops[1]; + if (op->extent.sparse_ext_cnt == 0) { + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Grab assert version. It must be non-zero. 
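
How that version is then used: the write-back either asserts the version seen during the preload read or, when no object existed, creates the object exclusively, and specific errors redrive the whole RMW pass. A compact userspace model of the guard choice and of the retry test applied a little further down (illustrative only, not kernel API):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <errno.h>

enum guard { GUARD_ASSERT_VER, GUARD_CREATE_EXCL };

static enum guard pick_guard(uint64_t assert_ver)
{
    /* object existed at preload time: assert its version is unchanged;
     * otherwise create exclusively so a racing creator fails us */
    return assert_ver ? GUARD_ASSERT_VER : GUARD_CREATE_EXCL;
}

static bool must_redo_rmw(uint64_t assert_ver, int err)
{
    /* mirrors the retry test in the write loop: a failed version
     * assertion or a lost exclusive-create race means the data changed
     * between the preload read and the write */
    if (assert_ver)
        return err == -ERANGE || err == -EOVERFLOW;
    return err == -EEXIST;
}

int main(void)
{
    printf("guard=%d redo=%d\n", pick_guard(0), must_redo_rmw(0, -EEXIST));
    return 0;
}
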
*/ + assert_ver = req->r_version; + WARN_ON_ONCE(ret > 0 && assert_ver == 0); + + ceph_osdc_put_request(req); + if (first) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[0], CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + if (last) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + } + } + + left = len; + off = offset_in_page(pos); + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE - off); + + /* copy the data */ + ret = copy_page_from_iter(pages[n], off, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + off = 0; + left -= ret; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); if (ret < 0) { + doutc(cl, "write failed with %d\n", ret); ceph_release_page_vector(pages, num_pages); - goto out; + break; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; - req->r_inode = inode; - own_pages = true; + if (IS_ENCRYPTED(inode)) { + ret = ceph_fscrypt_encrypt_pages(inode, pages, + write_pos, write_len); + if (ret < 0) { + doutc(cl, "encryption failed with %d\n", ret); + ceph_release_page_vector(pages, num_pages); + break; + } } - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, own_pages); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, write_pos, &write_len, + rmw ? 1 : 0, rmw ? 2 : 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + snapc, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + ceph_release_page_vector(pages, num_pages); + break; + } - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + doutc(cl, "write op %lld~%llu\n", write_pos, write_len); + osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len, + offset_in_page(write_pos), false, + true); + req->r_inode = inode; + req->r_mtime = mtime; + + /* Set up the assertion */ + if (rmw) { + /* + * Set up the assertion. If we don't have a version + * number, then the object doesn't exist yet. Use an + * exclusive create instead of a version assertion in + * that case. + */ + if (assert_ver) { + osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0); + req->r_ops[0].assert_ver.ver = assert_ver; + } else { + osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE, + CEPH_OSD_OP_FLAG_EXCL); + } + } - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, len, ret); + ceph_osdc_put_request(req); + if (ret != 0) { + doutc(cl, "osd write returned %d\n", ret); + /* Version changed! 
Must re-do the rmw cycle */ + if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) || + (!assert_ver && ret == -EEXIST)) { + /* We should only ever see this on a rmw */ + WARN_ON_ONCE(!rmw); + + /* The version should never go backward */ + WARN_ON_ONCE(ret == -EOVERFLOW); + + *from = saved_iter; + + /* FIXME: limit number of times we loop? */ + continue; + } + ceph_set_error_write(ci); + break; + } -out: - ceph_osdc_put_request(req); - if (ret == 0) { + ceph_clear_error_write(ci); + + /* + * We successfully wrote to a range of the file. Declare + * that region of the pagecache invalid. + */ + ret = invalidate_inode_pages2_range( + inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + len - 1) >> PAGE_SHIFT); + if (ret < 0) { + doutc(cl, "invalidate_inode_pages2_range returned %d\n", + ret); + ret = 0; + } pos += len; written += len; - left -= len; - data += len; - if (left) - goto more; + doutc(cl, "written %d\n", written); + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY); + } + + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *ppos = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); + iocb->ki_pos = pos; } + doutc(cl, "returning %d\n", ret); return ret; } @@ -629,61 +2099,150 @@ out: * * Hmm, the sync read case isn't actually async... should it be? */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; + bool direct_lock = iocb->ki_flags & IOCB_DIRECT; + struct ceph_client *cl = ceph_inode_to_client(inode); ssize_t ret; - int want, got = 0; - int checkeof = 0, read = 0; + int want = 0, got = 0; + int retry_op = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: + doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n", + iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode)); + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + ret = direct_lock ? ceph_start_io_direct(inode) : + ceph_start_io_read(inode); + if (ret) + return ret; + + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); - if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + want |= CEPH_CAP_FILE_LAZYIO; + + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); + if (ret < 0) { + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + return ret; + } if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) - /* hmm, this isn't really async... 
*/ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + (iocb->ki_flags & IOCB_DIRECT) || + (fi->flags & CEPH_F_SYNC)) { -out: - dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", - inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (!ceph_has_inline_data(ci)) { + if (!retry_op && + (iocb->ki_flags & IOCB_DIRECT) && + !IS_ENCRYPTED(inode)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } + } else { + retry_op = READ_INLINE; + } + } else { + CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); + doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + ceph_add_rw_context(fi, &rw_ctx); + ret = generic_file_read_iter(iocb, to); + ceph_del_rw_context(fi, &rw_ctx); + } + + doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); - if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + + if (retry_op > HAVE_RETRIED && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + int mask = CEPH_STAT_CAP_SIZE; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_KERNEL); + if (!page) + return -ENOMEM; + + mask = CEPH_STAT_CAP_INLINE_DATA; + } + + statret = __ceph_do_getattr(inode, page, mask, !!page); + if (statret < 0) { + if (page) + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_SIZE) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_SIZE); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; + } + __free_pages(page, 0); + return read; + } /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && + ret < len) { + doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n", + iocb->ki_pos, i_size); + read += ret; - base += ret; len -= ret; - checkeof = 0; + retry_op = HAVE_RETRIED; goto again; } } + if (ret >= 0) ret += read; @@ -691,6 +2250,71 @@ out: } /* + * Wrap filemap_splice_read with checks for cap bits on the inode. + * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. 
+ */ +static ssize_t ceph_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct ceph_file_info *fi = in->private_data; + struct inode *inode = file_inode(in); + struct ceph_inode_info *ci = ceph_inode(inode); + ssize_t ret; + int want = 0, got = 0; + CEPH_DEFINE_RW_CONTEXT(rw_ctx, 0); + + dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n", + inode, ceph_vinop(inode), *ppos, len, inode); + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + if (ceph_has_inline_data(ci) || + (fi->flags & CEPH_F_SYNC)) + return copy_splice_read(in, ppos, pipe, len, flags); + + ret = ceph_start_io_read(inode); + if (ret) + return ret; + + want = CEPH_CAP_FILE_CACHE; + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want |= CEPH_CAP_FILE_LAZYIO; + + ret = ceph_get_caps(in, CEPH_CAP_FILE_RD, want, -1, &got); + if (ret < 0) + goto out_end; + + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) == 0) { + dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n", + inode, ceph_vinop(inode), *ppos, len, + ceph_cap_string(got)); + + ceph_put_cap_refs(ci, got); + ceph_end_io_read(inode); + return copy_splice_read(in, ppos, pipe, len, flags); + } + + dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n", + inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got)); + + rw_ctx.caps = got; + ceph_add_rw_context(fi, &rw_ctx); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + ceph_del_rw_context(fi, &rw_ctx); + + dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n", + inode, ceph_vinop(inode), ceph_cap_string(got), ret); + + ceph_put_cap_refs(ci, got); +out_end: + ceph_end_io_read(inode); + return ret; +} + +/* * Take cap references to avoid releasing caps to MDS mid-write. * * If we are synchronous, and write with an old snap context, the OSD @@ -700,116 +2324,188 @@ out: * * If we are near ENOSPC, write synchronously. */ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; - bool hold_mutex; + int err, want = 0, got; + bool direct_lock = false; + u32 map_flags; + u64 pool_flags; + loff_t pos; + loff_t limit = max(i_size_read(inode), fsc->max_file_size); + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - mutex_lock(&inode->i_mutex); - hold_mutex = true; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - goto out; + if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) + direct_lock = true; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; - - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); +retry_snap: + err = direct_lock ? 
ceph_start_io_direct(inode) : + ceph_start_io_write(inode); if (err) - goto out; + goto out_unlocked; - if (count == 0) + if (iocb->ki_flags & IOCB_APPEND) { + err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (err < 0) + goto out; + } + + err = generic_write_checks(iocb, from); + if (err <= 0) goto out; - err = file_remove_suid(file); - if (err) + pos = iocb->ki_pos; + if (unlikely(pos >= limit)) { + err = -EFBIG; goto out; + } else { + iov_iter_truncate(from, limit - pos); + } - err = file_update_time(file); - if (err) + count = iov_iter_count(from); + if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { + err = -EDQUOT; goto out; + } -retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { err = -ENOSPC; goto out; } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + err = file_remove_privs(file); + if (err) + goto out; + + doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, + i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) goto out; - dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", - inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + err = file_update_time(file); + if (err) + goto out_caps; + + inode_inc_iversion_raw(inode); + + doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) { - mutex_unlock(&inode->i_mutex); - written = ceph_sync_write(file, iov->iov_base, count, - pos, &iocb->ki_pos); + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || + (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { + struct ceph_snap_context *snapc; + struct iov_iter data; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + /* we might need to revert back to that point */ + data = *from; + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode)) + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); + else + written = ceph_sync_write(iocb, &data, pos, snapc); + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); + if (written > 0) + iov_iter_advance(from, written); + ceph_put_snap_context(snapc); } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); - mutex_unlock(&inode->i_mutex); + /* + * No need to acquire the i_truncate_mutex. 
Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ + written = generic_perform_write(iocb, from); + ceph_end_io_write(inode); } - hold_mutex = false; if (written >= 0) { int dirty; + spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); + if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) + ceph_check_caps(ci, CHECK_CAPS_FLUSH); } - dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)count, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - if (written >= 0 && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + written - 1, 1); - if (err < 0) - written = err; - } - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); - mutex_lock(&inode->i_mutex); - hold_mutex = true; + doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); goto retry_snap; } -out: - if (hold_mutex) - mutex_unlock(&inode->i_mutex); - current->backing_dev_info = NULL; + if (written >= 0) { + if ((map_flags & CEPH_OSDMAP_NEARFULL) || + (pool_flags & CEPH_POOL_FLAG_NEARFULL)) + iocb->ki_flags |= IOCB_DSYNC; + written = generic_write_sync(iocb, written); + } + + goto out_unlocked; +out_caps: + ceph_put_cap_refs(ci, got); +out: + if (direct_lock) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); +out_unlocked: + ceph_free_cap_flush(prealloc_cf); return written ? 
written : err; } @@ -818,74 +2514,667 @@ out: */ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { - struct inode *inode = file->f_mapping->host; + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *inode = file_inode(file); + int ret; + + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + if (ret < 0) + return ret; + } + return generic_file_llseek(file, offset, whence); +} + +static inline void ceph_zero_partial_page(struct inode *inode, + loff_t offset, size_t size) +{ + struct folio *folio; + + folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + folio_wait_writeback(folio); + folio_zero_range(folio, offset_in_folio(folio, offset), size); + folio_unlock(folio); + folio_put(folio); +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_SIZE) { + loff_t size = round_down(length, PAGE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (ceph_inode_is_shutdown(inode)) + return -EIO; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 0, 1, op, + CEPH_OSD_FLAG_WRITE, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; + ceph_osdc_put_request(req); + +out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ci->i_layout.stripe_unit; + s32 stripe_count = ci->i_layout.stripe_count; + s32 object_size = ci->i_layout.object_size; + u64 object_set_size = (u64) object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t 
offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *prealloc_cf; + struct ceph_client *cl = ceph_inode_to_client(inode); + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n", + inode, ceph_vinop(inode), mode, offset, length); + + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + inode_lock(inode); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + size = i_size_read(inode); + + /* Are we punching a hole beyond EOF? */ + if (offset >= size) + goto unlock; + if ((offset + length) > size) + length = size - offset; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); + if (ret < 0) + goto unlock; + + ret = file_modified(file); + if (ret) + goto put_caps; + + filemap_invalidate_lock(inode->i_mapping); + ceph_fscache_invalidate(inode, false); + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + filemap_invalidate_unlock(inode->i_mapping); + +put_caps: + ceph_put_cap_refs(ci, got); +unlock: + inode_unlock(inode); + ceph_free_cap_flush(prealloc_cf); + return ret; +} + +/* + * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for + * src_ci. Two attempts are made to obtain both caps, and an error is returned if + * this fails; zero is returned on success. + */ +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, + loff_t dst_endoff, int *dst_got) +{ + int ret = 0; + bool retrying = false; + +retry_caps: + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + dst_endoff, dst_got); + if (ret < 0) + return ret; + + /* + * Since we're already holding the FILE_WR capability for the dst file, + * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some + * retry dance instead to try to get both capabilities. + */ + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + false, src_got); + if (ret <= 0) { + /* Start by dropping dst_ci caps and getting src_ci caps */ + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); + if (retrying) { + if (!ret) + /* ceph_try_get_caps masks EAGAIN */ + ret = -EAGAIN; + return ret; + } + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got); + if (ret < 0) + return ret; + /*...
drop src_ci caps too, and retry */ + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); + retrying = true; + goto retry_caps; + } + return ret; +} + +static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, + struct ceph_inode_info *dst_ci, int dst_got) +{ + ceph_put_cap_refs(src_ci, src_got); + ceph_put_cap_refs(dst_ci, dst_got); +} + +/* + * This function does several size-related checks, returning an error if: + * - source file is smaller than off+len + * - destination file size is not OK (inode_newsize_ok()) + * - max bytes quota is exceeded + */ +static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, + loff_t src_off, loff_t dst_off, size_t len) +{ + struct ceph_client *cl = ceph_inode_to_client(src_inode); + loff_t size, endoff; + + size = i_size_read(src_inode); + /* + * Don't copy beyond source file EOF. Instead of simply setting length + * to (size - src_off), just drop to VFS default implementation, as the + * local i_size may be stale due to other clients writing to the source + * inode. + */ + if (src_off + len > size) { + doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off, + len, size); + return -EOPNOTSUPP; + } + size = i_size_read(dst_inode); + + endoff = dst_off + len; + if (inode_newsize_ok(dst_inode, endoff)) + return -EOPNOTSUPP; + + if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) + return -EDQUOT; + + return 0; +} + +static struct ceph_osd_request * +ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc, + u64 src_snapid, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u32 truncate_seq, u64 truncate_size) +{ + struct ceph_osd_request *req; int ret; + u32 src_fadvise_flags = + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE; + u32 dst_fadvise_flags = + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + req->r_flags = CEPH_OSD_FLAG_WRITE; + + ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); + ceph_oid_copy(&req->r_t.base_oid, dst_oid); + + ret = osd_req_op_copy_from_init(req, src_snapid, 0, + src_oid, src_oloc, + src_fadvise_flags, + dst_fadvise_flags, + truncate_seq, + truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + if (ret) + goto out; - mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); + if (ret) + goto out; - if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { - ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); - if (ret < 0) { - offset = ret; + return req; + +out: + ceph_osdc_put_request(req); + return ERR_PTR(ret); +} + +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, + struct ceph_inode_info *dst_ci, u64 *dst_off, + struct ceph_fs_client *fsc, + size_t len, unsigned int flags) +{ + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + struct ceph_osd_client *osdc; + struct ceph_osd_request *req; + ssize_t bytes = 0; + u64 src_objnum, src_objoff, dst_objnum, dst_objoff; + u32 src_objlen, dst_objlen; + u32 object_size = src_ci->i_layout.object_size; + struct ceph_client *cl = fsc->client; + int ret; + + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns =
ceph_try_get_string(dst_ci->i_layout.pool_ns); + osdc = &fsc->client->osdc; + + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap, + &src_oid, &src_oloc, + &dst_oid, &dst_oloc, + dst_ci->i_truncate_seq, + dst_ci->i_truncate_size); + if (IS_ERR(req)) + ret = PTR_ERR(req); + else { + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + ceph_update_copyfrom_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + object_size, ret); + ceph_osdc_put_request(req); + } + if (ret) { + if (ret == -EOPNOTSUPP) { + fsc->have_copy_from2 = false; + pr_notice_client(cl, + "OSDs don't support copy-from2; disabling copy offload\n"); + } + doutc(cl, "returned %d\n", ret); + if (bytes <= 0) + bytes = ret; goto out; } + len -= object_size; + bytes += object_size; + *src_off += object_size; + *dst_off += object_size; } - switch (whence) { - case SEEK_END: - offset += inode->i_size; - break; - case SEEK_CUR: +out: + ceph_oloc_destroy(&src_oloc); + ceph_oloc_destroy(&dst_oloc); + return bytes; +} + +static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + struct ceph_inode_info *src_ci = ceph_inode(src_inode); + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); + struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode); + struct ceph_client *cl = src_fsc->client; + loff_t size; + ssize_t ret = -EIO, bytes; + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; + u32 src_objlen, dst_objlen; + int src_got = 0, dst_got = 0, err, dirty; + + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } + if (ceph_snap(dst_inode) != CEPH_NOSNAP) + return -EROFS; + + /* + * Some of the checks below will return -EOPNOTSUPP, which will force a + * fallback to the default VFS copy_file_range implementation. This is + * desirable in several cases (for ex, the 'len' is smaller than the + * size of the objects, or in cases where that would be more + * efficient). + */ + + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) + return -EOPNOTSUPP; + + if (!src_fsc->have_copy_from2) + return -EOPNOTSUPP; + + /* + * Striped file layouts require that we copy partial objects, but the + * OSD copy-from operation only supports full-object copies. Limit + * this to non-striped file layouts for now. 
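
The loop above moves exactly one whole object per copy-from2 request, and RADOS object names are formed from the inode number and the object index (ceph_oid_printf's "%llx.%08llx"). A userspace sketch of the naming and the resulting request count, assuming a 4 MiB object size and an illustrative inode number:

#include <stdio.h>
#include <stdint.h>

/* Userspace sketch: which object names a full-object copy loop touches.
 * The format mirrors ceph_oid_printf(&oid, "%llx.%08llx", ino, objnum). */
int main(void)
{
    const uint64_t object_size = 4ULL << 20;    /* assumed layout */
    uint64_t ino = 0x10000000aULL;              /* illustrative inode */
    uint64_t off = 0, len = 10ULL << 20;        /* 10 MiB request */

    while (len >= object_size) {
        printf("copy-from2 %llx.%08llx\n",
               (unsigned long long)ino,
               (unsigned long long)(off / object_size));
        off += object_size;
        len -= object_size;
    }
    printf("%llu bytes left for the VFS fallback\n",
           (unsigned long long)len);
    return 0;
}
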
+ */ + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || + (src_ci->i_layout.stripe_count != 1) || + (dst_ci->i_layout.stripe_count != 1) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { + doutc(cl, "Invalid src/dst files layout\n"); + return -EOPNOTSUPP; + } + + /* Every encrypted inode gets its own key, so we can't offload them */ + if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode)) + return -EOPNOTSUPP; + + if (len < src_ci->i_layout.object_size) + return -EOPNOTSUPP; /* no remote copy will be done */ + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + /* Start by sync'ing the source and destination files */ + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); + if (ret < 0) { + doutc(cl, "failed to write src file (%zd)\n", ret); + goto out; + } + ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); + if (ret < 0) { + doutc(cl, "failed to write dst file (%zd)\n", ret); + goto out; + } + + /* + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other + * clients may have dirty data in their caches. And OSDs know nothing + * about caps, so they can't safely do the remote object copies. + */ + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); + if (err < 0) { + doutc(cl, "get_rd_wr_caps returned %d\n", err); + ret = -EOPNOTSUPP; + goto out; + } + + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); + if (ret < 0) + goto out_caps; + + /* Drop dst file cached pages */ + ceph_fscache_invalidate(dst_inode, false); + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, + dst_off >> PAGE_SHIFT, + (dst_off + len) >> PAGE_SHIFT); + if (ret < 0) { + doutc(cl, "Failed to invalidate inode pages (%zd)\n", + ret); + ret = 0; /* XXX */ + } + ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, + src_ci->i_layout.object_size, + &src_objnum, &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, + dst_ci->i_layout.object_size, + &dst_objnum, &dst_objoff, &dst_objlen); + /* object-level offsets need to be the same */ + if (src_objoff != dst_objoff) { + ret = -EOPNOTSUPP; + goto out_caps; + } + + /* + * Do a manual copy if the object offset isn't object aligned. + * 'src_objlen' contains the bytes left until the end of the object, + * starting at src_off + */ + if (src_objoff) { + doutc(cl, "Initial partial copy of %u bytes\n", src_objlen); + /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it + * we need to temporarily drop all caps as we'll be calling + * {read,write}_iter, which will get caps again. 
*/ - if (offset == 0) { - offset = file->f_pos; - goto out; - } - offset += file->f_pos; - break; - case SEEK_DATA: - if (offset >= inode->i_size) { - ret = -ENXIO; + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, + src_objlen); + /* Abort on short copies or on error */ + if (ret < (long)src_objlen) { + doutc(cl, "Failed partial copy (%zd)\n", ret); goto out; } - break; - case SEEK_HOLE: - if (offset >= inode->i_size) { - ret = -ENXIO; + len -= ret; + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); + if (err < 0) goto out; - } - offset = inode->i_size; - break; + err = is_file_size_ok(src_inode, dst_inode, + src_off, dst_off, len); + if (err < 0) + goto out_caps; } - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + size = i_size_read(dst_inode); + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, + src_fsc, len, flags); + if (bytes <= 0) { + if (!ret) + ret = bytes; + goto out_caps; + } + doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len); + len -= bytes; + ret += bytes; + + file_update_time(dst_file); + inode_inc_iversion_raw(dst_inode); + + if (dst_off > size) { + /* Let the MDS know about dst file size change */ + if (ceph_inode_set_size(dst_inode, dst_off) || + ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); + } + /* Mark Fw dirty */ + spin_lock(&dst_ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&dst_ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(dst_inode, dirty); + +out_caps: + put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); + + /* + * Do the final manual copy if we still have some bytes left, unless + * there were errors in remote object copies (len >= object_size). 
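Taken together, the paths above split one copy_file_range call into three phases: a head splice up to the next object boundary, whole-object remote copies, and a tail splice for the remainder. A rough standalone model of that split (illustrative only, assuming the aligned layouts checked earlier):

#include <stdint.h>
#include <stdio.h>

/* Rough phase split used by __ceph_copy_file_range() (illustrative). */
static void copy_phases(uint64_t src_off, uint64_t len, uint32_t objsz)
{
	uint64_t head = src_off % objsz ? objsz - src_off % objsz : 0;

	if (head > len)
		head = len;
	len -= head;

	uint64_t body = (len / objsz) * objsz;	/* offloaded to the OSDs */
	uint64_t tail = len - body;		/* final local splice */

	printf("head=%llu body=%llu tail=%llu\n",
	       (unsigned long long)head, (unsigned long long)body,
	       (unsigned long long)tail);
}

int main(void)
{
	/* 13 MiB starting 3 MiB into a file with 4 MiB objects:
	 * 1 MiB head splice, 12 MiB of remote object copies, no tail. */
	copy_phases(3 * 1024 * 1024, 13 * 1024 * 1024, 4 * 1024 * 1024);
	return 0;
}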
+ */ + if (len && (len < src_ci->i_layout.object_size)) { + doutc(cl, "Final partial copy of %zu bytes\n", len); + bytes = splice_file_range(src_file, &src_off, dst_file, + &dst_off, len); + if (bytes > 0) + ret += bytes; + else + doutc(cl, "Failed partial copy (%zd)\n", bytes); + } out: - mutex_unlock(&inode->i_mutex); - return offset; + ceph_free_cap_flush(prealloc_cf); + + return ret; +} + +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, + size_t len, unsigned int flags) +{ + ssize_t ret; + + ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, + len, flags); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); + return ret; } const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = ceph_aio_read, - .aio_write = ceph_aio_write, - .mmap = ceph_mmap, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, + .mmap_prepare = ceph_mmap_prepare, .fsync = ceph_fsync, .lock = ceph_lock, + .setlease = simple_nosetlease, .flock = ceph_flock, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_read = ceph_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, - .compat_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .fallocate = ceph_fallocate, + .copy_file_range = ceph_copy_file_range, }; - diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f3a2abf28a77..2966f88310e3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> @@ -6,12 +7,19 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/kernel.h> -#include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/random.h> +#include <linux/sort.h> +#include <linux/iversion.h> +#include <linux/fscrypt.h> #include "super.h" #include "mds_client.h" +#include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> /* @@ -27,73 +35,255 @@ */ static const struct inode_operations ceph_symlink_iops; +static const struct inode_operations ceph_encrypted_symlink_iops; -static void ceph_invalidate_work(struct work_struct *work); -static void ceph_writeback_work(struct work_struct *work); -static void ceph_vmtruncate_work(struct work_struct *work); +static void ceph_inode_work(struct work_struct *work); /* * find or create an inode, given the ceph ino number */ static int ceph_set_ino_cb(struct inode *inode, void *data) { - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + + ci->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); + inode_set_iversion_raw(inode, 0); + percpu_counter_inc(&mdsc->metric.total_inodes); + return 0; } -struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * 
Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + +/** + * ceph_new_inode - allocate a new inode in advance of an expected create + * @dir: parent directory for new inode + * @dentry: dentry that may eventually point to new inode + * @mode: mode of new inode + * @as_ctx: pointer to inherited security context + * + * Allocate a new inode in advance of an operation to create a new inode. + * This allocates the inode and sets up the acl_sec_ctx with appropriate + * info for the new inode. + * + * Returns a pointer to the new inode or an ERR_PTR. + */ +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) { + int err; struct inode *inode; - ino_t t = ceph_vino_to_ino(vino); - inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); - if (inode == NULL) + inode = new_inode(dir->i_sb); + if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - dout("get_inode created new inode %p %llx.%llx ino %llx\n", - inode, ceph_vinop(inode), (u64)inode->i_ino); - unlock_new_inode(inode); + + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + + if (!S_ISLNK(*mode)) { + err = ceph_pre_init_acls(dir, mode, as_ctx); + if (err < 0) + goto out_err; + } + + inode_state_assign_raw(inode, 0); + inode->i_mode = *mode; + + err = ceph_security_init_secctx(dentry, *mode, as_ctx); + if (err < 0) + goto out_err; + + /* + * We'll skip setting fscrypt context for snapshots, leaving that for + * the handle_reply(). 
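Since ceph_get_reply_dir() above may hand back a different inode than the cached parent, its caller has to pair the mismatch path with an iput(); a minimal usage sketch based on the contract stated in the comment (kernel-style illustration, error handling trimmed):

/* Sketch of the caller contract for ceph_get_reply_dir() (illustrative):
 * whatever it returns is usable, but only a *different* inode than the
 * one passed in carries an extra reference the caller must drop. */
static void reply_dir_usage(struct super_block *sb, struct inode *parent,
			    struct ceph_mds_reply_info_parsed *rinfo)
{
	struct inode *dir = ceph_get_reply_dir(sb, parent, rinfo);

	if (IS_ERR(dir))
		return;		/* lookup failed; nothing to drop */

	/* ... process the reply against "dir" instead of "parent" ... */

	if (dir != parent)
		iput(dir);	/* drop the reference taken for us */
}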
+ */ + if (ceph_snap(dir) != CEPH_SNAPDIR) { + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; } - dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, - vino.snap, inode); + return inode; +out_err: + iput(inode); + return ERR_PTR(err); +} + +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ + if (as_ctx->pagelist) { + req->r_pagelist = as_ctx->pagelist; + as_ctx->pagelist = NULL; + } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); +} + +/** + * ceph_get_inode - find or create/hash a new inode + * @sb: superblock to search and allocate in + * @vino: vino to search for + * @newino: optional new inode to insert if one isn't found (may be NULL) + * + * Search for or insert a new inode into the hash for the given vino, and + * return a reference to it. If @newino is non-NULL, its reference is consumed. + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, + struct inode *newino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct ceph_client *cl = mdsc->fsc->client; + struct inode *inode; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-EREMOTEIO); + + if (newino) { + inode = inode_insert5(newino, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode != newino) + iput(newino); + } else { + inode = iget5_locked(sb, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + } + + if (!inode) { + doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap); + return ERR_PTR(-ENOMEM); + } + + doutc(cl, "on %llx=%llx.%llx got %p new %d\n", + ceph_present_inode(inode), ceph_vinop(inode), inode, + !!(inode_state_read_once(inode) & I_NEW)); return inode; } /* - * get/constuct snapdir inode for a given directory + * get/construct snapdir inode for a given directory */ struct inode *ceph_get_snapdir(struct inode *parent) { + struct ceph_client *cl = ceph_inode_to_client(parent); struct ceph_vino vino = { .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; - struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); struct ceph_inode_info *ci = ceph_inode(inode); + int ret = -ENOTDIR; - BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) return inode; + + if (!S_ISDIR(parent->i_mode)) { + pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n", + parent->i_mode); + goto err; + } + + if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { + pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", + inode->i_mode); + goto err; + } + inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_op = &ceph_dir_iops; - inode->i_fop = &ceph_dir_fops; - ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + inode_set_mtime_to_ts(inode, inode_get_mtime(parent)); + inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); + inode_set_atime_to_ts(inode, inode_get_atime(parent)); ci->i_rbytes = 0; + ci->i_btime = ceph_inode(parent)->i_btime; + +#ifdef CONFIG_FS_ENCRYPTION + /* if encrypted, just borrow fscrypt_auth from parent */ + if (IS_ENCRYPTED(parent)) { + struct ceph_inode_info *pci = ceph_inode(parent); + + ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, + pci->fscrypt_auth_len, + GFP_KERNEL); + if (ci->fscrypt_auth) { + inode->i_flags |= S_ENCRYPTED; + ci->fscrypt_auth_len = pci->fscrypt_auth_len; + } else { + doutc(cl, "Failed to alloc snapdir fscrypt_auth\n"); + ret = -ENOMEM; + goto err; + } + } +#endif 
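The newino path of ceph_get_inode() above enables the usual allocate-then-insert idiom; a hedged sketch of how a caller would combine it with ceph_new_inode(), following the contract in the kerneldoc (kernel-style illustration, error handling trimmed):

/* Illustrative caller pattern for ceph_get_inode() with a preallocated
 * inode: the newino reference is consumed either way, so the caller
 * only ever deals with the returned inode. */
static struct inode *lookup_or_create(struct super_block *sb,
				      struct ceph_vino vino,
				      struct inode *dir, struct dentry *dentry,
				      umode_t mode,
				      struct ceph_acl_sec_ctx *as_ctx)
{
	struct inode *newino;

	newino = ceph_new_inode(dir, dentry, &mode, as_ctx);
	if (IS_ERR(newino))
		return newino;

	/* Either hashes newino and returns it, or returns the existing
	 * inode for vino and drops newino's reference internally. */
	return ceph_get_inode(sb, vino, newino);
}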
+ if (inode_state_read_once(inode) & I_NEW) { + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ + unlock_new_inode(inode); + } + return inode; +err: + if ((inode_state_read_once(inode) & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(ret); } const struct inode_operations ceph_file_iops = { .permission = ceph_permission, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, + .get_inode_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -111,6 +301,8 @@ const struct inode_operations ceph_file_iops = { static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, u32 f) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_frag *frag; @@ -130,12 +322,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, } frag = kmalloc(sizeof(*frag), GFP_NOFS); - if (!frag) { - pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " - "frag %x\n", &ci->vfs_inode, - ceph_vinop(&ci->vfs_inode), f); + if (!frag) return ERR_PTR(-ENOMEM); - } + frag->frag = f; frag->split_by = 0; frag->mds = -1; @@ -144,8 +333,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_link_node(&frag->node, parent, p); rb_insert_color(&frag->node, &ci->i_fragtree); - dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f); return frag; } @@ -175,10 +363,10 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) * specified, copy the frag delegation info to the caller if * it is present. */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; unsigned nway, i; @@ -187,7 +375,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, if (found) *found = 0; - mutex_lock(&ci->i_fragtree_mutex); while (1) { WARN_ON(!ceph_frag_contains_value(t, v)); frag = __ceph_find_frag(ci, t); @@ -203,8 +390,8 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, /* choose child */ nway = 1 << frag->split_by; - dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, - frag->split_by, nway); + doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); for (i = 0; i < nway; i++) { n = ceph_frag_make_child(t, frag->split_by, i); if (ceph_frag_contains_value(n, v)) { @@ -214,12 +401,21 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } BUG_ON(i == nway); } - dout("choose_frag(%x) = %x\n", v, t); + doutc(cl, "frag(%x) = %x\n", v, t); - mutex_unlock(&ci->i_fragtree_mutex); return t; } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + /* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. 
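The frag values handled throughout this file pack a split depth and a 24-bit value prefix into a single u32; a simplified rendering of the helpers this code relies on (paraphrased from the kernel's ceph_frag helpers, not verbatim):

#include <stdint.h>

/* A frag stores "bits" (split depth) in the top byte and a 24-bit value
 * prefix below it; a frag covers every hash value sharing its top
 * "bits" bits. Simplified from include/linux/ceph/ceph_frag.h. */
static inline uint32_t frag_make(uint32_t bits, uint32_t value)
{
	return (bits << 24) | (value & (0xffffffu << (24 - bits)) & 0xffffffu);
}

static inline uint32_t frag_bits(uint32_t f)  { return f >> 24; }
static inline uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }

static inline uint32_t frag_mask(uint32_t f)
{
	return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
}

/* Does frag f cover hash value v? The root frag frag_make(0, 0) has an
 * empty mask and therefore contains everything. */
static inline int frag_contains_value(uint32_t f, uint32_t v)
{
	return (v & frag_mask(f)) == frag_value(f);
}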
Otherwise, only @@ -229,29 +425,39 @@ static int ceph_fill_dirfrag(struct inode *inode, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; int i; int err = 0; + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); - if (ndist == 0) { + if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) goto out; if (frag->split_by == 0) { /* tree leaf, remove */ - dout("fill_dirfrag removed %llx.%llx frag %x" - " (no ref)\n", ceph_vinop(inode), id); + doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n", + inode, ceph_vinop(inode), id); rb_erase(&frag->node, &ci->i_fragtree); kfree(frag); } else { /* tree branch, keep and clear */ - dout("fill_dirfrag cleared %llx.%llx frag %x" - " referral\n", ceph_vinop(inode), id); + doutc(cl, "cleared %p %llx.%llx frag %x referral\n", + inode, ceph_vinop(inode), id); frag->mds = -1; frag->ndist = 0; } @@ -264,8 +470,9 @@ static int ceph_fill_dirfrag(struct inode *inode, if (IS_ERR(frag)) { /* this is not the end of the world; we can continue with bad/inaccurate delegation info */ - pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", - ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n", + inode, ceph_vinop(inode), + le32_to_cpu(dirinfo->frag)); err = -ENOMEM; goto out; } @@ -274,39 +481,167 @@ static int ceph_fill_dirfrag(struct inode *inode, frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); for (i = 0; i < frag->ndist; i++) frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); - dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", - ceph_vinop(inode), frag->frag, frag->ndist); + doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode, + ceph_vinop(inode), frag->frag, frag->ndist); out: mutex_unlock(&ci->i_fragtree_mutex); return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(le32_to_cpu(ls->frag), + le32_to_cpu(rs->frag)); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag, *prev_frag = NULL; + struct rb_node *rb_node; + unsigned i, split_by, nsplits; + u32 id; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { + i = get_random_u32_below(nsplits); + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = 
rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", inode, + ceph_vinop(inode), i, nsplits, id, split_by); + continue; + } + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; + doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} /* * initialize a newly allocated inode. 
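ceph_fill_fragtree() above is a classic sorted merge: the server's split list is sorted first, then walked in lock-step with the rbtree so stale local nodes can be deleted in a single pass. The same shape in miniature, over plain arrays (illustrative only):

#include <stdio.h>

/* Merge a sorted "authoritative" list into a sorted "cached" list,
 * dropping cached entries the server no longer reports; the same
 * one-pass shape ceph_fill_fragtree() uses over the rbtree. */
static void merge_one_pass(const int *srv, int nsrv, int *cache, int *ncache)
{
	int out = 0, j = 0;

	for (int i = 0; i < nsrv; i++) {
		while (j < *ncache && cache[j] < srv[i])
			j++;			/* stale entry: skip (delete) */
		if (j < *ncache && cache[j] == srv[i])
			j++;			/* still valid: keep it */
		cache[out++] = srv[i];		/* insert/update entry */
	}
	*ncache = out;				/* anything past j is stale */
}

int main(void)
{
	int cache[8] = { 1, 3, 5, 9 }, n = 4;
	const int srv[] = { 3, 4, 9 };

	merge_one_pass(srv, 3, cache, &n);
	for (int i = 0; i < n; i++)
		printf("%d ", cache[i]);	/* prints: 3 4 9 */
	printf("\n");
	return 0;
}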
*/ struct inode *ceph_alloc_inode(struct super_block *sb) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_inode_info *ci; int i; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS); + ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS); if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + doutc(fsc->client, "%p\n", &ci->netfs.inode); + + /* Set parameters for the netfs library */ + netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; + ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - atomic_set(&ci->i_release_count, 1); - atomic_set(&ci->i_complete_count, 0); + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; + ci->i_max_bytes = 0; + ci->i_max_files = 0; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -327,26 +662,24 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_flushing_caps = 0; INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); - ci->i_cap_flush_seq = 0; - ci->i_cap_flush_last_tid = 0; - memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + ci->i_prealloc_cap_flush = NULL; + INIT_LIST_HEAD(&ci->i_cap_flush_list); init_waitqueue_head(&ci->i_cap_wq); - ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; - for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; + mutex_init(&ci->i_truncate_mutex); ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -358,68 +691,92 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_ref = 0; ci->i_wr_ref = 0; ci->i_wb_ref = 0; + ci->i_fx_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; - ci->i_shared_gen = 0; + atomic_set(&ci->i_filelock_ref, 0); + atomic_set(&ci->i_shared_gen, 1); ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; - INIT_LIST_HEAD(&ci->i_unsafe_writes); INIT_LIST_HEAD(&ci->i_unsafe_dirops); + INIT_LIST_HEAD(&ci->i_unsafe_iops); spin_lock_init(&ci->i_unsafe_lock); ci->i_snap_realm = NULL; INIT_LIST_HEAD(&ci->i_snap_realm_item); INIT_LIST_HEAD(&ci->i_snap_flush_item); - INIT_WORK(&ci->i_wb_work, ceph_writeback_work); - INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work); - - INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); - - return &ci->vfs_inode; + INIT_WORK(&ci->i_work, ceph_inode_work); + ci->i_work_mask = 0; + memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); +#ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif + return &ci->netfs.inode; } -static void ceph_i_callback(struct rcu_head *head) +void ceph_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); + kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + 
kfree(ci->fscrypt_auth); +#endif + fscrypt_free_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); } -void ceph_destroy_inode(struct inode *inode) +void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; struct rb_node *n; - dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode)); + + percpu_counter_dec(&mdsc->metric.total_inodes); + + netfs_wait_for_outstanding_io(inode); + truncate_inode_pages_final(&inode->i_data); + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) + ceph_fscache_unuse_cookie(inode, true); + clear_inode(inode); - ceph_queue_caps_release(inode); + ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); + + __ceph_remove_caps(ci); + + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) + ceph_adjust_quota_realms_count(inode, false); /* * we may still have a snap_realm reference if there are stray - * caps in i_cap_exporting_issued or i_snap_caps. + * caps in i_snap_caps. */ if (ci->i_snap_realm) { - struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct ceph_snap_realm *realm = ci->i_snap_realm; - - dout(" dropping residual ref to snap realm %p\n", realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + if (ceph_snap(inode) == CEPH_NOSNAP) { + doutc(cl, " dropping residual ref to snap realm %p\n", + ci->i_snap_realm); + ceph_change_snap_realm(inode, NULL); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } } - kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) @@ -427,9 +784,14 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); - call_rcu(&inode->i_rcu, ceph_i_callback); + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} /* * Helpers to fill in size, ctime, mtime, and atime. We have to be @@ -442,50 +804,84 @@ void ceph_destroy_inode(struct inode *inode) int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; + loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || - (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { - dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; - inode->i_blocks = (size + (1<<9) - 1) >> 9; + (truncate_seq == ci->i_truncate_seq && size > isize)) { + doutc(cl, "size %lld -> %llu\n", isize, size); + if (size > 0 && S_ISDIR(inode->i_mode)) { + pr_err_client(cl, "non-zero size for directory\n"); + size = 0; + } + i_size_write(inode, size); + inode->i_blocks = calc_inode_blocks(size); + /* + * If we're expanding, then we should be able to just update + * the existing cookie. 
+ */ + if (size > isize) + ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", - ci->i_truncate_seq, truncate_seq); + doutc(cl, "truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL| - CEPH_CAP_FILE_LAZYIO)) || + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || - __ceph_caps_file_wanted(ci)) { + __ceph_is_file_opened(ci)) { ci->i_truncate_pending++; queue_trunc = 1; } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n", + ci->i_truncate_size, truncate_size, + !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; + + if (IS_ENCRYPTED(inode)) { + doutc(cl, "truncate_pagecache_size %lld -> %llu\n", + ci->i_truncate_pagecache_size, size); + ci->i_truncate_pagecache_size = size; + } else { + ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime) + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, struct timespec64 *atime) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 iatime = inode_get_atime(inode); + struct timespec64 ictime = inode_get_ctime(inode); + struct timespec64 imtime = inode_get_mtime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -493,38 +889,29 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec_compare(ctime, &inode->i_ctime) > 0) { - dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + if (ci->i_version == 0 || + timespec64_compare(ctime, &ictime) > 0) { + doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime); + inode_set_ctime_to_ts(inode, *ctime); } - if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + if (ci->i_version == 0 || + ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ - dout("mtime %ld.%09ld -> %ld.%09ld " - "tw %d -> %d\n", - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec, - ci->i_time_warp_seq, (int)time_warp_seq); - - inode->i_mtime = *mtime; - inode->i_atime = *atime; + doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime, + ci->i_time_warp_seq, (int)time_warp_seq); + + inode_set_mtime_to_ts(inode, *mtime); + 
inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec_compare(mtime, &inode->i_mtime) > 0) { - dout("mtime %ld.%09ld -> %ld.%09ld inc\n", - inode->i_mtime.tv_sec, - inode->i_mtime.tv_nsec, - mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + if (timespec64_compare(mtime, &imtime) > 0) { + doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime); + inode_set_mtime_to_ts(inode, *mtime); } - if (timespec_compare(atime, &inode->i_atime) > 0) { - dout("atime %ld.%09ld -> %ld.%09ld inc\n", - inode->i_atime.tv_sec, - inode->i_atime.tv_nsec, - atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + if (timespec64_compare(atime, &iatime) > 0) { + doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime); + inode_set_atime_to_ts(inode, *atime); } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -534,43 +921,115 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_ctime_to_ts(inode, *ctime); + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; } } if (warn) /* time_warp_seq shouldn't go backwards */ - dout("%p mds time_warp_seq %llu < %u\n", - inode, time_warp_seq, ci->i_time_warp_seq); + doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode, + time_warp_seq, ci->i_time_warp_seq); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int enclen, u8 **decsym) +{ + struct ceph_client *cl = mdsc->fsc->client; + int declen; + u8 *sym; + + sym = kmalloc(enclen + 1, GFP_NOFS); + if (!sym) + return -ENOMEM; + + declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP); + if (declen < 0) { + pr_err_client(cl, + "can't decode symlink (%d). Content: %.*s\n", + declen, enclen, encsym); + kfree(sym); + return -EIO; + } + sym[declen] = '\0'; + *decsym = sym; + return declen; +} +#else +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int symlen, u8 **decsym) +{ + return -EOPNOTSUPP; +} +#endif + /* * Populate an inode based on info from mds. May be called on new or * existing inodes. 
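The buffer sizing in decode_encrypted_symlink() above works because base64 decoding shrinks data (at most 3 output bytes per 4 input characters), so an enclen + 1 buffer always covers the decoded bytes plus a NUL terminator at sym[declen]. A standalone sketch of the same arithmetic:

#include <stdio.h>

/* Worst-case decoded size for an unpadded base64 input of enclen
 * characters: full 4-char groups yield 3 bytes each, and a trailing
 * group of r chars yields r - 1 bytes. */
static unsigned int decoded_max(unsigned int enclen)
{
	return (enclen / 4) * 3 + (enclen % 4 ? (enclen % 4) - 1 : 0);
}

int main(void)
{
	for (unsigned int enclen = 4; enclen <= 16; enclen += 3)
		printf("enclen=%2u -> at most %2u bytes (buffer %u)\n",
		       enclen, decoded_max(enclen), enclen + 1);
	return 0;
}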
*/ -static int fill_inode(struct inode *inode, - struct ceph_mds_reply_info_in *iinfo, - struct ceph_mds_reply_dirfrag *dirinfo, - struct ceph_mds_session *session, - unsigned long ttl_from, int cap_fmode, - struct ceph_cap_reservation *caps_reservation) +int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int i; - int issued = 0, implemented; - struct timespec mtime, atime, ctime; - u32 nsplits; + int issued, new_issued, info_caps; + struct timespec64 mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; + struct ceph_buffer *old_blob = NULL; + struct ceph_string *pool_ns = NULL; + struct ceph_cap *new_cap = NULL; int err = 0; - int queue_trunc = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; + bool fill_inline = false; + umode_t mode = le32_to_cpu(info->mode); + dev_t rdev = le32_to_cpu(info->rdev); + + lockdep_assert_held(&mdsc->snap_rwsem); + + doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), + le64_to_cpu(info->version), ci->i_version); - dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", - inode, ceph_vinop(inode), le64_to_cpu(info->version), - ci->i_version); + /* Once I_NEW is cleared, we can't change type or dev numbers */ + if (inode_state_read_once(inode) & I_NEW) { + inode->i_mode = mode; + } else { + if (inode_wrong_type(inode, mode)) { + pr_warn_once_client(cl, + "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + return -ESTALE; + } + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { + pr_warn_once_client(cl, + "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); + return -ESTALE; + } + } + + info_caps = le32_to_cpu(info->cap.caps); + + /* prealloc new cap struct */ + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) { + new_cap = ceph_get_cap(mdsc, caps_reservation); + if (!new_cap) + return -ENOMEM; + } /* * prealloc xattr data, if it looks like we'll need it. 
only @@ -580,10 +1039,17 @@ static int fill_inode(struct inode *inode, if (iinfo->xattr_len > 4) { xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) - pr_err("fill_inode ENOMEM xattr blob %d bytes\n", - iinfo->xattr_len); + pr_err_client(cl, "ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); } + if (iinfo->pool_ns_len > 0) + pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, + iinfo->pool_ns_len); + + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + spin_lock(&ci->i_ceph_lock); /* @@ -597,77 +1063,158 @@ static int fill_inode(struct inode *inode, * 3 2 skip * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) >= le64_to_cpu(info->version)) - goto no_change; - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - - /* update inode */ - ci->i_version = le64_to_cpu(info->version); - inode->i_version++; - inode->i_rdev = le32_to_cpu(info->rdev); - - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(info->mode); + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + + /* Update change_attribute */ + inode_set_max_iversion_raw(inode, iinfo->change_attr); + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + new_issued = ~issued & info_caps; + + __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); + +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && + ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); - dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - from_kuid(&init_user_ns, inode->i_uid), - from_kgid(&init_user_ns, inode->i_gid)); + doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, + ceph_vinop(inode), inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); + ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); + ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + /* directories have fl_stripe_unit set to zero */ + if (IS_ENCRYPTED(inode)) + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + else if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - /* 
only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); - } - - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec64(&atime, &info->atime); + ceph_decode_timespec64(&mtime, &info->mtime); + ceph_decode_timespec64(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) { + ci->i_files = le64_to_cpu(info->files); + ci->i_subdirs = le64_to_cpu(info->subdirs); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns); + + if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + pool_ns = old_ns; + + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn_client(cl, + "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + doutc(cl, "max_size %lld -> %llu\n", + ci->i_max_size, le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } + + /* layout and rstat are not tracked by capability, update them if + * the inode info is from auth mds */ + if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) { + if (S_ISDIR(inode->i_mode)) { + ci->i_dir_layout = iinfo->dir_layout; + ci->i_rbytes = le64_to_cpu(info->rbytes); + ci->i_rfiles = le64_to_cpu(info->rfiles); + ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ci->i_dir_pin = iinfo->dir_pin; + ci->i_rsnaps = iinfo->rsnaps; + ceph_decode_timespec64(&ci->i_rctime, &info->rctime); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. 
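The update gating in this function boils down to a pair of bitmask tests: accept the MDS's value only for attributes we were just granted (new_issued) and do not hold exclusive, possibly-dirty caps on. A compact model of that decision (the cap bit values here are illustrative, not the real CEPH_CAP_* encoding):

#include <stdio.h>

#define CAP_AUTH_SHARED 0x01	/* illustrative bit values */
#define CAP_AUTH_EXCL   0x02

/* Should the client overwrite a local attribute with the MDS copy?
 * Mirrors the "(new_version || (new_issued & SHARED)) && !EXCL"
 * shape used when filling the inode. */
static int take_mds_value(int new_version, int issued, int info_caps)
{
	int new_issued = ~issued & info_caps;	/* caps newly granted */

	return (new_version || (new_issued & CAP_AUTH_SHARED)) &&
	       !(issued & CAP_AUTH_EXCL);	/* EXCL: our copy wins */
}

int main(void)
{
	/* holding AUTH_EXCL: local changes win, MDS value ignored */
	printf("%d\n", take_mds_value(1, CAP_AUTH_EXCL, CAP_AUTH_SHARED));
	/* freshly granted AUTH_SHARED: accept the MDS's copy */
	printf("%d\n", take_mds_value(0, 0, CAP_AUTH_SHARED));
	return 0;
}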
*/ - if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = xattr_blob; if (xattr_blob) memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); + ceph_security_invalidate_secctx(inode); xattr_blob = NULL; } + /* finally update i_version */ + if (le64_to_cpu(info->version) > ci->i_version) + ci->i_version = le64_to_cpu(info->version); + inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: case S_IFBLK: case S_IFCHR: case S_IFSOCK: - init_special_inode(inode, inode->i_mode, inode->i_rdev); + inode->i_blkbits = PAGE_SHIFT; + init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ceph_file_iops; break; case S_IFREG: @@ -675,21 +1222,45 @@ static int fill_inode(struct inode *inode, inode->i_fop = &ceph_file_fops; break; case S_IFLNK: - inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { u32 symlen = iinfo->symlink_len; char *sym; spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) - goto out; + if (IS_ENCRYPTED(inode)) { + if (symlen != i_size_read(inode)) + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + + err = decode_encrypted_symlink(mdsc, iinfo->symlink, + symlen, (u8 **)&sym); + if (err < 0) { + pr_err_client(cl, + "decoding encrypted symlink failed: %d\n", + err); + goto out; + } + symlen = err; + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } else { + if (symlen != i_size_read(inode)) { + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } - err = -ENOMEM; - sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); - if (!sym) - goto out; + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + } spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -697,228 +1268,276 @@ static int fill_inode(struct inode *inode, else kfree(sym); /* lost a race */ } + + if (IS_ENCRYPTED(inode)) { + /* + * Encrypted symlinks need to be decrypted before we can + * cache their targets in i_link. Don't touch it here. + */ + inode->i_op = &ceph_encrypted_symlink_iops; + } else { + inode->i_link = ci->i_symlink; + inode->i_op = &ceph_symlink_iops; + } break; case S_IFDIR: inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; - - ci->i_dir_layout = iinfo->dir_layout; - - ci->i_files = le64_to_cpu(info->files); - ci->i_subdirs = le64_to_cpu(info->subdirs); - ci->i_rbytes = le64_to_cpu(info->rbytes); - ci->i_rfiles = le64_to_cpu(info->rfiles); - ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); - ceph_decode_timespec(&ci->i_rctime, &info->rctime); break; default: - pr_err("fill_inode %llx.%llx BAD mode 0%o\n", - ceph_vinop(inode), inode->i_mode); - } - - /* set dir completion flag? 
*/ - if (S_ISDIR(inode->i_mode) && - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; - } -no_change: - spin_unlock(&ci->i_ceph_lock); - - /* queue truncate if we saw i_size decrease */ - if (queue_trunc) - ceph_queue_vmtruncate(inode); - - /* populate frag tree */ - /* FIXME: move me up, if/when version reflects fragtree changes */ - nsplits = le32_to_cpu(info->fragtree.nsplits); - mutex_lock(&ci->i_fragtree_mutex); - for (i = 0; i < nsplits; i++) { - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); - dout(" frag %x split by %d\n", frag->frag, frag->split_by); + pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode, + ceph_vinop(inode), inode->i_mode); } - mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ - if (info->cap.caps) { + if (info_caps) { if (ceph_snap(inode) == CEPH_NOSNAP) { ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, - le32_to_cpu(info->cap.caps), + info_caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), - info->cap.flags, - caps_reservation); + info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (info_caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + doutc(cl, " marking %p complete (empty)\n", + inode); + i_size_write(inode, 0); + __ceph_dir_set_complete(ci, + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); + } + + wake = true; } else { - spin_lock(&ci->i_ceph_lock); - dout(" %p got snap_caps %s\n", inode, - ceph_cap_string(le32_to_cpu(info->cap.caps))); - ci->i_snap_caps |= le32_to_cpu(info->cap.caps); - if (cap_fmode >= 0) - __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&ci->i_ceph_lock); + doutc(cl, " %p got snap_caps %s\n", inode, + ceph_cap_string(info_caps)); + ci->i_snap_caps |= info_caps; } - } else if (cap_fmode >= 0) { - pr_warning("mds issued no caps on %llx.%llx\n", - ceph_vinop(inode)); - __ceph_get_fmode(ci, cap_fmode); } + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ceph_has_inline_data(ci) && + (locked_page || (info_caps & cache_caps))) + fill_inline = true; + } + + if (cap_fmode >= 0) { + if (!info_caps) + pr_warn_client(cl, "mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_touch_fmode(ci, mdsc, cap_fmode); + } + + spin_unlock(&ci->i_ceph_lock); + + ceph_fscache_register_inode_cookie(inode); + + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); + /* update delegation info? 
*/ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); err = 0; - out: - if (xattr_blob) - ceph_buffer_put(xattr_blob); + if (new_cap) + ceph_put_cap(mdsc, new_cap); + ceph_buffer_put(old_blob); + ceph_buffer_put(xattr_blob); + ceph_put_string(pool_ns); return err; } /* - * caller should hold session s_mutex. + * caller should hold session s_mutex and dentry->d_lock. */ -static void update_dentry_lease(struct dentry *dentry, - struct ceph_mds_reply_lease *lease, - struct ceph_mds_session *session, - unsigned long from_time) +static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + struct ceph_mds_session **old_lease_session) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); + unsigned mask = le16_to_cpu(lease->mask); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; - struct inode *dir; + + doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl); /* only track leases on regular dentries */ - if (dentry->d_op != &ceph_dentry_ops) + if (ceph_snap(dir) != CEPH_NOSNAP) return; - spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p duration %lu ms ttl %lu\n", - dentry, duration, ttl); - - /* make lease_rdcache_gen match directory */ - dir = dentry->d_parent->d_inode; - di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - - if (duration == 0) - goto out_unlock; + if (mask & CEPH_LEASE_PRIMARY_LINK) + di->flags |= CEPH_DENTRY_PRIMARY_LINK; + else + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; - if (di->lease_gen == session->s_cap_gen && - time_before(ttl, dentry->d_time)) - goto out_unlock; /* we already have a newer lease. */ + di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); + if (!(mask & CEPH_LEASE_VALID)) { + __ceph_dentry_dir_lease_touch(di); + return; + } - if (di->lease_session && di->lease_session != session) - goto out_unlock; + if (di->lease_gen == atomic_read(&session->s_cap_gen) && + time_before(ttl, di->time)) + return; /* we already have a newer lease. */ - ceph_dentry_lru_touch(dentry); + if (di->lease_session && di->lease_session != session) { + *old_lease_session = di->lease_session; + di->lease_session = NULL; + } if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); - di->lease_gen = session->s_cap_gen; + di->lease_gen = atomic_read(&session->s_cap_gen); di->lease_seq = le32_to_cpu(lease->seq); di->lease_renew_after = half_ttl; di->lease_renew_from = 0; - dentry->d_time = ttl; -out_unlock: + di->time = ttl; + + __ceph_dentry_lease_touch(di); +} + +static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time) +{ + struct ceph_mds_session *old_lease_session = NULL; + spin_lock(&dentry->d_lock); + __update_dentry_lease(dir, dentry, lease, session, from_time, + &old_lease_session); spin_unlock(&dentry->d_lock); - return; + ceph_put_mds_session(old_lease_session); } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. 
+ * update dentry lease without having parent inode locked */ -static void ceph_set_dentry_offset(struct dentry *dn) +static void update_dentry_lease_careful(struct dentry *dentry, + struct ceph_mds_reply_lease *lease, + struct ceph_mds_session *session, + unsigned long from_time, + char *dname, u32 dname_len, + struct ceph_vino *pdvino, + struct ceph_vino *ptvino) + { - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; + struct inode *dir; + struct ceph_mds_session *old_lease_session = NULL; - BUG_ON(!inode); + spin_lock(&dentry->d_lock); + /* make sure dentry's name matches target */ + if (dentry->d_name.len != dname_len || + memcmp(dentry->d_name.name, dname, dname_len)) + goto out_unlock; - ci = ceph_inode(inode); - di = ceph_dentry(dn); + dir = d_inode(dentry->d_parent); + /* make sure parent matches dvino */ + if (!ceph_ino_compare(dir, pdvino)) + goto out_unlock; - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; + /* make sure dentry's inode matches target. NULL ptvino means that + * we expect a negative dentry */ + if (ptvino) { + if (d_really_is_negative(dentry)) + goto out_unlock; + if (!ceph_ino_compare(d_inode(dentry), ptvino)) + goto out_unlock; + } else { + if (d_really_is_positive(dentry)) + goto out_unlock; } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); + __update_dentry_lease(dir, dentry, lease, session, + from_time, &old_lease_session); +out_unlock: + spin_unlock(&dentry->d_lock); + ceph_put_mds_session(old_lease_session); } /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. - * - * we will only rehash the resulting dentry if @prehash is - * true; @prehash will be set to false (for the benefit of - * the caller) if we fail. + * caller must hold directory i_rwsem for this to be safe. */ -static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) +static int splice_dentry(struct dentry **pdn, struct inode *in) { + struct ceph_client *cl = ceph_inode_to_client(in); + struct dentry *dn = *pdn; struct dentry *realdn; - BUG_ON(dn->d_inode); + BUG_ON(d_inode(dn)); + + if (S_ISDIR(in->i_mode)) { + /* If inode is directory, d_splice_alias() below will remove + * 'realdn' from its origin parent. 
We need to ensure that + * origin parent's readdir cache will not reference 'realdn' + */ + realdn = d_find_any_alias(in); + if (realdn) { + struct ceph_dentry_info *di = ceph_dentry(realdn); + spin_lock(&realdn->d_lock); + + realdn->d_op->d_prune(realdn); + + di->time = jiffies; + di->lease_shared_gen = 0; + di->offset = 0; + + spin_unlock(&realdn->d_lock); + dput(realdn); + } + } /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); - realdn = d_materialise_unique(dn, in); + realdn = d_splice_alias(in, dn); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", - PTR_ERR(realdn), dn, in, ceph_vinop(in)); - if (prehash) - *prehash = false; /* don't rehash on error */ - dn = realdn; /* note realdn contains the error */ - goto out; - } else if (realdn) { - dout("dn %p (%d) spliced with %p (%d) " - "inode %p ino %llx.%llx\n", - dn, d_count(dn), - realdn, d_count(realdn), - realdn->d_inode, ceph_vinop(realdn->d_inode)); + pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); + return PTR_ERR(realdn); + } + + if (realdn) { + doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n", + dn, d_count(dn), realdn, d_count(realdn), + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); - dn = realdn; + *pdn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode)); + doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn, + d_inode(dn), ceph_vinop(d_inode(dn))); } - if ((!prehash || *prehash) && d_unhashed(dn)) - d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); -out: - return dn; + return 0; } /* @@ -932,81 +1551,153 @@ out: * * Called with snap_rwsem (read). */ -int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, - struct ceph_mds_session *session) +int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) { + struct ceph_mds_session *session = req->r_session; struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; - struct ceph_vino vino; - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; + struct ceph_vino tvino, dvino; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; - dout("fill_trace %p is_dentry %d is_target %d\n", req, - rinfo->head->is_dentry, rinfo->head->is_target); - -#if 0 - /* - * Debugging hook: - * - * If we resend completed ops to a recovering mds, we get no - * trace. Since that is very rare, pretend this is the case - * to ensure the 'no trace' handlers in the callers behave. - * - * Fill in inodes unconditionally to avoid breaking cap - * invariants. 
- */ - if (rinfo->head->op & CEPH_MDS_OP_WRITE) { - pr_info("fill_trace faking empty trace on %lld %s\n", - req->r_tid, ceph_mds_op_name(rinfo->head->op)); - if (rinfo->head->is_dentry) { - rinfo->head->is_dentry = 0; - err = fill_inode(req->r_locked_dir, - &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1); - } - if (rinfo->head->is_target) { - rinfo->head->is_target = 0; - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - err = fill_inode(in, &rinfo->targeti, NULL, - session, req->r_request_started, - req->r_fmode); - iput(in); - } - } -#endif + doutc(cl, "%p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); if (!rinfo->head->is_target && !rinfo->head->is_dentry) { - dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) + doutc(cl, "reply is empty!\n"); + if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_locked_dir; - - if (dir) { - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1, - &req->r_caps_reservation); + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, + rinfo->dirfrag, session, -1, + &req->r_caps_reservation); if (err < 0) - return err; + goto done; } else { WARN_ON_ONCE(1); } + + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + bool is_nokey = false; + struct qstr dname; + struct dentry *dn, *parent; + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = parent_dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(parent_dir); + BUG_ON(!parent); + + err = ceph_fname_alloc_buffer(parent_dir, &oname); + if (err < 0) { + dput(parent); + goto done; + } + + err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); + if (err < 0) { + dput(parent); + ceph_fname_free_buffer(parent_dir, &oname); + goto done; + } + dname.name = oname.name; + dname.len = oname.len; + dname.hash = full_name_hash(parent, dname.name, dname.len); + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (!dn) { + dput(parent); + ceph_fname_free_buffer(parent_dir, &oname); + err = -ENOMEM; + goto done; + } + if (is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } + err = 0; + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + ceph_dir_clear_ordered(parent_dir); + d_delete(dn); + 
dput(dn); + goto retry_lookup; + } + ceph_fname_free_buffer(parent_dir, &oname); + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + /* Should be filled in by handle_reply */ + BUG_ON(!req->r_target_inode); + + in = req->r_target_inode; + err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, + NULL, session, + (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && + rinfo->head->result == 0) ? req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err_client(cl, "badness %p %llx.%llx\n", in, + ceph_vinop(in)); + req->r_target_inode = NULL; + if (inode_state_read_once(in) & I_NEW) + discard_new_inode(in); + else + iput(in); + goto done; + } + if (inode_state_read_once(in) & I_NEW) + unlock_new_inode(in); } /* * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later */ - if (rinfo->head->is_dentry && !req->r_aborted && - req->r_locked_dir && + if (rinfo->head->is_dentry && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { @@ -1015,17 +1706,19 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * mknod symlink mkdir : null -> new inode * unlink : linked -> null */ - struct inode *dir = req->r_locked_dir; + struct inode *dir = req->r_parent; struct dentry *dn = req->r_dentry; bool have_dir_cap, have_lease; BUG_ON(!dn); BUG_ON(!dir); - BUG_ON(dn->d_parent->d_inode != dir); - BUG_ON(ceph_ino(dir) != - le64_to_cpu(rinfo->diri.in->ino)); - BUG_ON(ceph_snap(dir) != - le64_to_cpu(rinfo->diri.in->snapid)); + BUG_ON(d_inode(dn->d_parent) != dir); + + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + BUG_ON(ceph_ino(dir) != dvino.ino); + BUG_ON(ceph_snap(dir) != dvino.snap); /* do we have a lease on the whole dir? */ have_dir_cap = @@ -1036,157 +1729,130 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, have_lease = have_dir_cap || le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) - dout("fill_trace no dentry lease or dir cap\n"); + doutc(cl, "no dentry lease or dir cap\n"); /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { - dout(" src %p '%.*s' dst %p '%.*s'\n", - req->r_old_dentry, - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, - dn, dn->d_name.len, dn->d_name.name); - dout("fill_trace doing d_move %p -> %p\n", - req->r_old_dentry, dn); + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); + doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); d_move(req->r_old_dentry, dn); - dout(" src %p '%.*s' dst %p '%.*s'\n", - req->r_old_dentry, - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, - dn, dn->d_name.len, dn->d_name.name); + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. 
We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, - ceph_dentry(req->r_old_dentry)->offset); + doutc(cl, "dn %p gets new offset %lld\n", + req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); - dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; + /* swap r_dentry and r_old_dentry in case that + * splice_dentry() gets called later. This is safe + * because no other place will use them */ + req->r_dentry = req->r_old_dentry; + req->r_old_dentry = dn; + dn = req->r_dentry; } /* null dentry? */ if (!rinfo->head->is_target) { - dout("fill_trace null dentry\n"); - if (dn->d_inode) { - dout("d_delete %p\n", dn); + doutc(cl, "null dentry\n"); + if (d_really_is_positive(dn)) { + doutc(cl, "d_delete %p\n", dn); + ceph_dir_clear_ordered(dir); d_delete(dn); - } else { - dout("d_instantiate %p NULL\n", dn); - d_instantiate(dn, NULL); - if (have_lease && d_unhashed(dn)) - d_rehash(dn); - update_dentry_lease(dn, rinfo->dlease, - session, - req->r_request_started); + } else if (have_lease) { + if (d_unhashed(dn)) + d_add(dn, NULL); } + + if (!d_unhashed(dn) && have_lease) + update_dentry_lease(dir, dn, + rinfo->dlease, session, + req->r_request_started); + goto done; + } + + if (unlikely(!in)) { + err = -EINVAL; goto done; } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_drop(dn); - goto done; - } - dn = splice_dentry(dn, in, &have_lease, true); - if (IS_ERR(dn)) { - err = PTR_ERR(dn); - goto done; - } - req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { + if (d_really_is_negative(dn)) { + ceph_dir_clear_ordered(dir); ihold(in); - } else { - dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + err = splice_dentry(&req->r_dentry, in); + if (err < 0) + goto done; + dn = req->r_dentry; /* may have spliced */ + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { + doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn)), + ceph_vinop(in)); + d_invalidate(dn); have_lease = false; - in = NULL; } - if (have_lease) - update_dentry_lease(dn, rinfo->dlease, session, + if (have_lease) { + update_dentry_lease(dir, dn, + rinfo->dlease, session, req->r_request_started); - dout(" final dn %p\n", dn); - i++; + } + doutc(cl, " final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { - struct dentry *dn = req->r_dentry; + req->r_op == CEPH_MDS_OP_MKSNAP) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + struct inode *dir = req->r_parent; /* fill out a snapdir LOOKUPSNAP dentry */ - BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - 
d_delete(dn); - goto done; - } - dout(" linking snapped dir %p to dn %p\n", in, dn); - dn = splice_dentry(dn, in, NULL, true); - if (IS_ERR(dn)) { - err = PTR_ERR(dn); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); + BUG_ON(!req->r_dentry); + doutc(cl, " linking snapped dir %p to dn %p\n", in, + req->r_dentry); + ceph_dir_clear_ordered(dir); + + if (unlikely(!in)) { + err = -EINVAL; goto done; } - req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ - } - - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; - - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); + ihold(in); + err = splice_dentry(&req->r_dentry, in); + if (err < 0) goto done; + } else if (rinfo->head->is_dentry && req->r_dentry) { + /* parent inode is not locked, be careful */ + struct ceph_vino *ptvino = NULL; + dvino.ino = le64_to_cpu(rinfo->diri.in->ino); + dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); + if (rinfo->head->is_target) { + tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); + tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + ptvino = &tvino; } + update_dentry_lease_careful(req->r_dentry, rinfo->dlease, + session, req->r_request_started, + rinfo->dname, rinfo->dname_len, + &dvino, ptvino); } - done: - dout("fill_trace done err=%d\n", err); + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); + doutc(cl, "done err=%d\n", err); return err; } @@ -1197,297 +1863,393 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); - in = ceph_get_inode(req->r_dentry->d_sb, vino); + in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); - dout("new_inode badness got %d\n", err); + doutc(cl, "badness got %d\n", err); continue; } - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation); + rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); if (rc < 0) { - pr_err("fill_inode badness on %p got %d\n", in, rc); + pr_err_client(cl, "inode badness on %p got %d\n", in, + rc); err = rc; - continue; + if (inode_state_read_once(in) & I_NEW) { + ihold(in); + discard_new_inode(in); + } + } else if (inode_state_read_once(in) & I_NEW) { + unlock_new_inode(in); } + + iput(in); } return err; } +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->folio) { + 
folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_client *cl = ceph_inode_to_client(dir); + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->folio || pgoff != ctl->folio->index) { + ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + + if (idx == 0) + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; + ctl->index = -1; + return idx == 0 ? err : 0; + } + /* reading/filling the cache are serialized by + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); + if (idx == 0) + memset(ctl->dentries, 0, PAGE_SIZE); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + doutc(cl, "dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + doutc(cl, "disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct inode *inode = d_inode(parent); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; - struct inode *snapdir = NULL; - struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); - struct ceph_dentry_info *di; + int err = 0, skipped = 0, ret, i; + u32 frag = le32_to_cpu(req->r_args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; + struct ceph_readdir_cache_control cache_ctl = {}; - if (req->r_aborted) + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(req->r_args.readdir.offset_hash); + } + } + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + doutc(cl, "got new frag %x -> %x\n", frag, + le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (!rinfo->hash_order) + req->r_readdir_offset = 2; + } + if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(parent->d_inode); - parent = d_find_alias(snapdir); - dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); } else { - dout("readdir_prepopulate %d items under dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) - ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); + + if 
(ceph_frag_is_leftmost(frag) && + req->r_readdir_offset == 2 && + !(rinfo->hash_order && last_hash)) { + /* note dir version at start of readdir so we can + * tell if any dentries get dropped */ + req->r_dir_release_cnt = + atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = + atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } } - for (i = 0; i < rinfo->dir_nr; i++) { - struct ceph_vino vino; - - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; - dname.hash = full_name_hash(dname.name, dname.len); + cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + /* FIXME: release caps/leases if error occurs */ + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; + struct ceph_vino tvino; + + dname.name = rde->name; + dname.len = rde->name_len; + dname.hash = full_name_hash(parent, dname.name, dname.len); + + tvino.ino = le64_to_cpu(rde->inode.in->ino); + tvino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_frag_value(rde->raw_hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } retry_lookup: dn = d_lookup(parent, &dname); - dout("d_lookup on parent=%p name=%.*s got %p\n", - parent, dname.len, dname.name, dn); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); if (!dn) { dn = d_alloc(parent, &dname); - dout("d_alloc %p '%.*s' = %p\n", parent, - dname.len, dname.name, dn); - if (dn == NULL) { - dout("d_alloc badness\n"); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (!dn) { + doutc(cl, "d_alloc badness\n"); err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { - dput(dn); - goto out; + if (rde->is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != tvino.ino || + ceph_snap(d_inode(dn)) != tvino.snap)) { + struct ceph_dentry_info *di = ceph_dentry(dn); + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); + + spin_lock(&dn->d_lock); + if (di->offset > 0 && + di->lease_shared_gen == + atomic_read(&ci->i_shared_gen)) { + __ceph_dir_clear_ordered(ci); + di->offset = 0; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { - dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + spin_unlock(&dn->d_lock); + d_delete(dn); dput(dn); goto retry_lookup; - } else { - /* reorder parent's d_subdirs */ - spin_lock(&parent->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &parent->d_subdirs); - spin_unlock(&dn->d_lock); - spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - /* inode */ - if (dn->d_inode) { - in = dn->d_inode; + if (d_really_is_positive(dn)) { + in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, vino); + in = ceph_get_inode(parent->d_sb, tvino, NULL); if (IS_ERR(in)) { - dout("new_inode badness\n"); + doutc(cl, "new_inode badness\n"); d_drop(dn); dput(dn); err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - 
dn = NULL; } - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation) < 0) { - pr_err("fill_inode badness on %p\n", in); + ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); + if (ret < 0) { + pr_err_client(cl, "badness on %p %llx.%llx\n", in, + ceph_vinop(in)); + if (d_really_is_negative(dn)) { + if (inode_state_read_once(in) & I_NEW) { + ihold(in); + discard_new_inode(in); + } + iput(in); + } + d_drop(dn); + err = ret; goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + if (inode_state_read_once(in) & I_NEW) + unlock_new_inode(in); + + if (d_really_is_negative(dn)) { + if (ceph_security_xattr_deadlock(in)) { + doutc(cl, " skip splicing dn %p to inode %p" + " (security xattr deadlock)\n", dn, in); + iput(in); + skipped++; + goto next_item; + } + + err = splice_dentry(&dn, in); + if (err < 0) + goto next_item; + } + + ceph_dentry(dn)->offset = rde->offset; + + update_dentry_lease(d_inode(parent), dn, + rde->lease, req->r_session, + req->r_request_started); + + if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } next_item: - if (dn) - dput(dn); + dput(dn); } - req->r_did_prepopulate = true; - out: - if (snapdir) { - iput(snapdir); - dput(parent); + if (err == 0 && skipped == 0) { + set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); + req->r_readdir_cache_idx = cache_ctl.index; } - dout("readdir_prepopulate done\n"); + ceph_readdir_cache_release(&cache_ctl); + doutc(cl, "done\n"); return err; } -int ceph_inode_set_size(struct inode *inode, loff_t size) +bool ceph_inode_set_size(struct inode *inode, loff_t size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - int ret = 0; + bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); + i_size_write(inode, size); + ceph_fscache_update(inode); + inode->i_blocks = calc_inode_blocks(size); - /* tell the MDS if we are approaching max_size */ - if ((size << 1) >= ci->i_max_size && - (ci->i_reported_size << 1) < ci->i_max_size) - ret = 1; + ret = __ceph_should_report_size(ci); spin_unlock(&ci->i_ceph_lock); - return ret; -} -/* - * Write back inode data in a worker thread. (This can't be done - * in the message handler context.) 
- */ -void ceph_queue_writeback(struct inode *inode) -{ - ihold(inode); - if (queue_work(ceph_inode_to_client(inode)->wb_wq, - &ceph_inode(inode)->i_wb_work)) { - dout("ceph_queue_writeback %p\n", inode); - } else { - dout("ceph_queue_writeback %p failed\n", inode); - iput(inode); - } + return ret; } -static void ceph_writeback_work(struct work_struct *work) +void ceph_queue_inode_work(struct inode *inode, int work_bit) { - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_wb_work); - struct inode *inode = &ci->vfs_inode; - - dout("writeback %p\n", inode); - filemap_fdatawrite(&inode->i_data); - iput(inode); -} + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_inode_info *ci = ceph_inode(inode); + set_bit(work_bit, &ci->i_work_mask); -/* - * queue an async invalidation - */ -void ceph_queue_invalidate(struct inode *inode) -{ ihold(inode); - if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, - &ceph_inode(inode)->i_pg_inv_work)) { - dout("ceph_queue_invalidate %p\n", inode); + if (queue_work(fsc->inode_wq, &ci->i_work)) { + doutc(cl, "%p %llx.%llx mask=%lx\n", inode, + ceph_vinop(inode), ci->i_work_mask); } else { - dout("ceph_queue_invalidate %p failed\n", inode); + doutc(cl, "%p %llx.%llx already queued, mask=%lx\n", + inode, ceph_vinop(inode), ci->i_work_mask); iput(inode); } } -/* - * Invalidate inode pages in a worker thread. (This can't be done - * in the message handler context.) - */ -static void ceph_invalidate_work(struct work_struct *work) +static void ceph_do_invalidate_pages(struct inode *inode) { - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_pg_inv_work); - struct inode *inode = &ci->vfs_inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); u32 orig_gen; int check = 0; + ceph_fscache_invalidate(inode, false); + + mutex_lock(&ci->i_truncate_mutex); + + if (ceph_inode_is_shutdown(inode)) { + pr_warn_ratelimited_client(cl, + "%p %llx.%llx is shut down\n", inode, + ceph_vinop(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); - dout("invalidate_pages %p gen %d revoking %d\n", inode, - ci->i_rdcache_gen, ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! 
*/ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); goto out; } orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(&inode->i_data, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n", + ceph_vinop(inode)); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && orig_gen == ci->i_rdcache_revoking) { - dout("invalidate_pages %p gen %d successful\n", inode, - ci->i_rdcache_gen); + doutc(cl, "%p %llx.%llx gen %d successful\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen); ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", - inode, orig_gen, ci->i_rdcache_gen, - ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n", + inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); - - if (check) - ceph_check_caps(ci, 0, NULL); + mutex_unlock(&ci->i_truncate_mutex); out: - iput(inode); -} - - -/* - * called by trunc_wq; - * - * We also truncate in a separate thread as well. - */ -static void ceph_vmtruncate_work(struct work_struct *work) -{ - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_vmtruncate_work); - struct inode *inode = &ci->vfs_inode; - - dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); - iput(inode); -} - -/* - * Queue an async vmtruncate. If we fail to queue work, we will handle - * the truncation the next time we call __ceph_do_pending_vmtruncate. - */ -void ceph_queue_vmtruncate(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - ihold(inode); - if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, - &ci->i_vmtruncate_work)) { - dout("ceph_queue_vmtruncate %p\n", inode); - } else { - dout("ceph_queue_vmtruncate %p failed, pending=%d\n", - inode, ci->i_truncate_pending); - iput(inode); - } + if (check) + ceph_check_caps(ci, 0); } /* @@ -1496,15 +2258,19 @@ void ceph_queue_vmtruncate(struct inode *inode) */ void __ceph_do_pending_vmtruncate(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 to; int wrbuffer_refs, finish = 0; + mutex_lock(&ci->i_truncate_mutex); retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + doutc(cl, "%p %llx.%llx none pending\n", inode, + ceph_vinop(inode)); spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); return; } @@ -1513,24 +2279,28 @@ retry: * possibly truncate them.. so write AND block! 
*/ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); spin_unlock(&ci->i_ceph_lock); + doutc(cl, "%p %llx.%llx flushing snaps first\n", inode, + ceph_vinop(inode)); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; } - to = ci->i_truncate_size; + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, - ci->i_truncate_pending, to); + doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode), + ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, to); + ceph_fscache_resize(inode, to); + truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -1538,107 +2308,381 @@ retry: if (!finish) goto retry; + mutex_unlock(&ci->i_truncate_mutex); + if (wrbuffer_refs == 0) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 0); wake_up_all(&ci->i_cap_wq); } +static void ceph_inode_work(struct work_struct *work) +{ + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_work); + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); -/* - * symlinks - */ -static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) + if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { + doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode)); + filemap_fdatawrite(&inode->i_data); + } + if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) + ceph_do_invalidate_pages(inode); + + if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask)) + __ceph_do_pending_vmtruncate(inode); + + if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) + ceph_check_caps(ci, 0); + + if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) + ceph_flush_snaps(ci, NULL); + + iput(inode); +} + +static const char *ceph_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); - nd_set_link(nd, ci->i_symlink); - return NULL; + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!dentry) + return ERR_PTR(-ECHILD); + + return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), + done); } +static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int ret; + + ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); + if (ret) + return ret; + return fscrypt_symlink_getattr(path, stat); +} + +/* + * symlinks + */ static const struct inode_operations ceph_symlink_iops = { - .readlink = generic_readlink, - .follow_link = ceph_sym_follow_link, + .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = ceph_setxattr, - .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, - .removexattr = ceph_removexattr, +}; + +static const struct inode_operations ceph_encrypted_symlink_iops = { + .get_link = ceph_encrypted_get_link, + .setattr = ceph_setattr, + .getattr = ceph_encrypted_symlink_getattr, + .listxattr = ceph_listxattr, }; /* - * setattr + * Transfer the encrypted last block to 
the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. */ -int ceph_setattr(struct dentry *dentry, struct iattr *attr) +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n", + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. 
+ */ + if (!objver) { + doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff, + CEPH_FSCRYPT_BLOCK_SIZE); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode, + ceph_vinop(inode), ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + +int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, + struct iattr *attr, struct ceph_iattr *cia) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ + struct dentry *dentry; + char *path; + bool do_sync = false; + + dentry = d_find_alias(inode); + if (!dentry) { + do_sync = true; + } else { + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); + if (IS_ERR(path)) { + do_sync = true; + err = 0; + } else { + err = ceph_mds_check_access(mdsc, path, MAY_WRITE); + } + ceph_mdsc_free_path_info(&path_info); + dput(dentry); + + /* For none EACCES cases will let the MDS do the mds auth check */ + if (err == -EACCES) { + return err; + } else if (err < 0) { + do_sync = true; + err = 0; + } + } - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - - __ceph_do_pending_vmtruncate(inode); - - err = inode_change_ok(inode, attr); - if (err != 0) - return err; +retry: + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, USE_AUTH_MDS); - if (IS_ERR(req)) + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); return PTR_ERR(req); + } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); - dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + 
			spin_lock(&ci->i_ceph_lock);
+			issued = __ceph_caps_issued(ci, NULL);
+		}
+	}
+
+	doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
+	      ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	if (cia && cia->fscrypt_auth) {
+		u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+		if (len > sizeof(*cia->fscrypt_auth)) {
+			err = -EINVAL;
+			spin_unlock(&ci->i_ceph_lock);
+			goto out;
+		}
+
+		doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
+		      ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+		/* It should never be re-set once set */
+		WARN_ON_ONCE(ci->fscrypt_auth);
+
+		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+			kfree(ci->fscrypt_auth);
+			ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+			ci->fscrypt_auth_len = len;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   ci->fscrypt_auth_len != len ||
+			   memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+			req->r_fscrypt_auth = cia->fscrypt_auth;
+			mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+		cia->fscrypt_auth = NULL;
+	}
+#else
+	if (cia && cia->fscrypt_auth) {
+		err = -EINVAL;
+		spin_unlock(&ci->i_ceph_lock);
+		goto out;
+	}
+#endif /* CONFIG_FS_ENCRYPTION */

 	if (ia_valid & ATTR_UID) {
-		dout("setattr %p uid %d -> %d\n", inode,
-		     from_kuid(&init_user_ns, inode->i_uid),
-		     from_kuid(&init_user_ns, attr->ia_uid));
-		if (issued & CEPH_CAP_AUTH_EXCL) {
-			inode->i_uid = attr->ia_uid;
+		kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
+
+		doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
+		      ceph_vinop(inode),
+		      from_kuid(&init_user_ns, inode->i_uid),
+		      from_kuid(&init_user_ns, attr->ia_uid));
+		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+			inode->i_uid = fsuid;
 			dirtied |= CEPH_CAP_AUTH_EXCL;
 		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   !uid_eq(attr->ia_uid, inode->i_uid)) {
+			   !uid_eq(fsuid, inode->i_uid)) {
 			req->r_args.setattr.uid = cpu_to_le32(
-				from_kuid(&init_user_ns, attr->ia_uid));
+				from_kuid(&init_user_ns, fsuid));
 			mask |= CEPH_SETATTR_UID;
 			release |= CEPH_CAP_AUTH_SHARED;
 		}
 	}
 	if (ia_valid & ATTR_GID) {
-		dout("setattr %p gid %d -> %d\n", inode,
-		     from_kgid(&init_user_ns, inode->i_gid),
-		     from_kgid(&init_user_ns, attr->ia_gid));
-		if (issued & CEPH_CAP_AUTH_EXCL) {
-			inode->i_gid = attr->ia_gid;
+		kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
+
+		doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
+		      ceph_vinop(inode),
+		      from_kgid(&init_user_ns, inode->i_gid),
+		      from_kgid(&init_user_ns, attr->ia_gid));
+		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
+			inode->i_gid = fsgid;
 			dirtied |= CEPH_CAP_AUTH_EXCL;
 		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   !gid_eq(attr->ia_gid, inode->i_gid)) {
+			   !gid_eq(fsgid, inode->i_gid)) {
 			req->r_args.setattr.gid = cpu_to_le32(
-				from_kgid(&init_user_ns, attr->ia_gid));
+				from_kgid(&init_user_ns, fsgid));
 			mask |= CEPH_SETATTR_GID;
 			release |= CEPH_CAP_AUTH_SHARED;
 		}
 	}
 	if (ia_valid & ATTR_MODE) {
-		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
-		     attr->ia_mode);
-		if (issued & CEPH_CAP_AUTH_EXCL) {
+		doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
+		      ceph_vinop(inode), inode->i_mode, attr->ia_mode);
+		if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
 			inode->i_mode = attr->ia_mode;
 			dirtied |= CEPH_CAP_AUTH_EXCL;
 		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
 			   attr->ia_mode != inode->i_mode) {
+			inode->i_mode = attr->ia_mode;
 			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
 			mask |= CEPH_SETATTR_MODE;
 			release |= CEPH_CAP_AUTH_SHARED;
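
The ATTR_UID, ATTR_GID and ATTR_MODE hunks above all instantiate one cap-driven pattern: when the client holds CEPH_CAP_AUTH_EXCL (and is not forced to go synchronous), the change is applied to the in-core inode and the exclusive cap is marked dirty; otherwise, if CEPH_CAP_AUTH_SHARED is missing or the cached value differs, the new value is packed into the SETATTR request for the MDS and the shared cap is released. A minimal sketch of that three-way decision; the names setattr_route and route_attr_change and the placeholder cap bits are illustrative only and appear nowhere in the patch:

	#include <stdbool.h>

	/* Placeholder cap bits; the kernel's CEPH_CAP_AUTH_* values differ. */
	#define AUTH_SHARED	0x1	/* "As": cached attributes are valid */
	#define AUTH_EXCL	0x2	/* "Ax": client may mutate attributes locally */

	enum setattr_route {
		ROUTE_LOCAL,	/* holding Ax: update the inode, dirty the cap */
		ROUTE_MDS,	/* no usable cap or value differs: ask the MDS */
		ROUTE_NOOP,	/* As held and value already matches: nothing to do */
	};

	static enum setattr_route route_attr_change(int issued, bool do_sync,
						    bool value_differs)
	{
		if (!do_sync && (issued & AUTH_EXCL))
			return ROUTE_LOCAL;
		if (!(issued & AUTH_SHARED) || value_differs)
			return ROUTE_MDS;
		return ROUTE_NOOP;
	}

The same shape recurs in the ATTR_ATIME and ATTR_MTIME hunks below, with CEPH_CAP_FILE_EXCL and CEPH_CAP_FILE_WR standing in for the auth caps and a timespec64 comparison supplying value_differs.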
@@ -1646,84 +2690,111 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 	if (ia_valid & ATTR_ATIME) {
-		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
-		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
-		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
-		if (issued & CEPH_CAP_FILE_EXCL) {
+		struct timespec64 atime = inode_get_atime(inode);
+
+		doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
+		      inode, ceph_vinop(inode), &atime, &attr->ia_atime);
+		if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
 			ci->i_time_warp_seq++;
-			inode->i_atime = attr->ia_atime;
+			inode_set_atime_to_ts(inode, attr->ia_atime);
 			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_WR) &&
-			   timespec_compare(&inode->i_atime,
-					    &attr->ia_atime) < 0) {
-			inode->i_atime = attr->ia_atime;
+		} else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
+			   timespec64_compare(&atime,
+					      &attr->ia_atime) < 0) {
+			inode_set_atime_to_ts(inode, attr->ia_atime);
 			dirtied |= CEPH_CAP_FILE_WR;
 		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
-			ceph_encode_timespec(&req->r_args.setattr.atime,
-					     &attr->ia_atime);
+			   !timespec64_equal(&atime, &attr->ia_atime)) {
+			ceph_encode_timespec64(&req->r_args.setattr.atime,
+					       &attr->ia_atime);
 			mask |= CEPH_SETATTR_ATIME;
-			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
-				CEPH_CAP_FILE_WR;
+			release |= CEPH_CAP_FILE_SHARED |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_SIZE) {
+		doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
+		      ceph_vinop(inode), isize, attr->ia_size);
+		/*
+		 * Only when the new size is smaller and not aligned to
+		 * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
+		 */
+		if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+		    (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+			set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+			mask |= CEPH_SETATTR_FSCRYPT_FILE;
+			req->r_args.setattr.size =
+				cpu_to_le64(round_up(attr->ia_size,
+						     CEPH_FSCRYPT_BLOCK_SIZE));
+			req->r_args.setattr.old_size =
+				cpu_to_le64(round_up(isize,
+						     CEPH_FSCRYPT_BLOCK_SIZE));
+			req->r_fscrypt_file = attr->ia_size;
+			fill_fscrypt = true;
+		} else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+			if (attr->ia_size > isize) {
+				i_size_write(inode, attr->ia_size);
+				inode->i_blocks = calc_inode_blocks(attr->ia_size);
+				ci->i_reported_size = attr->ia_size;
+				dirtied |= CEPH_CAP_FILE_EXCL;
+				ia_valid |= ATTR_MTIME;
+			}
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != isize) {
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+			if (IS_ENCRYPTED(inode) && attr->ia_size) {
+				set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+				mask |= CEPH_SETATTR_FSCRYPT_FILE;
+				req->r_args.setattr.size =
+					cpu_to_le64(round_up(attr->ia_size,
+							     CEPH_FSCRYPT_BLOCK_SIZE));
+				req->r_args.setattr.old_size =
+					cpu_to_le64(round_up(isize,
+							     CEPH_FSCRYPT_BLOCK_SIZE));
+				req->r_fscrypt_file = attr->ia_size;
+			} else {
+				req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+				req->r_args.setattr.old_size = cpu_to_le64(isize);
+				req->r_fscrypt_file = 0;
+			}
 		}
 	}
 	if (ia_valid & ATTR_MTIME) {
-		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
-		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
-		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
-		if (issued & CEPH_CAP_FILE_EXCL) {
+		struct
timespec64 mtime = inode_get_mtime(inode); + + doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n", + inode, ceph_vinop(inode), &mtime, &attr->ia_mtime); + if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) { ci->i_time_warp_seq++; - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_WR) && - timespec_compare(&inode->i_mtime, - &attr->ia_mtime) < 0) { - inode->i_mtime = attr->ia_mtime; + } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) && + timespec64_compare(&mtime, &attr->ia_mtime) < 0) { + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) { - ceph_encode_timespec(&req->r_args.setattr.mtime, - &attr->ia_mtime); + !timespec64_equal(&mtime, &attr->ia_mtime)) { + ceph_encode_timespec64(&req->r_args.setattr.mtime, + &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR; - } - } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if (attr->ia_size > inode->i_sb->s_maxbytes) { - err = -EINVAL; - goto out; - } - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; - inode->i_ctime = attr->ia_ctime; - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR; + release |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } /* these do nothing */ if (ia_valid & ATTR_CTIME) { + struct timespec64 ictime = inode_get_ctime(inode); bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; - dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, - only ? "ctime only" : "ignored"); - inode->i_ctime = attr->ia_ctime; + doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n", + inode, ceph_vinop(inode), &ictime, &attr->ia_ctime, + only ? "ctime only" : "ignored"); if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -1741,15 +2812,22 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) } } if (ia_valid & ATTR_FILE) - dout("setattr %p ATTR_FILE ... hrm!\n", inode); + doutc(cl, "%p %llx.%llx ATTR_FILE ... 
hrm!\n", inode, + ceph_vinop(inode)); if (dirtied) { - inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); - inode->i_ctime = CURRENT_TIME; + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); + inode_set_ctime_to_ts(inode, attr->ia_ctime); + inode_inc_iversion_raw(inode); } release &= issued; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) { + up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1760,52 +2838,215 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. + */ + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n", + inode, ceph_vinop(inode), err, + ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } - dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, - ceph_cap_string(dirtied), mask); +out: + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode, + ceph_vinop(inode), err, ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; -out: - spin_unlock(&ci->i_ceph_lock); - ceph_mdsc_put_request(req); +} + +/* + * setattr + */ +int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = setattr_prepare(idmap, dentry, attr); + if (err != 0) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) + return -EFBIG; + + if ((attr->ia_valid & ATTR_SIZE) && + ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) + return -EDQUOT; + + err = __ceph_setattr(idmap, inode, attr, NULL); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(idmap, dentry, attr->ia_mode); + return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. And after that the lock state + * will change back. 
+ * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + * + * And for the 'Xs' caps for getxattr we will also choose the + * auth MDS, because the MDS side code is buggy due to setxattr + * won't notify the replica MDSes when the values changed and + * the replica MDS will return the old values. Though we will + * fix it in MDS code, but this still makes sense for old ceph. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ -int ceph_do_getattr(struct inode *inode, int mask) +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int mode; int err; if (ceph_snap(inode) == CEPH_SNAPDIR) { - dout("do_getattr inode %p SNAPDIR\n", inode); + doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode, + ceph_vinop(inode)); return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); - if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) - return 0; + doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode, + ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode); + if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) + return 0; - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = ceph_try_to_choose_auth_mds(inode, mask); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE || + inline_version == 1) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } ceph_mdsc_put_request(req); - dout("do_getattr result=%d\n", err); + doutc(cl, "result=%d\n", err); + return err; +} + +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if 
(size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + doutc(cl, "result=%d\n", err); return err; } @@ -1814,48 +3055,162 @@ int ceph_do_getattr(struct inode *inode, int mask) * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) { int err; if (mask & MAY_NOT_BLOCK) return -ECHILD; - err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(idmap, inode, mask); return err; } +/* Craft a mask of needed caps given a set of requested statx attrs. */ +static int statx_to_caps(u32 want, umode_t mode) +{ + int mask = 0; + + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) + mask |= CEPH_CAP_AUTH_SHARED; + + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { + /* + * The link count for directories depends on inode->i_subdirs, + * and that is only updated when Fs caps are held. + */ + if (S_ISDIR(mode)) + mask |= CEPH_CAP_FILE_SHARED; + else + mask |= CEPH_CAP_LINK_SHARED; + } + + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) + mask |= CEPH_CAP_FILE_SHARED; + + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) + mask |= CEPH_CAP_XATTR_SHARED; + + return mask; +} + /* - * Get all attributes. Hopefully somedata we'll have a statlite() - * and can limit the fields we require to be accurate. + * Get all the attributes. If we have sufficient caps for the requested attrs, + * then we can avoid talking to the MDS at all. */ -int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); - int err; + u32 valid_mask = STATX_BASIC_STATS; + int err = 0; - err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(inode); - else - stat->dev = 0; - if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) - stat->size = ci->i_rbytes; + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + /* Skip the getattr altogether if we're asked not to sync */ + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { + err = ceph_do_getattr(inode, + statx_to_caps(request_mask, inode->i_mode), + flags & AT_STATX_FORCE_SYNC); + if (err) + return err; + } + + generic_fillattr(idmap, request_mask, inode, stat); + stat->ino = ceph_present_inode(inode); + + /* + * btime on newly-allocated inodes is 0, so if this is still set to + * that, then assume that it's not valid. 
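The statx_to_caps() mapping above is what gives the statx(2) sync flags their meaning on CephFS: AT_STATX_DONT_SYNC skips the MDS getattr entirely, while AT_STATX_FORCE_SYNC always issues one. A minimal userspace sketch for illustration only (the path argument and the printed field are arbitrary, not part of the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc < 2)
                return 1;
        /* may be stale, but needs no MDS round trip */
        if (statx(AT_FDCWD, argv[1], AT_STATX_DONT_SYNC,
                  STATX_BASIC_STATS | STATX_BTIME, &stx) != 0) {
                perror("statx");
                return 1;
        }
        if (stx.stx_mask & STATX_BTIME)
                printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
        return 0;
}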
+ */ + if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) { + stat->btime = ci->i_btime; + valid_mask |= STATX_BTIME; + } + + if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = sb->s_dev; + else + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + + if (S_ISDIR(inode->i_mode)) { + if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) { + stat->size = ci->i_rbytes; + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (IS_ERR(parent)) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; else - stat->size = ci->i_files + ci->i_subdirs; - stat->blocks = 0; - stat->blksize = 65536; + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { + stat->size = ci->i_files + ci->i_subdirs; } + stat->blocks = 0; + stat->blksize = 65536; + /* + * Some applications rely on the number of st_nlink + * value on directories to be either 0 (if unlinked) + * or 2 + number of subdirectories. + */ + if (stat->nlink == 1) + /* '.' + '..' + subdirs */ + stat->nlink = 1 + 1 + ci->i_subdirs; } + + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | + STATX_ATTR_ENCRYPTED); + + stat->result_mask = request_mask & valid_mask; return err; } + +void ceph_inode_shutdown(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + int iputs = 0; + bool invalidate = false; + + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_SHUTDOWN; + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + + p = rb_next(p); + iputs += ceph_purge_inode_cap(inode, cap, &invalidate); + } + spin_unlock(&ci->i_ceph_lock); + + if (invalidate) + ceph_queue_invalidate(inode); + while (iputs--) + iput(inode); +} diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..2d10f49c93a9 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. 
+ * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/rwsem.h> +#include <linux/fs.h> + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + bool is_odirect; + + lockdep_assert_held_write(&inode->i_rwsem); + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (is_odirect) { + clear_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); + } + spin_unlock(&ci->i_ceph_lock); + + if (is_odirect) + inode_dio_wait(inode); +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +int ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; + + /* Be an optimist! */ + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (!is_odirect) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); + + return 0; +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +int ceph_start_io_write(struct inode *inode) +{ + int err = down_write_killable(&inode->i_rwsem); + if (!err) + ceph_block_o_direct(ceph_inode(inode), inode); + return err; +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. 
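Both slow paths in this file (buffered read vs. O_DIRECT) follow the same take-shared, upgrade, flip-flag, downgrade dance. Below is a condensed editorial sketch of the common shape, illustration only: start_io_mode() and want_odirect are hypothetical names, ceph_block_buffered() is the helper shown further below, and the real helpers recheck the flag under ci->i_ceph_lock.

/* Illustrative sketch, not part of the patch. */
static int start_io_mode(struct inode *inode, bool want_odirect)
{
        struct ceph_inode_info *ci = ceph_inode(inode);
        bool is_odirect;
        int err;

        err = down_read_killable(&inode->i_rwsem);
        if (err)
                return err;
        is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT;
        if (is_odirect == want_odirect)
                return 0;                       /* fast path, lock held shared */

        up_read(&inode->i_rwsem);               /* rwsems cannot upgrade in place */
        err = down_write_killable(&inode->i_rwsem);
        if (err)
                return err;
        if (want_odirect)
                ceph_block_buffered(ci, inode); /* set CEPH_I_ODIRECT, flush pages */
        else
                ceph_block_o_direct(ci, inode); /* clear it, wait for DIO to drain */
        downgrade_write(&inode->i_rwsem);       /* keep it shared for the I/O */
        return 0;
}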
+ */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + bool is_odirect; + + lockdep_assert_held_write(&inode->i_rwsem); + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (!is_odirect) { + set_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); + } + spin_unlock(&ci->i_ceph_lock); + + if (!is_odirect) { + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +int ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; + + /* Be an optimist! */ + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (is_odirect) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path.... */ + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); + + return 0; +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. 
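For context, a hypothetical caller showing how the read helpers bracket an I/O; generic_file_read_iter() stands in here for the real CephFS read path:

/* Illustrative only; not part of the patch. */
static ssize_t example_buffered_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        ret = ceph_start_io_read(inode);        /* shared i_rwsem, O_DIRECT off */
        if (ret)
                return ret;
        ret = generic_file_read_iter(iocb, to);
        ceph_end_io_read(inode);                /* drop the shared i_rwsem */
        return ret;
}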
+ */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..79029825e8b8 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +#include <linux/compiler_attributes.h> + +int __must_check ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +int __must_check ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +int __must_check ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index e0b4ef31d3c8..15cde055f3da 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,11 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" - +#include <linux/ceph/striper.h> +#include <linux/fscrypt.h> /* * ioctls @@ -20,13 +21,13 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) struct ceph_ioctl_layout l; int err; - err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (!err) { - l.stripe_unit = ceph_file_layout_su(ci->i_layout); - l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - l.object_size = ceph_file_layout_object_size(ci->i_layout); - l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); - l.preferred_osd = (s32)-1; + l.stripe_unit = ci->i_layout.stripe_unit; + l.stripe_count = ci->i_layout.stripe_count; + l.object_size = ci->i_layout.object_size; + l.data_pool = ci->i_layout.pool_id; + l.preferred_osd = -1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -42,7 +43,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc, /* validate striping parameters */ if ((l->object_size & ~PAGE_MASK) || (l->stripe_unit & ~PAGE_MASK) || - (l->stripe_unit != 0 && + ((unsigned)l->stripe_unit != 0 && ((unsigned)l->object_size % (unsigned)l->stripe_unit))) return -EINVAL; @@ -64,8 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct inode *parent_inode; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; struct ceph_inode_info *ci = ceph_inode(file_inode(file)); @@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return -EFAULT; /* validate changed params against current layout */ - err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (err) return err; @@ -84,22 +84,22 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) if (l.stripe_count) nl.stripe_count = l.stripe_count; else - nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.stripe_count = ci->i_layout.stripe_count; if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; else - nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_unit = ci->i_layout.stripe_unit; if (l.object_size) nl.object_size = l.object_size; else - nl.object_size = 
ceph_file_layout_object_size(ci->i_layout); + nl.object_size = ci->i_layout.object_size; if (l.data_pool) nl.data_pool = l.data_pool; else - nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ - nl.preferred_osd = le64_to_cpu(-1); + nl.preferred_osd = -1; err = __validate_layout(mdsc, &nl); if (err) @@ -111,6 +111,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -121,9 +123,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } @@ -140,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) @@ -157,6 +157,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); @@ -182,8 +183,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = - &ceph_sb_to_client(inode->i_sb)->client->osdc; - u64 len = 1, olen; + &ceph_sb_to_fs_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + CEPH_DEFINE_OID_ONSTACK(oid); + u32 xlen; u64 tmp; struct ceph_pg pgid; int r; @@ -192,15 +195,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, - &dl.object_no, &dl.object_offset, - &olen); - if (r < 0) - return -EIO; + down_read(&osdc->lock); + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1, + &dl.object_no, &dl.object_offset, &xlen); dl.file_offset -= dl.object_offset; - dl.object_size = ceph_file_layout_object_size(ci->i_layout); - dl.block_size = ceph_file_layout_su(ci->i_layout); + dl.object_size = ci->i_layout.object_size; + dl.block_size = ci->i_layout.stripe_unit; /* block_offset = object_offset % block_size */ tmp = dl.object_offset; @@ -209,10 +209,19 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ci->i_layout.pool_id; + oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ceph_oid_printf(&oid, "%s", dl.object_name); + + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + ceph_oloc_destroy(&oloc); + if (r < 0) { + up_read(&osdc->lock); 
+ return r; + } + + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); @@ -221,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) @@ -235,19 +244,30 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; + bool is_file_already_lazy = false; + spin_lock(&ci->i_ceph_lock); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { - spin_lock(&ci->i_ceph_lock); - ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; - ci->i_nr_by_mode[fi->fmode]++; - spin_unlock(&ci->i_ceph_lock); - dout("ioctl_layzio: file %p marked lazy\n", file); + ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; + __ceph_touch_fmode(ci, mdsc, fi->fmode); + } else { + is_file_already_lazy = true; + } + spin_unlock(&ci->i_ceph_lock); - ceph_check_caps(ci, 0, NULL); + if (is_file_already_lazy) { + doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, + ceph_vinop(inode)); } else { - dout("ioctl_layzio: file %p already lazy\n", file); + doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode, + ceph_vinop(inode)); + + ceph_check_caps(ci, 0); } + return 0; } @@ -259,9 +279,98 @@ static long ceph_ioctl_syncio(struct file *file) return 0; } +static int vet_mds_for_fscrypt(struct file *file) +{ + int i, ret = -EOPNOTSUPP; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = mdsc->sessions[i]; + + if (!s) + continue; + if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features)) + ret = 0; + break; + } + mutex_unlock(&mdsc->mutex); + return ret; +} + +static long ceph_set_encryption_policy(struct file *file, unsigned long arg) +{ + int ret, got = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + /* encrypted directories can't have striped layout */ + if (ci->i_layout.stripe_count > 1) + return -EINVAL; + + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + + /* + * Ensure we hold these caps so that we _know_ that the rstats check + * in the empty_dir check is reliable. 
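From userspace, the policy itself is set with the same FS_IOC_SET_ENCRYPTION_POLICY ioctl as on local filesystems; the CephFS-specific checks above only gate it. An illustrative sketch (the mount point is a placeholder, and a v2 master key must already have been added with FS_IOC_ADD_ENCRYPTION_KEY):

#include <fcntl.h>
#include <linux/fscrypt.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

int main(void)
{
        struct fscrypt_policy_v2 pol;
        int fd = open("/mnt/cephfs/encdir", O_RDONLY | O_DIRECTORY);

        if (fd < 0)
                return 1;
        memset(&pol, 0, sizeof(pol));
        pol.version = FSCRYPT_POLICY_V2;
        pol.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
        pol.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
        pol.flags = FSCRYPT_POLICY_FLAGS_PAD_32;
        /*
         * pol.master_key_identifier is left zeroed here; a real caller
         * fills in the identifier returned by FS_IOC_ADD_ENCRYPTION_KEY.
         */
        if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &pol) != 0)
                perror("FS_IOC_SET_ENCRYPTION_POLICY");
        return 0;
}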
+ */ + ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got); + if (ret) + return ret; + + ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg); + if (got) + ceph_put_cap_refs(ci, got); + + return ret; +} + +static const char *ceph_ioctl_cmd_name(const unsigned int cmd) +{ + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return "get_layout"; + case CEPH_IOC_SET_LAYOUT: + return "set_layout"; + case CEPH_IOC_SET_LAYOUT_POLICY: + return "set_layout_policy"; + case CEPH_IOC_GET_DATALOC: + return "get_dataloc"; + case CEPH_IOC_LAZYIO: + return "lazyio"; + case CEPH_IOC_SYNCIO: + return "syncio"; + case FS_IOC_SET_ENCRYPTION_POLICY: + return "set_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY: + return "get_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return "get_encryption_policy_ex"; + case FS_IOC_ADD_ENCRYPTION_KEY: + return "add_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return "remove_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return "remove_encryption_key_all_users"; + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return "get_encryption_key_status"; + case FS_IOC_GET_ENCRYPTION_NONCE: + return "get_encryption_nonce"; + default: + return "unknown"; + } +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + int ret; + + doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file, + inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg); switch (cmd) { case CEPH_IOC_GET_LAYOUT: return ceph_ioctl_get_layout(file, (void __user *)arg); @@ -280,6 +389,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SYNCIO: return ceph_ioctl_syncio(file); + + case FS_IOC_SET_ENCRYPTION_POLICY: + return ceph_set_encryption_policy(file, arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_nonce(file, (void __user *)arg); } return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index c77028afb1e1..51f7f1d39a94 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_CEPH_IOCTL_H #define FS_CEPH_IOCTL_H diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae6d14e82b0f..dd764f9c64b9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -1,30 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/file.h> #include <linux/namei.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" +#include <linux/filelock.h> 
#include <linux/ceph/pagelist.h> -/** +static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(dst->c.flc_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + dst->fl_u.ceph.inode = igrab(inode); +} + +/* + * Do not use the 'fl->fl_file' in release function, which + * is possibly already released by another thread. + */ +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = fl->fl_u.ceph.inode; + struct ceph_inode_info *ci; + + /* + * If inode is NULL it should be a request file_lock, + * nothing we can do. + */ + if (!inode) + return; + + ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } + fl->fl_u.ceph.inode = NULL; + iput(inode); +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + +/* * Implement fcntl and flock locking functions. */ -static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; + + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. 
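A tiny userspace analogue of secure_addr() above, just to make the construction concrete: XORing with a boot-time random secret hides the raw pointer value from the wire, and the forced top bit tells the MDS that the owner token alone identifies the lock holder.

/* Illustrative demonstration, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static uint64_t lock_secret = 0xdeadbeefcafef00dULL;   /* random at init */

static uint64_t secure_addr(void *addr)
{
        uint64_t v = lock_secret ^ (uint64_t)(uintptr_t)addr;

        return v | (1ULL << 63);        /* flag: owner is authoritative */
}

int main(void)
{
        int x;

        printf("owner token: %016llx\n",
               (unsigned long long)secure_addr(&x));
        return 0;
}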
+ */ + fl->fl_ops = &ceph_fl_lock_ops; + fl->fl_ops->fl_copy_lock(fl, NULL); + } + + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -32,32 +110,34 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + owner = secure_addr(fl->c.flc_owner); + + doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d\n", + (int)lock_type, (int)operation, owner, + (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; - req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.owner = cpu_to_le64(owner); + req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; - err = ceph_mdsc_do_request(mdsc, inode, req); - - if ( operation == CEPH_MDS_OP_GETFILELOCK){ - fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + err = ceph_mdsc_submit_request(mdsc, inode, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, wait ? 
+ ceph_lock_wait_for_completion : NULL); + if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { + fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; else - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + @@ -69,128 +149,292 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, } ceph_mdsc_put_request(req); - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type, err); + doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d\n", + (int)lock_type, (int)operation, (u64) fl->c.flc_pid, + fl->fl_start, length, wait, fl->c.flc_type, err); return err; } -/** +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + doutc(cl, "request %llu was interrupted\n", req->r_tid); + + mutex_lock(&mdsc->mutex); + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + err = 0; + } else { + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + + if (!req->r_session) { + // haven't sent the request + err = 0; + } + } + mutex_unlock(&mdsc->mutex); + if (!err) + return 0; + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + err = wait_for_completion_killable(&req->r_safe_completion); + if (err) + return err; + + return 0; +} + +static int try_unlock_file(struct file *file, struct file_lock *fl) +{ + int err; + unsigned int orig_flags = fl->c.flc_flags; + fl->c.flc_flags |= FL_EXISTS; + err = locks_lock_file_wait(file, fl); + fl->c.flc_flags = orig_flags; + if (err == -ENOENT) { + if (!(orig_flags & FL_EXISTS)) + err = 0; + return err; + } + return 1; +} + +/* * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. 
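ceph_lock_wait_for_completion() above is what turns a signal delivered during a blocked F_SETLKW into a CEPH_LOCK_FCNTL_INTR unlock request to the MDS. That path is easy to exercise from userspace; an illustrative sketch (run it against a CephFS file that another process has already write-locked):

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_alarm(int sig)
{
        (void)sig;      /* just interrupt the blocked fcntl() */
}

int main(int argc, char **argv)
{
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDWR);
        if (fd < 0)
                return 1;
        signal(SIGALRM, on_alarm);
        alarm(2);
        /*
         * If another process holds a conflicting lock, this blocks;
         * SIGALRM aborts the wait and the kernel asks the MDS to drop
         * the half-acquired lock via the *_INTR unlock request.
         */
        if (fcntl(fd, F_SETLKW, &fl) != 0)
                perror("F_SETLKW");
        close(fd);
        return 0;
}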
*/ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; - u8 wait = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); + int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 wait = 0; + u8 lock_cmd; + + if (!(fl->c.flc_flags & FL_POSIX)) + return -ENOLCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + doutc(cl, "fl_owner: %p\n", fl->c.flc_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ - if (F_SETLKW == cmd) - wait = 1; - if (F_GETLK == cmd) + if (IS_GETLK(cmd)) op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; - if (F_RDLCK == fl->fl_type) + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) + posix_lock_file(file, fl, NULL); + return err; + } + + if (lock_is_read(fl)) lock_cmd = CEPH_LOCK_SHARED; - else if (F_WRLCK == fl->fl_type) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ - dout("mds locked, locking locally"); + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) { + doutc(cl, "locking locally\n"); err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + if (err) { /* undo! This should only happen if * the kernel detects local * deadlock. 
*/ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", - err); + doutc(cl, "got %d on posix_lock_file, undid lock\n", + err); } } - - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - CEPH_LOCK_UNLOCK, 0, fl); } return err; } int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); + int err = 0; + u8 wait = 0; u8 lock_cmd; - int err; - u8 wait = 1; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); + if (!(fl->c.flc_flags & FL_FLOCK)) + return -ENOLCK; - /* set wait bit, then clear it out of cmd*/ - if (cmd & LOCK_NB) - wait = 0; - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); - /* set command sequence that Ceph wants to see: - shared lock, exclusive lock, or unlock */ - if (LOCK_SH == cmd) + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + doutc(cl, "fl_file: %p\n", fl->c.flc_file); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (lock_is_unlock(fl)) + locks_lock_file_wait(file, fl); + return err; + } + + if (IS_SETLKW(cmd)) + wait = 1; + + if (lock_is_read(fl)) lock_cmd = CEPH_LOCK_SHARED; - else if (LOCK_EX == cmd) + else if (lock_is_write(fl)) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; + if (lock_is_unlock(fl)) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, lock_cmd, wait, fl); - if (!err) { - err = flock_lock_file_wait(file, fl); + inode, lock_cmd, wait, fl); + if (!err && F_UNLCK != fl->c.flc_type) { + err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on flock_lock_file_wait, undid lock", err); + inode, CEPH_LOCK_UNLOCK, 0, fl); + doutc(cl, "got %d on locks_lock_file_wait, undid lock\n", + err); } - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FLOCK, - CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } -/** - * Must be called with lock_flocks() already held. Fills in the passed - * counter variables, so you can prepare pagelist metadata before calling - * ceph_encode_locks. +/* + * Fills in the passed counter variables, so you can prepare pagelist metadata + * before calling ceph_encode_locks. 
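The flock(2) side is symmetric: LOCK_NB reaches ceph_flock() above as a non-waiting command (wait = 0), so the MDS request fails fast instead of blocking. A minimal illustration, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/file.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;
        /* LOCK_NB maps to a non-waiting MDS request in ceph_flock() */
        if (flock(fd, LOCK_EX | LOCK_NB) != 0)
                perror("flock");
        else
                puts("got exclusive flock via the MDS");
        flock(fd, LOCK_UN);
        close(fd);
        return 0;
}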
*/ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct file_lock *lock; + struct file_lock_context *ctx; *fcntl_count = 0; *flock_count = 0; - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) + ctx = locks_inode_context(inode); + if (ctx) { + spin_lock(&ctx->flc_lock); + for_each_file_lock(lock, &ctx->flc_posix) ++(*fcntl_count); - else if (lock->fl_flags & FL_FLOCK) + for_each_file_lock(lock, &ctx->flc_flock) ++(*flock_count); + spin_unlock(&ctx->flc_lock); } - dout("counted %d flock locks and %d fcntl locks", - *flock_count, *fcntl_count); + doutc(cl, "counted %d flock locks and %d fcntl locks\n", + *flock_count, *fcntl_count); } -/** +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct inode *inode, + struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + int err = 0; + + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner)); + + switch (lock->c.flc_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + doutc(cl, "Have unknown lock type %d\n", + lock->c.flc_type); + err = -EINVAL; + } + + return err; +} + +/* * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. * If we encounter more of a specific lock type than expected, return -ENOSPC. @@ -200,45 +444,48 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; + struct file_lock_context *ctx = locks_inode_context(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; int l = 0; - dout("encoding %d flock and %d fcntl locks", num_flock_locks, - num_fcntl_locks); + doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks, + num_fcntl_locks); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) { - ++seen_fcntl; - if (seen_fcntl > num_fcntl_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + if (!ctx) + return 0; + + spin_lock(&ctx->flc_lock); + for_each_file_lock(lock, &ctx->flc_posix) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(inode, lock, &flocks[l]); + if (err) + goto fail; + ++l; } - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_FLOCK) { - ++seen_flock; - if (seen_flock > num_flock_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + for_each_file_lock(lock, &ctx->flc_flock) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(inode, lock, &flocks[l]); + if (err) + goto fail; + ++l; } fail: + spin_unlock(&ctx->flc_lock); return err; } -/** +/* * Copy the encoded flock and fcntl locks into the pagelist. 
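Putting the helpers together: the reconnect path counts, allocates, encodes, then appends to the pagelist via ceph_locks_to_pagelist() (whose format is described next). An illustrative composition under that assumption; the counts taken here are advisory, so a real caller retries when a racing lock makes the encode step return -ENOSPC.

/* Schematic sketch, not part of the patch. */
static int example_encode_inode_locks(struct inode *inode,
                                      struct ceph_pagelist *pagelist)
{
        struct ceph_filelock *flocks;
        int num_fcntl_locks, num_flock_locks, err;

        ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
        flocks = kcalloc(num_fcntl_locks + num_flock_locks,
                         sizeof(*flocks), GFP_NOFS);
        if (!flocks)
                return -ENOMEM;
        err = ceph_encode_locks_to_buffer(inode, flocks, num_fcntl_locks,
                                          num_flock_locks);
        if (!err)       /* on -ENOSPC a real caller recounts and retries */
                err = ceph_locks_to_pagelist(flocks, pagelist,
                                             num_fcntl_locks, num_flock_locks);
        kfree(flocks);
        return err;
}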
* Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. @@ -256,52 +503,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, flocks, - num_fcntl_locks * sizeof(*flocks)); - if (err) - goto out_fail; + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } nlocks = cpu_to_le32(num_flock_locks); err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, - &flocks[num_fcntl_locks], - num_flock_locks * sizeof(*flocks)); -out_fail: - return err; -} - -/* - * Given a pointer to a lock, convert it to a ceph filelock - */ -int lock_to_ceph_filelock(struct file_lock *lock, - struct ceph_filelock *cephlock) -{ - int err = 0; - - cephlock->start = cpu_to_le64(lock->fl_start); - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); - cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); - - switch (lock->fl_type) { - case F_RDLCK: - cephlock->type = CEPH_LOCK_SHARED; - break; - case F_WRLCK: - cephlock->type = CEPH_LOCK_EXCL; - break; - case F_UNLCK: - cephlock->type = CEPH_LOCK_UNLOCK; - break; - default: - dout("Have unknown lock type %d", lock->fl_type); - err = -EINVAL; + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); } - +out_fail: return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 187bf214444d..1740047aef0f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,14 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include <linux/bits.h> +#include <linux/ktime.h> +#include <linux/bitmap.h> +#include <linux/mnt_idmapping.h> #include "super.h" #include "mds_client.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> @@ -17,6 +25,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) + /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and @@ -43,12 +53,17 @@ */ struct ceph_reconnect_state { + struct ceph_mds_session *session; + int nr_caps, nr_realms; struct ceph_pagelist *pagelist; - bool flock; + unsigned msg_version; + bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); +static void ceph_cap_release_work(struct work_struct *work); +static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; @@ -57,15 +72,54 @@ static const struct ceph_connection_operations mds_con_ops; * mds reply parsing */ +static int parse_reply_info_quota(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + u8 struct_v, struct_compat; + u32 struct_len; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only + * understand encoding with struct_compat == 1. 
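This struct_v/struct_compat/struct_len envelope recurs throughout the reply parsers below. A schematic of the pattern, for illustration (example_decode_envelope is a hypothetical name; the decode macros are the real ones from linux/ceph/decode.h):

/* Illustrative sketch of the versioned-envelope decode pattern. */
static int example_decode_envelope(void **p, void *end)
{
        u8 struct_v, struct_compat;
        u32 struct_len;

        ceph_decode_8_safe(p, end, struct_v, bad);
        ceph_decode_8_safe(p, end, struct_compat, bad);
        if (!struct_v || struct_compat != 1)
                goto bad;               /* incompatible encoding */
        ceph_decode_32_safe(p, end, struct_len, bad);
        ceph_decode_need(p, end, struct_len, bad);
        end = *p + struct_len;          /* clamp to this struct */
        /* ... decode the known fields within [*p, end) ... */
        *p = end;                       /* skip any newer fields */
        return 0;
bad:
        return -EIO;
}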
*/ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + *p = end; + return 0; +bad: + return -EIO; +} + /* * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { - int err = -EIO; + int err = 0; + u8 struct_v = 0; + + if (features == (u64)-1) { + u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * @@ -76,28 +130,240 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; - if (features & CEPH_FEATURE_DIRLAYOUTHASH) - ceph_decode_copy_safe(p, end, &info->dir_layout, - sizeof(info->dir_layout), bad); - else - memset(&info->dir_layout, 0, sizeof(info->dir_layout)); - + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; + + if (features == (u64)-1) { + /* inline data */ + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + /* quota */ + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + /* pool namespace */ + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + + /* btime */ + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + + /* change attribute */ + ceph_decode_64_safe(p, end, info->change_attr, bad); + + /* dir pin */ + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, info->dir_pin, bad); + } else { + info->dir_pin = -ENODATA; + } + + /* snapshot birth time, remains zero for v<=2 */ + if (struct_v >= 3) { + ceph_decode_need(p, end, sizeof(info->snap_btime), bad); + ceph_decode_copy(p, &info->snap_btime, + sizeof(info->snap_btime)); + } else { + memset(&info->snap_btime, 0, sizeof(info->snap_btime)); + } + + /* snapshot count, remains zero for v<=3 */ + if (struct_v >= 4) { + ceph_decode_64_safe(p, end, info->rsnaps, bad); + } else { + info->rsnaps = 0; + } + + if (struct_v >= 5) { + u32 alen; + + ceph_decode_32_safe(p, end, alen, bad); + + while (alen--) { + u32 len; + + /* key */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + /* value */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + } + } + + /* fscrypt flag -- ignore */ + if (struct_v >= 6) + ceph_decode_skip_8(p, end, bad); + + info->fscrypt_auth = NULL; + info->fscrypt_auth_len = 0; + info->fscrypt_file = NULL; + 
info->fscrypt_file_len = 0; + if (struct_v >= 7) { + ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); + if (info->fscrypt_auth_len) { + info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, + GFP_KERNEL); + if (!info->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_auth, + info->fscrypt_auth_len, bad); + } + ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); + if (info->fscrypt_file_len) { + info->fscrypt_file = kmalloc(info->fscrypt_file_len, + GFP_KERNEL); + if (!info->fscrypt_file) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_file, + info->fscrypt_file_len, bad); + } + } + *p = end; + } else { + /* legacy (unversioned) struct */ + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + if (features & CEPH_FEATURE_MDS_QUOTA) { + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + } else { + info->max_bytes = 0; + info->max_files = 0; + } + + info->pool_ns_len = 0; + info->pool_ns_data = NULL; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + } + + if (features & CEPH_FEATURE_FS_BTIME) { + ceph_decode_need(p, end, sizeof(info->btime), bad); + ceph_decode_copy(p, &info->btime, sizeof(info->btime)); + ceph_decode_64_safe(p, end, info->change_attr, bad); + } + + info->dir_pin = -ENODATA; + /* info->snap_btime and info->rsnaps remain zero */ + } return 0; bad: + err = -EIO; +out_bad: return err; } +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_dirfrag **dirfrag, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(**dirfrag), bad); + *dirfrag = *p; + *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); + if (unlikely(*p > end)) + goto bad; + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +static int parse_reply_info_lease(void **p, void *end, + struct ceph_mds_reply_lease **lease, + u64 features, u32 *altname_len, u8 **altname) +{ + u8 struct_v; + u32 struct_len; + void *lend; + + if (features == (u64)-1) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. 
*/ + if (!struct_v || struct_compat != 1) + goto bad; + + ceph_decode_32_safe(p, end, struct_len, bad); + } else { + struct_len = sizeof(**lease); + *altname_len = 0; + *altname = NULL; + } + + lend = *p + struct_len; + ceph_decode_need(p, end, struct_len, bad); + *lease = *p; + *p += sizeof(**lease); + + if (features == (u64)-1) { + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, *altname_len, bad); + ceph_decode_need(p, end, *altname_len, bad); + *altname = *p; + *p += *altname_len; + } else { + *altname = NULL; + *altname_len = 0; + } + } + *p = lend; + return 0; +bad: + return -EIO; +} + /* * parse a normal reply, which may contain a (dir+)dentry and/or a * target inode. */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -106,20 +372,19 @@ static int parse_reply_info_trace(void **p, void *end, if (err < 0) goto out_bad; - if (unlikely(*p + sizeof(*info->dirfrag) > end)) - goto bad; - info->dirfrag = *p; - *p += sizeof(*info->dirfrag) + - sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); - if (unlikely(*p > end)) - goto bad; + err = parse_reply_info_dir(p, end, &info->dirfrag, features); + if (err < 0) + goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; - info->dlease = *p; - *p += sizeof(*info->dlease); + + err = parse_reply_info_lease(p, end, &info->dlease, features, + &info->altname_len, &info->altname); + if (err < 0) + goto out_bad; } if (info->head->is_target) { @@ -142,72 +407,141 @@ out_bad: /* * parse readdir results */ -static int parse_reply_info_dir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, - int features) +static int parse_reply_info_readdir(void **p, void *end, + struct ceph_mds_request *req, + u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; + struct ceph_client *cl = req->r_mdsc->fsc->client; u32 num, i = 0; int err; - info->dir_dir = *p; - if (*p + sizeof(*info->dir_dir) > end) - goto bad; - *p += sizeof(*info->dir_dir) + - sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); - if (*p > end) - goto bad; + err = parse_reply_info_dir(p, end, &info->dir_dir, features); + if (err < 0) + goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); + } if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { + pr_err_client(cl, "dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; } - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); + info->dir_nr = num; while (num) { + struct inode *inode = d_inode(req->r_dentry); + struct ceph_inode_info *ci = ceph_inode(inode); + struct 
ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + struct fscrypt_str tname = FSTR_INIT(NULL, 0); + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname; + u32 altname_len, _name_len; + u8 *altname, *_name; + /* dentry */ - ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; - *p += sizeof(struct ceph_mds_reply_lease); + ceph_decode_32_safe(p, end, _name_len, bad); + ceph_decode_need(p, end, _name_len, bad); + _name = *p; + *p += _name_len; + doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name); + + if (info->hash_order) + rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + _name, _name_len); + + /* dentry lease */ + err = parse_reply_info_lease(p, end, &rde->lease, features, + &altname_len, &altname); + if (err) + goto out_bad; + + /* + * Try to decrypt the dentry names and update them + * in the ceph_mds_reply_dir_entry struct. + */ + fname.dir = inode; + fname.name = _name; + fname.name_len = _name_len; + fname.ctext = altname; + fname.ctext_len = altname_len; + /* + * _name_len may be larger than altname_len, such as + * when the human-readable name length is in the range + * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE); + * in that case the copy in ceph_fname_to_usr would corrupt the + * data if there is no encryption key. + * + * Just set the no_copy flag; then, if there is no + * encryption key, oname.name will always be assigned to + * _name. + */ + fname.no_copy = true; + if (altname_len == 0) { + /* + * Set tname to _name, and this will be used + * to do the base64_decode in-place. It's + * safe because the decoded string is + * always shorter, 3/4 of the original + * string. + */ + tname.name = _name; + + /* + * Set oname to _name too, and this will be + * used to do the decryption in place. + */ + oname.name = _name; + oname.len = _name_len; + } else { + /* + * This will do the decryption in place, + * directly from the altname ciphertext.
+ */ + oname.name = altname; + oname.len = altname_len; + } + rde->is_nokey = false; + err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); + if (err) { + pr_err_client(cl, "unable to decode %.*s, got %d\n", + _name_len, _name, err); + goto out_bad; + } + rde->name = oname.name; + rde->name_len = oname.len; /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } done: - if (*p != end) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; bad: err = -EIO; out_bad: - pr_err("problem parsing dir contents %d\n", err); + pr_err_client(cl, "problem parsing dir contents %d\n", err); return err; } @@ -216,42 +550,171 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; info->filelock_reply = *p; - *p += sizeof(*info->filelock_reply); - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; +bad: + return -EIO; +} + + +#if BITS_PER_LONG == 64 + +#define DELEGATED_INO_AVAILABLE xa_mk_value(1) + +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + struct ceph_client *cl = s->s_mdsc->fsc->client; + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + doutc(cl, "got %u sets of delegated inodes\n", sets); + while (sets--) { + u64 start, len; + + ceph_decode_64_safe(p, end, start, bad); + ceph_decode_64_safe(p, end, len, bad); + + /* Don't accept a delegation of system inodes */ + if (start < CEPH_INO_SYSTEM_BASE) { + pr_warn_ratelimited_client(cl, + "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); + continue; + } + while (len--) { + int err = xa_insert(&s->s_delegated_inos, start++, + DELEGATED_INO_AVAILABLE, + GFP_KERNEL); + if (!err) { + doutc(cl, "added delegated inode 0x%llx\n", start - 1); + } else if (err == -EBUSY) { + pr_warn_client(cl, + "MDS delegated inode 0x%llx more than once.\n", + start - 1); + } else { + return err; + } + } + } + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + unsigned long ino; + void *val; + + xa_for_each(&s->s_delegated_inos, ino, val) { + val = xa_erase(&s->s_delegated_inos, ino); + if (val == DELEGATED_INO_AVAILABLE) + return ino; + } + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, + GFP_KERNEL); +} +#else /* BITS_PER_LONG == 64 */ +/* + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top + * and bottom words? 
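An illustrative consumer of the delegation pool above (hypothetical: try_send_async_create() is a stand-in, not a real function): an async create claims a pre-delegated inode number from the session and must return it to the pool if the request is never sent.

/* Sketch only; not part of the patch. */
static bool try_send_async_create(struct ceph_mds_session *s, u64 ino);

static u64 example_claim_deleg_ino(struct ceph_mds_session *s)
{
        u64 ino = ceph_get_deleg_ino(s);

        if (!ino)
                return 0;       /* pool empty: fall back to a sync create */
        if (!try_send_async_create(s, ino)) {
                /* request was never sent; hand the ino back */
                WARN_ON_ONCE(ceph_restore_deleg_ino(s, ino));
                return 0;
        }
        return ino;
}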
+ */ +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + ceph_decode_32_safe(p, end, sets, bad); + if (sets) + ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); + return 0; bad: return -EIO; } +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return 0; +} +#endif /* BITS_PER_LONG == 64 */ + /* * parse create results */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features, struct ceph_mds_session *s) { - if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + int ret; + + if (features == (u64)-1 || + (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { + /* Malformed reply? */ info->has_create_ino = false; + } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { + info->has_create_ino = true; + /* struct_v, struct_compat, and len */ + ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad); + ceph_decode_64_safe(p, end, info->ino, bad); + ret = ceph_parse_deleg_inos(p, end, s); + if (ret) + return ret; } else { + /* legacy */ + ceph_decode_64_safe(p, end, info->ino, bad); info->has_create_ino = true; - info->ino = ceph_decode_64(p); } + } else { + if (*p != end) + goto bad; } - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; +bad: + return -EIO; +} + +static int parse_reply_info_getvxattr(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 value_len; + + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ + ceph_decode_skip_32(p, end, bad); /* skip payload length */ + ceph_decode_32_safe(p, end, value_len, bad); + + if (value_len == end - *p) { + info->xattr_info.xattr_value = *p; + info->xattr_info.xattr_value_len = value_len; + *p = end; + return value_len; + } bad: return -EIO; } @@ -260,16 +723,20 @@ bad: * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, - int features) + struct ceph_mds_request *req, + u64 features, struct ceph_mds_session *s) { - if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; + u32 op = le32_to_cpu(info->head->op); + + if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR || - info->head->op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_dir(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_CREATE) - return parse_reply_info_create(p, end, info, features); + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) + return parse_reply_info_readdir(p, end, req, features); + else if (op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features, s); + else if (op == CEPH_MDS_OP_GETVXATTR) + return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } @@ -277,10 +744,11 @@ static int parse_reply_info_extra(void **p, void *end, /* * parse entire mds reply */ -static int parse_reply_info(struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info, - int features) +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, + struct ceph_mds_request *req, u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; + struct ceph_client *cl = 
s->s_mdsc->fsc->client;
 	void *p, *end;
 	u32 len;
 	int err;
@@ -302,7 +770,7 @@ static int parse_reply_info(struct ceph_msg *msg,
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
 		ceph_decode_need(&p, end, len, bad);
-		err = parse_reply_info_extra(&p, p+len, info, features);
+		err = parse_reply_info_extra(&p, p+len, req, features, s);
 		if (err < 0)
 			goto out_bad;
 	}
@@ -320,20 +788,110 @@ static int parse_reply_info(struct ceph_msg *msg,
 bad:
 	err = -EIO;
 out_bad:
-	pr_err("mds parse_reply err %d\n", err);
+	pr_err_client(cl, "mds parse_reply err %d\n", err);
+	ceph_msg_dump(msg);
 	return err;
 }

 static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
 {
-	kfree(info->dir_in);
+	int i;
+
+	kfree(info->diri.fscrypt_auth);
+	kfree(info->diri.fscrypt_file);
+	kfree(info->targeti.fscrypt_auth);
+	kfree(info->targeti.fscrypt_file);
+	if (!info->dir_entries)
+		return;
+
+	for (i = 0; i < info->dir_nr; i++) {
+		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+
+		kfree(rde->inode.fscrypt_auth);
+		kfree(rde->inode.fscrypt_file);
+	}
+	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
+}
+
+/*
+ * In the async unlink case the kclient won't wait for the first reply
+ * from the MDS; it drops all the links, unhashes the dentry, and then
+ * succeeds immediately.
+ *
+ * Any new create/link/rename/etc. request that reuses the same file
+ * name must wait for the first reply of the in-flight unlink request,
+ * or the MDS may fail these following requests with -EEXIST if the
+ * in-flight async unlink request was delayed for some reason.
+ *
+ * The worst case is that a non-async open-create request successfully
+ * opens the file if the CDentry hasn't been unlinked yet, but the
+ * previously delayed async unlink request later removes the CDentry.
+ * That means the just-created file may be deleted by accident.
+ *
+ * We must therefore wait for the in-flight async unlink requests to
+ * finish when creating new files/directories with the same file names.
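
The conflict scan this comment describes keys its hash table on the dentry name hash, but a hash match is only a hint: the parent and the full name must still be compared, which is exactly why the code pairs hash_for_each_possible_rcu() with d_same_name(). A userspace model of that filter chain follows; it is a sketch only, with fictional types and a toy hash, not part of this patch.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct pending_unlink {
        const char *parent;
        const char *name;
        unsigned int hash;
};

/* toy djb2-style name hash, standing in for d_name.hash */
static unsigned int name_hash(const char *s)
{
        unsigned int h = 5381;

        while (*s)
                h = h * 33 + (unsigned char)*s++;
        return h;
}

static bool conflicts(const struct pending_unlink *tbl, int n,
                      const char *parent, const char *name)
{
        unsigned int hash = name_hash(name);

        for (int i = 0; i < n; i++) {
                if (tbl[i].hash != hash)                /* cheap filter */
                        continue;
                if (strcmp(tbl[i].parent, parent))      /* same directory? */
                        continue;
                if (strcmp(tbl[i].name, name))          /* hash collision */
                        continue;
                return true;    /* caller would now wait_on_bit() */
        }
        return false;
}

int main(void)
{
        struct pending_unlink tbl[] = { { "/dir", "a.tmp", 0 } };

        tbl[0].hash = name_hash(tbl[0].name);
        printf("create a.tmp conflicts: %d\n", conflicts(tbl, 1, "/dir", "a.tmp"));
        printf("create b.tmp conflicts: %d\n", conflicts(tbl, 1, "/dir", "b.tmp"));
        return 0;
}
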
+ */ +int ceph_wait_on_conflict_unlink(struct dentry *dentry) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); + struct ceph_client *cl = fsc->client; + struct dentry *pdentry = dentry->d_parent; + struct dentry *udentry, *found = NULL; + struct ceph_dentry_info *di; + struct qstr dname; + u32 hash = dentry->d_name.hash; + int err; + + dname.name = dentry->d_name.name; + dname.len = dentry->d_name.len; + + rcu_read_lock(); + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, + hnode, hash) { + udentry = di->dentry; + + spin_lock(&udentry->d_lock); + if (udentry->d_name.hash != hash) + goto next; + if (unlikely(udentry->d_parent != pdentry)) + goto next; + if (!hash_hashed(&di->hnode)) + goto next; + + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) + pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n", + dentry, dentry); + + if (!d_same_name(udentry, pdentry, &dname)) + goto next; + + found = dget_dlock(udentry); + spin_unlock(&udentry->d_lock); + break; +next: + spin_unlock(&udentry->d_lock); + } + rcu_read_unlock(); + + if (likely(!found)) + return 0; + + doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry, + found, found); + + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, + TASK_KILLABLE); + dput(found); + return err; } /* * sessions */ -static const char *session_state_name(int s) +const char *ceph_session_state_name(int s) { switch (s) { case CEPH_MDS_SESSION_NEW: return "new"; @@ -341,33 +899,31 @@ static const char *session_state_name(int s) case CEPH_MDS_SESSION_OPEN: return "open"; case CEPH_MDS_SESSION_HUNG: return "hung"; case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_CLOSED: return "closed"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + case CEPH_MDS_SESSION_REJECTED: return "rejected"; default: return "???"; } } -static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { - if (atomic_inc_not_zero(&s->s_ref)) { - dout("mdsc get_session %p %d -> %d\n", s, - atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + if (refcount_inc_not_zero(&s->s_ref)) return s; - } else { - dout("mdsc get_session %p 0 -- FAIL", s); - return NULL; - } + return NULL; } void ceph_put_mds_session(struct ceph_mds_session *s) { - dout("mdsc put_session %p %d -> %d\n", s, - atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); - if (atomic_dec_and_test(&s->s_ref)) { + if (IS_ERR_OR_NULL(s)) + return; + + if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - ceph_auth_destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer(s->s_auth.authorizer); + WARN_ON(mutex_is_locked(&s->s_mutex)); + xa_destroy(&s->s_delegated_inos); kfree(s); } } @@ -378,22 +934,17 @@ void ceph_put_mds_session(struct ceph_mds_session *s) struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds) { - struct ceph_mds_session *session; - - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; - session = mdsc->sessions[mds]; - dout("lookup_mds_session %p %d\n", session, - atomic_read(&session->s_ref)); - get_session(session); - return session; + return ceph_get_mds_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) { - if (mds >= mdsc->max_sessions) + if 
(mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return false; - return mdsc->sessions[mds]; + else + return true; } static int __verify_registered_session(struct ceph_mds_client *mdsc, @@ -412,59 +963,63 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc, static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, int mds) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *s; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return ERR_PTR(-EIO); + + if (mds >= mdsc->mdsmap->possible_max_rank) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); + + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds + 1); + struct ceph_mds_session **sa; + size_t ptr_size = sizeof(struct ceph_mds_session *); + + doutc(cl, "realloc to %d\n", newmax); + sa = kcalloc(newmax, ptr_size, GFP_NOFS); + if (!sa) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * ptr_size); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + + doutc(cl, "mds%d\n", mds); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; - s->s_ttl = 0; - s->s_seq = 0; mutex_init(&s->s_mutex); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); - spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 0; + atomic_set(&s->s_cap_gen, 1); s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); - s->s_renew_requested = 0; - s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); - s->s_nr_caps = 0; - s->s_trim_caps = 0; - atomic_set(&s->s_ref, 1); + refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); - s->s_num_cap_releases = 0; - s->s_cap_iterator = NULL; + xa_init(&s->s_delegated_inos); INIT_LIST_HEAD(&s->s_cap_releases); - INIT_LIST_HEAD(&s->s_cap_releases_done); - INIT_LIST_HEAD(&s->s_cap_flushing); - INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); - dout("register_session mds%d\n", mds); - if (mds >= mdsc->max_sessions) { - int newmax = 1 << get_count_order(mds+1); - struct ceph_mds_session **sa; + INIT_LIST_HEAD(&s->s_cap_dirty); + INIT_LIST_HEAD(&s->s_cap_flushing); - dout("register_session realloc to %d\n", newmax); - sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); - if (sa == NULL) - goto fail_realloc; - if (mdsc->sessions) { - memcpy(sa, mdsc->sessions, - mdsc->max_sessions * sizeof(void *)); - kfree(mdsc->sessions); - } - mdsc->sessions = sa; - mdsc->max_sessions = newmax; - } mdsc->sessions[mds] = s; - atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + atomic_inc(&mdsc->num_sessions); + refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); @@ -482,11 +1037,12 @@ fail_realloc: static void __unregister_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { - dout("__unregister_session mds%d %p\n", s->s_mds, s); + doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); } /* @@ -502,28 +1058,59 @@ static void put_request_session(struct ceph_mds_request *req) } } +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state) +{ + int mds; + + 
mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, mds); + if (!s) + continue; + + if (check_state && !check_session_state(s)) { + ceph_put_mds_session(s); + continue; + } + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + void ceph_mdsc_release_request(struct kref *kref) { struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + ceph_mdsc_release_dir_caps_async(req); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } - if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_target_inode) - iput(req->r_target_inode); + if (req->r_parent) { + ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + iput(req->r_parent); + } + iput(req->r_target_inode); + iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have @@ -532,61 +1119,40 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - dput(req->r_old_dentry); iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); + put_cred(req->r_cred); + if (req->r_mnt_idmap) + mnt_idmap_put(req->r_mnt_idmap); + if (req->r_pagelist) + ceph_pagelist_release(req->r_pagelist); + kfree(req->r_fscrypt_auth); + kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); - kfree(req); + WARN_ON_ONCE(!list_empty(&req->r_wait)); + kmem_cache_free(ceph_mds_request_cachep, req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. 
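
lookup_get_request() below only hands back a request after bumping its refcount, and ceph_get_mds_session() earlier follows the same rule: a lookup must never resurrect an object whose count has already hit zero. A compact C11 sketch of that inc-not-zero pattern, illustrative only and not kernel code:

#include <stdatomic.h>
#include <stdio.h>

struct obj {
        atomic_int ref;
};

/* Take a reference only if the object is still alive (ref != 0). */
static struct obj *obj_get(struct obj *o)
{
        int r = atomic_load(&o->ref);

        while (r != 0) {
                /* try r -> r + 1; on contention r is re-read and we retry */
                if (atomic_compare_exchange_weak(&o->ref, &r, r + 1))
                        return o;
        }
        return NULL;    /* already dying; the lookup fails */
}

int main(void)
{
        struct obj live = { .ref = 1 }, dead = { .ref = 0 };

        printf("live: %s\n", obj_get(&live) ? "got ref" : "NULL");
        printf("dead: %s\n", obj_get(&dead) ? "got ref" : "NULL");
        return 0;
}
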
*/ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} - -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -599,23 +1165,38 @@ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, struct inode *dir) { + struct ceph_client *cl = mdsc->fsc->client; + int ret = 0; + req->r_tid = ++mdsc->last_tid; - if (req->r_num_caps) - ceph_reserve_caps(mdsc, &req->r_caps_reservation, - req->r_num_caps); - dout("__register_request %p tid %lld\n", req, req->r_tid); + if (req->r_num_caps) { + ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); + if (ret < 0) { + pr_err_client(cl, "%p failed to reserve caps: %d\n", + req, ret); + /* set req->r_err to fail early from __do_request */ + req->r_err = ret; + return; + } + } + doutc(cl, "%p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); + + req->r_cred = get_current_cred(); + if (!req->r_mnt_idmap) + req->r_mnt_idmap = &nop_mnt_idmap; - req->r_uid = current_fsuid(); - req->r_gid = current_fsgid(); + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); ihold(dir); - spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; + spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); spin_unlock(&ci->i_unsafe_lock); } @@ -624,25 +1205,74 @@ static void __register_request(struct ceph_mds_client *mdsc, static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { - dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid); + + /* Never leave an unregistered request on an unsafe list! 
*/ + list_del_init(&req->r_unsafe_item); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); - spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + } + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_target_item); + spin_unlock(&ci->i_unsafe_lock); + } + if (req->r_unsafe_dir) { iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } /* + * Walk back up the dentry tree until we hit a dentry representing a + * non-snapshot inode. We do this using the rcu_read_lock (which must be held + * when calling this) to ensure that the objects won't disappear while we're + * working with them. Once we hit a candidate dentry, we attempt to take a + * reference to it, and return that as the result. + */ +static struct inode *get_nonsnap_parent(struct dentry *dentry) +{ + struct inode *inode = NULL; + + while (dentry && !IS_ROOT(dentry)) { + inode = d_inode_rcu(dentry); + if (!inode || ceph_snap(inode) == CEPH_NOSNAP) + break; + dentry = dentry->d_parent; + } + if (inode) + inode = igrab(inode); + return inode; +} + +/* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the @@ -650,21 +1280,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ -static struct dentry *get_nonsnap_parent(struct dentry *dentry) -{ - /* - * we don't need to worry about protecting the d_parent access - * here because we never renaming inside the snapped namespace - * except to resplice to another snapdir, and either the old or new - * result is a valid result. - */ - while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) - dentry = dentry->d_parent; - return dentry; -} - static int __choose_mds(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) + struct ceph_mds_request *req, + bool *random) { struct inode *inode; struct ceph_inode_info *ci; @@ -672,7 +1290,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; - bool is_hash = req->r_direct_is_hash; + bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + struct ceph_client *cl = mdsc->fsc->client; + + if (random) + *random = false; /* * is there a specific mds we should try? 
ignore hint if we have @@ -681,8 +1303,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_resend_mds >= 0 && (__have_session(mdsc, req->r_resend_mds) || ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { - dout("choose_mds using resend_mds mds%d\n", - req->r_resend_mds); + doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds); return req->r_resend_mds; } @@ -691,36 +1312,57 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { - inode = req->r_inode; + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { + inode = req->r_inode; + ihold(inode); + } else { + /* req->r_dentry is non-null for LSSNAP request */ + rcu_read_lock(); + inode = get_nonsnap_parent(req->r_dentry); + rcu_read_unlock(); + doutc(cl, "using snapdir's parent %p %llx.%llx\n", + inode, ceph_vinop(inode)); + } } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ - struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = parent->d_inode; - - if (dir->i_sb != mdsc->fsc->sb) { - /* not this fs! */ - inode = req->r_dentry->d_inode; + struct dentry *parent; + struct inode *dir; + + rcu_read_lock(); + parent = READ_ONCE(req->r_dentry->d_parent); + dir = req->r_parent ? : d_inode_rcu(parent); + + if (!dir || dir->i_sb != mdsc->fsc->sb) { + /* not this fs or parent went negative */ + inode = d_inode(req->r_dentry); + if (inode) + ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = get_nonsnap_parent(parent); - inode = dn->d_inode; - dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { - /* dentry target */ - inode = req->r_dentry->d_inode; + inode = get_nonsnap_parent(parent); + doutc(cl, "using nonsnap parent %p %llx.%llx\n", + inode, ceph_vinop(inode)); } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + /* dentry target */ + inode = d_inode(req->r_dentry); + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = igrab(dir); + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } else { + ihold(inode); + } } + rcu_read_unlock(); } - dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, - (int)hash, mode); if (!inode) goto random; + + doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode, + ceph_vinop(inode), (int)is_hash, hash, mode); ci = ceph_inode(inode); if (is_hash && S_ISDIR(inode->i_mode)) { @@ -736,30 +1378,31 @@ static int __choose_mds(struct ceph_mds_client *mdsc, get_random_bytes(&r, 1); r %= frag.ndist; mds = frag.dist[r]; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (%d/%d)\n", - inode, ceph_vinop(inode), - frag.frag, mds, - (int)r, frag.ndist); + doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), frag.frag, + mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) - return mds; + CEPH_MDS_STATE_ACTIVE && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) + goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. 
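
Condensed, the fragment-based selection reads: prefer a random replica from the frag's dist[] when the caller doesn't need the authoritative MDS, otherwise fall back to frag.mds. A toy model of that choice, where the struct and field names are loose assumptions rather than the kernel's types:

#include <stdio.h>
#include <stdlib.h>

struct frag_info {
        int mds;        /* authoritative mds rank, or -1 */
        int ndist;      /* number of replicas */
        int dist[4];    /* replicated mds ranks */
};

static int choose_mds(const struct frag_info *frag, int need_auth)
{
        /* prefer a random replica for plain reads... */
        if (!need_auth && frag->ndist > 0)
                return frag->dist[rand() % frag->ndist];
        /* ...but fall back to (or insist on) the auth mds */
        return frag->mds;
}

int main(void)
{
        struct frag_info frag = { .mds = 0, .ndist = 2, .dist = { 1, 2 } };

        printf("read  -> mds%d\n", choose_mds(&frag, 0));
        printf("write -> mds%d\n", choose_mds(&frag, 1));
        return 0;
}
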
*/ - mode = USE_AUTH_MDS; if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; - dout("choose_mds %p %llx.%llx " - "frag %u mds%d (auth)\n", - inode, ceph_vinop(inode), frag.frag, mds); + doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) - return mds; + CEPH_MDS_STATE_ACTIVE) { + if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, + mds)) + goto out; + } } + mode = USE_AUTH_MDS; } } @@ -771,18 +1414,24 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); + iput(inode); goto random; } mds = cap->session->s_mds; - dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", - inode, ceph_vinop(inode), mds, - cap == ci->i_auth_cap ? "auth " : "", cap); + doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode, + ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); +out: + iput(inode); return mds; random: + if (random) + *random = true; + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); - dout("choose_mds chose random mds%d\n", mds); + doutc(cl, "chose random mds%d\n", mds); return mds; } @@ -790,7 +1439,7 @@ random: /* * session messages */ -static struct ceph_msg *create_session_msg(u32 op, u64 seq) +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -798,12 +1447,210 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, false); if (!msg) { - pr_err("create_session_msg ENOMEM creating msg\n"); + pr_err("ENOMEM creating session %s msg\n", + ceph_session_op_name(op)); return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); + + return msg; +} + +static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; +#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) +static int encode_supported_features(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(feature_bits); + + if (count > 0) { + size_t i; + size_t size = FEATURE_BYTES(count); + unsigned long bit; + + if (WARN_ON_ONCE(*p + 4 + size > end)) + return -ERANGE; + + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) { + bit = feature_bits[i]; + ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); + } + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 > end)) + return -ERANGE; + + ceph_encode_32(p, 0); + } + + return 0; +} + +static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; +#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) +static int encode_metric_spec(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(metric_bits); + + /* header */ + if (WARN_ON_ONCE(*p + 2 > end)) + return -ERANGE; + + ceph_encode_8(p, 1); /* version */ + ceph_encode_8(p, 1); /* compat */ + + if (count > 0) { + size_t i; + size_t size = METRIC_BYTES(count); + + if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4 + size); + + /* metric spec */ + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) + ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8); + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 + 4 > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4); + /* metric 
spec */ + ceph_encode_32(p, 0); + } + + return 0; +} + +/* + * session message, specialization for CEPH_SESSION_REQUEST_OPEN + * to include additional client metadata fields. + */ +static struct ceph_msg * +create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + int i; + int extra_bytes = 0; + int metadata_key_count = 0; + struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; + struct ceph_client *cl = mdsc->fsc->client; + size_t size, count; + void *p, *end; + int ret; + + const char* metadata[][2] = { + {"hostname", mdsc->nodename}, + {"kernel_version", init_utsname()->release}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? : "/"}, + {NULL, NULL} + }; + + /* Calculate serialized length of metadata */ + extra_bytes = 4; /* map length */ + for (i = 0; metadata[i][0]; ++i) { + extra_bytes += 8 + strlen(metadata[i][0]) + + strlen(metadata[i][1]); + metadata_key_count++; + } + + /* supported feature */ + size = 0; + count = ARRAY_SIZE(feature_bits); + if (count > 0) + size = FEATURE_BYTES(count); + extra_bytes += 4 + size; + + /* metric spec */ + size = 0; + count = ARRAY_SIZE(metric_bits); + if (count > 0) + size = METRIC_BYTES(count); + extra_bytes += 2 + 4 + 4 + size; + + /* flags, mds auth caps and oldest_client_tid */ + extra_bytes += 4 + 4 + 8; + + /* Allocate the message */ + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, + GFP_NOFS, false); + if (!msg) { + pr_err_client(cl, "ENOMEM creating session open msg\n"); + return ERR_PTR(-ENOMEM); + } + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + h = p; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + + /* + * Serialize client metadata into waiting buffer space, using + * the format that userspace expects for map<string, string> + * + * ClientSession messages with metadata are v7 + */ + msg->hdr.version = cpu_to_le16(7); + msg->hdr.compat_version = cpu_to_le16(1); + + /* The write pointer, following the session_head structure */ + p += sizeof(*h); + + /* Number of entries in the map */ + ceph_encode_32(&p, metadata_key_count); + + /* Two length-prefixed strings for each entry in the map */ + for (i = 0; metadata[i][0]; ++i) { + size_t const key_len = strlen(metadata[i][0]); + size_t const val_len = strlen(metadata[i][1]); + + ceph_encode_32(&p, key_len); + memcpy(p, metadata[i][0], key_len); + p += key_len; + ceph_encode_32(&p, val_len); + memcpy(p, metadata[i][1], val_len); + p += val_len; + } + + ret = encode_supported_features(&p, end); + if (ret) { + pr_err_client(cl, "encode_supported_features failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + + ret = encode_metric_spec(&p, end); + if (ret) { + pr_err_client(cl, "encode_metric_spec failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + + /* version == 5, flags */ + ceph_encode_32(&p, 0); + + /* version == 6, mds auth caps */ + ceph_encode_32(&p, 0); + + /* version == 7, oldest_client_tid */ + ceph_encode_64(&p, mdsc->oldest_tid); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + return msg; } @@ -819,17 +1666,21 @@ static int __open_session(struct ceph_mds_client *mdsc, int mstate; int mds = session->s_mds; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) + return -EIO; + /* wait for mds to go active? 
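
The client-metadata section built above uses the standard Ceph map<string,string> encoding: a u32 entry count, then a u32-length-prefixed key and value per entry. Here is a userspace sketch of just that framing, not part of this patch; host byte order is assumed for brevity, whereas the wire uses little-endian u32s.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode @n key/value pairs; returns the number of bytes written. */
static size_t encode_metadata(uint8_t *p, const char *kv[][2], uint32_t n)
{
        uint8_t *start = p;

        memcpy(p, &n, 4);       /* number of entries in the map */
        p += 4;
        for (uint32_t i = 0; i < n; i++) {
                for (int j = 0; j < 2; j++) {   /* key, then value */
                        uint32_t len = (uint32_t)strlen(kv[i][j]);

                        memcpy(p, &len, 4);
                        p += 4;
                        memcpy(p, kv[i][j], len);
                        p += len;
                }
        }
        return (size_t)(p - start);
}

int main(void)
{
        const char *kv[][2] = {
                { "hostname", "client-a" },
                { "entity_id", "admin" },
        };
        uint8_t buf[128];

        printf("encoded %zu bytes\n", encode_metadata(buf, kv, 2));
        return 0;
}
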
*/ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); - dout("open_session to mds%d (%s)\n", mds, - ceph_mds_state_name(mstate)); + doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); session->s_state = CEPH_MDS_SESSION_OPENING; session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); - if (!msg) - return -ENOMEM; + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, + session->s_seq); + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -839,71 +1690,122 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + int ret; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) { + ret = __open_session(mdsc, session); + if (ret) + return ERR_PTR(ret); + } + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; + struct ceph_client *cl = mdsc->fsc->client; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->possible_max_rank) return; + mi = &mdsc->mdsmap->m_info[mds]; - dout("open_export_target_sessions for mds%d (%d targets)\n", - session->s_mds, mi->num_export_targets); + doutc(cl, "for mds%d (%d targets)\n", session->s_mds, + mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); ceph_put_mds_session(ts); } } -void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - mutex_lock(&mdsc->mutex); - __open_export_target_sessions(mdsc, session); - mutex_unlock(&mdsc->mutex); -} - /* * session caps */ -/* - * Free preallocated cap messages assigned to this session - */ -static void cleanup_cap_releases(struct ceph_mds_session *session) +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) { - struct ceph_msg *msg; + struct ceph_client *cl = session->s_mdsc->fsc->client; - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); + 
lockdep_assert_held(&session->s_cap_lock); + + list_splice_init(&session->s_cap_releases, target); + session->s_num_cap_releases = 0; + doutc(cl, "mds%d\n", session->s_mds); +} + +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(dispose, struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); } - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); +} + +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_request *req; + struct rb_node *p; + + doutc(cl, "mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n", + req->r_tid); + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); + __unregister_request(mdsc, req); } - spin_unlock(&session->s_cap_lock); + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); } /* @@ -912,27 +1814,31 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) * * Caller must hold session s_mutex. 
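
The interesting part of the cap iteration that follows is its locking contract: the session cap list is walked under s_cap_lock, but the lock is dropped around every callback because the callback may sleep. A stripped-down pthread model of that choreography, sketch only: it assumes nodes cannot vanish while the lock is dropped, which the kernel instead guarantees with the s_cap_iterator cursor and deferred iput/put_cap.

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int val;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void for_each_node(struct node *head, void (*cb)(int))
{
        pthread_mutex_lock(&lock);
        for (struct node *n = head; n; n = n->next) {
                int val = n->val;       /* snapshot under the lock */

                pthread_mutex_unlock(&lock);
                cb(val);                /* may sleep; lock not held */
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
}

static void print_cb(int v)
{
        printf("visit %d\n", v);
}

int main(void)
{
        struct node c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };

        for_each_node(&a, print_cb);
        return 0;
}
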
*/ -static int iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) +int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, int mds, void *), + void *arg) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct list_head *p; struct ceph_cap *cap; struct inode *inode, *last_inode = NULL; struct ceph_cap *old_cap = NULL; int ret; - dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + doutc(cl, "%p mds%d\n", session, session->s_mds); spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { + int mds; + cap = list_entry(p, struct ceph_cap, session_caps); - inode = igrab(&cap->ci->vfs_inode); + inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; } session->s_cap_iterator = cap; + mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { @@ -944,19 +1850,22 @@ static int iterate_session_caps(struct ceph_mds_session *session, old_cap = NULL; } - ret = cb(inode, cap, arg); + ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); p = p->next; - if (cap->ci == NULL) { - dout("iterate_session_caps finishing cap %p removal\n", - cap); + if (!cap->ci) { + doutc(cl, "finishing cap %p removal\n", cap); BUG_ON(cap->session != session); + cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - cap->session = NULL; - old_cap = cap; /* put_cap it w/o locks held */ + atomic64_dec(&session->s_mdsc->metric.total_caps); + if (cap->queue_release) + __ceph_queue_cap_release(session, cap); + else + old_cap = cap; /* put_cap it w/o locks held */ } if (ret < 0) goto out; @@ -966,57 +1875,36 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - if (last_inode) - iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); return ret; } -static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); - int drop = 0; + struct ceph_client *cl = ceph_inode_to_client(inode); + bool invalidate = false; + struct ceph_cap *cap; + int iputs = 0; - dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - __ceph_remove_cap(cap); - if (!__ceph_is_any_real_caps(ci)) { - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; - - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - pr_info(" dropping dirty %s state for %p %lld\n", - ceph_cap_string(ci->i_dirty_caps), - inode, ceph_ino(inode)); - ci->i_dirty_caps = 0; - list_del_init(&ci->i_dirty_item); - drop = 1; - } - if (!list_empty(&ci->i_flushing_item)) { - pr_info(" dropping dirty+flushing %s state for %p %lld\n", - ceph_cap_string(ci->i_flushing_caps), - inode, ceph_ino(inode)); - ci->i_flushing_caps = 0; - list_del_init(&ci->i_flushing_item); - mdsc->num_cap_flushing--; - drop = 1; - } - if (drop && ci->i_wrbuffer_ref) { - pr_info(" dropping dirty data for %p %lld\n", - inode, ceph_ino(inode)); - ci->i_wrbuffer_ref = 0; - ci->i_wrbuffer_ref_head = 0; - drop++; - } - spin_unlock(&mdsc->cap_dirty_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap) { + doutc(cl, " removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->netfs.inode); + + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); } spin_unlock(&ci->i_ceph_lock); - while 
(drop--) + + if (cap) + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + while (iputs--) iput(inode); return 0; } @@ -1026,40 +1914,96 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { - dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); + + doutc(fsc->client, "on %p\n", session); + ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); + + wake_up_all(&fsc->mdsc->cap_flushing_wq); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + + // drop cap expires and unlock s_cap_lock + detach_cap_releases(session, &dispose); + BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); - cleanup_cap_releases(session); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); } +enum { + RECONNECT, + RENEWCAPS, + FORCE_RO, +}; + /* * wake up any threads waiting on this session's caps. if the cap is * old (didn't get renewed on the client reconnect), remove it now. * * caller must hold s_mutex. 
*/ -static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + unsigned long ev = (unsigned long)arg; - wake_up_all(&ci->i_cap_wq); - if (arg) { + if (ev == RECONNECT) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); + } else if (ev == RENEWCAPS) { + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + /* mds did not re-issue stale cap */ + if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) + cap->issued = cap->implemented = CEPH_CAP_PIN; + spin_unlock(&ci->i_ceph_lock); + } else if (ev == FORCE_RO) { } + wake_up_all(&ci->i_cap_wq); return 0; } -static void wake_up_session_caps(struct ceph_mds_session *session, - int reconnect) +static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { - dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); - iterate_session_caps(session, wake_up_session_cb, - (void *)(unsigned long)reconnect); + struct ceph_client *cl = session->s_mdsc->fsc->client; + + doutc(cl, "session %p mds%d\n", session, session->s_mds); + ceph_iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)ev); } /* @@ -1071,33 +2015,50 @@ static void wake_up_session_caps(struct ceph_mds_session *session, static int send_renew_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; int state; if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) - pr_info("mds%d caps stale\n", session->s_mds); + pr_info_client(cl, "mds%d caps stale\n", session->s_mds); session->s_renew_requested = jiffies; /* do not try to renew caps until a recovering mds has reconnected * with its clients. */ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); if (state < CEPH_MDS_STATE_RECONNECT) { - dout("send_renew_caps ignoring mds%d (%s)\n", - session->s_mds, ceph_mds_state_name(state)); + doutc(cl, "ignoring mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); return 0; } - dout("send_renew_caps to mds%d (%s)\n", session->s_mds, - ceph_mds_state_name(state)); - msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, - ++session->s_renew_seq); + doutc(cl, "to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (IS_ERR(msg)) + return PTR_ERR(msg); + ceph_con_send(&session->s_con, msg); + return 0; +} + +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_msg *msg; + + doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds, + ceph_session_state_name(session->s_state), seq); + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). 
* @@ -1106,6 +2067,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, static void renewed_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int is_renew) { + struct ceph_client *cl = mdsc->fsc->client; int was_stale; int wake = 0; @@ -1117,37 +2079,39 @@ static void renewed_caps(struct ceph_mds_client *mdsc, if (was_stale) { if (time_before(jiffies, session->s_cap_ttl)) { - pr_info("mds%d caps renewed\n", session->s_mds); + pr_info_client(cl, "mds%d caps renewed\n", + session->s_mds); wake = 1; } else { - pr_info("mds%d caps still stale\n", session->s_mds); + pr_info_client(cl, "mds%d caps still stale\n", + session->s_mds); } } - dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", - session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", - time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds, + session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); spin_unlock(&session->s_cap_lock); if (wake) - wake_up_session_caps(session, 0); + wake_up_session_caps(session, RENEWCAPS); } /* * send a session close request */ -static int request_close_session(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int request_close_session(struct ceph_mds_session *session) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; - dout("request_close_session mds%d state %s seq %lld\n", - session->s_mds, session_state_name(session->s_state), - session->s_seq); - msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + doutc(cl, "mds%d state %s seq %lld\n", session->s_mds, + ceph_session_state_name(session->s_state), session->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, + session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); - return 0; + return 1; } /* @@ -1159,7 +2123,30 @@ static int __close_session(struct ceph_mds_client *mdsc, if (session->s_state >= CEPH_MDS_SESSION_CLOSING) return 0; session->s_state = CEPH_MDS_SESSION_CLOSING; - return request_close_session(mdsc, session); + return request_close_session(session); +} + +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + hlist_for_each_entry(child, &dentry->d_children, d_sib) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; } /* @@ -1172,40 +2159,77 @@ static int __close_session(struct ceph_mds_client *mdsc, * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. 
*/ -static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +static int trim_caps_cb(struct inode *inode, int mds, void *arg) { - struct ceph_mds_session *session = arg; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; + struct ceph_cap *cap; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + return 0; + } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", - inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(mine), + ceph_cap_string(oissued), ceph_cap_string(used), + ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; + } + /* The inode has cached pages, but it's no longer used. + * we can safely drop it */ + if (S_ISREG(inode->i_mode) && + wanted == 0 && used == CEPH_CAP_FILE_CACHE && + !(oissued & CEPH_CAP_FILE_CACHE)) { + used = 0; + oissued = 0; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); - __ceph_remove_cap(cap); + ceph_remove_cap(mdsc, cap, true); + (*remaining)--; } else { - /* try to drop referring dentries */ + struct dentry *dentry; + /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = icount_read(inode); + if (count == 1) + (*remaining)--; + doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", + inode, ceph_vinop(inode), cap, count); + } else { + dput(dentry); + } return 0; } @@ -1217,229 +2241,364 @@ out: /* * Trim session cap count down to some max number. 
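
The heart of trim_caps_cb() above is a single bitmask test: a cap may only be trimmed when nothing it uniquely grants is used or wanted, i.e. when every used/wanted bit is also issued by some other MDS. A tiny self-contained version of that check (the cap bit values below are made up, not Ceph's):

#include <stdbool.h>
#include <stdio.h>

#define CAP_RD          0x1
#define CAP_WR          0x2
#define CAP_CACHE       0x4

/* mirrors: if ((used | wanted) & ~oissued & mine) keep the cap */
static bool can_trim(int mine, int used, int wanted, int oissued)
{
        return !((used | wanted) & ~oissued & mine);
}

int main(void)
{
        /* another mds also issues RD, and only RD is in use: trimmable */
        printf("%d\n", can_trim(CAP_RD | CAP_CACHE, CAP_RD, 0, CAP_RD));
        /* we are the sole issuer of WR and it is wanted: keep the cap */
        printf("%d\n", can_trim(CAP_WR, 0, CAP_WR, 0));
        return 0;
}
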
*/ -static int trim_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int max_caps) +int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) { + struct ceph_client *cl = mdsc->fsc->client; int trim_caps = session->s_nr_caps - max_caps; - dout("trim_caps mds%d start: %d / %d, trim %d\n", - session->s_mds, session->s_nr_caps, max_caps, trim_caps); + doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds, + session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - iterate_session_caps(session, trim_caps_cb, session); - dout("trim_caps mds%d done: %d / %d, trimmed %d\n", - session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); + doutc(cl, "mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - remaining); } + + ceph_flush_session_cap_releases(mdsc, session); return 0; } +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + if (cf->tid <= want_flush_tid) { + doutc(cl, "still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); + ret = 0; + } + } + spin_unlock(&mdsc->cap_dirty_lock); + return ret; +} + /* - * Allocate cap_release messages. If there is a partially full message - * in the queue, try to allocate enough to cover it's remainder, so that - * we can send it immediately. + * flush all dirty inode data to disk. * - * Called under s_mutex. 
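
check_caps_flush() above exploits the fact that cap flush tids are handed out monotonically and retired in order, so "flushed through want_flush_tid" reduces to inspecting the oldest entry on cap_flush_list. The same idea over a sorted array, illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* pending[] is sorted ascending; entry 0 is the oldest in-flight flush */
static bool flushed_thru(const uint64_t *pending, int npending,
                         uint64_t want_tid)
{
        return npending == 0 || pending[0] > want_tid;
}

int main(void)
{
        uint64_t pending[] = { 12, 13, 17 };

        printf("%d\n", flushed_thru(pending, 3, 11)); /* 1: flushed thru 11 */
        printf("%d\n", flushed_thru(pending, 3, 12)); /* 0: tid 12 pending */
        return 0;
}
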
+ * returns true if we've flushed through want_flush_tid */ -int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "want %llu\n", want_flush_tid); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); +} + +/* + * called under s_mutex + */ +static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { - struct ceph_msg *msg, *partial = NULL; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; - int err = -ENOMEM; - int extra = mdsc->fsc->mount_options->cap_release_safety; - int num; + struct ceph_mds_cap_item *item; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; + __le32 barrier, *cap_barrier; - dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, - extra); + down_read(&osdc->lock); + barrier = cpu_to_le32(osdc->epoch_barrier); + up_read(&osdc->lock); spin_lock(&session->s_cap_lock); +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - if (num) { - dout(" partial %p with (%d/%d)\n", msg, num, - (int)CEPH_CAPS_PER_RELEASE); - extra += CEPH_CAPS_PER_RELEASE - num; - partial = msg; + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); } - } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { - spin_unlock(&session->s_cap_lock); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS, false); - if (!msg) - goto out_unlocked; - dout("add_cap_releases %p msg %p now %d\n", session, msg, - (int)msg->front.iov_len); + + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; + head = msg->front.iov_base; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - spin_lock(&session->s_cap_lock); - list_add(&msg->list_head, &session->s_cap_releases); - session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + put_unaligned_le32(get_unaligned_le32(&head->num) + 1, + &head->num); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->issue_seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + doutc(cl, "mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + 
msg = NULL; + } } - if (partial) { - head = partial->front.iov_base; - num = le32_to_cpu(head->num); - dout(" queueing partial %p with %d/%d\n", partial, num, - (int)CEPH_CAPS_PER_RELEASE); - list_move_tail(&partial->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + BUG_ON(num_cap_releases != 0); + + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + doutc(cl, "mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); } - err = 0; + return; +out_err: + pr_err_client(cl, "mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; spin_unlock(&session->s_cap_lock); -out_unlocked: - return err; } -/* - * flush all dirty inode data to disk. - * - * returns true if we've flushed through want_flush_seq - */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void ceph_cap_release_work(struct work_struct *work) { - int mds, ret = 1; + struct ceph_mds_session *session = + container_of(work, struct ceph_mds_session, s_cap_release_work); - dout("check_cap_flush want %lld\n", want_flush_seq); - mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { - struct ceph_mds_session *session = mdsc->sessions[mds]; + mutex_lock(&session->s_mutex); + if (session->s_state == CEPH_MDS_SESSION_OPEN || + session->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(session->s_mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); +} - if (!session) - continue; - get_session(session); - mutex_unlock(&mdsc->mutex); +void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; - mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; - } - spin_unlock(&ci->i_ceph_lock); - } - mutex_unlock(&session->s_mutex); + ceph_get_mds_session(session); + if (queue_work(mdsc->fsc->cap_wq, + &session->s_cap_release_work)) { + doutc(cl, "cap release work queued\n"); + } else { ceph_put_mds_session(session); - - if (!ret) - return ret; - mutex_lock(&mdsc->mutex); + doutc(cl, "failed to queue cap release work\n"); } - - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; } /* - * called under s_mutex + * caller holds session->s_cap_lock */ -void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap) { - struct ceph_msg *msg; + list_add_tail(&cap->session_caps, &session->s_cap_releases); + session->s_num_cap_releases++; - 
dout("send_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - spin_unlock(&session->s_cap_lock); - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); - ceph_con_send(&session->s_con, msg); - spin_lock(&session->s_cap_lock); - } - spin_unlock(&session->s_cap_lock); + if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) + ceph_flush_session_cap_releases(session->s_mdsc, session); } -static void discard_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static void ceph_cap_reclaim_work(struct work_struct *work) { - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - unsigned num; + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_reclaim_work); + int ret = ceph_trim_dentries(mdsc); + if (ret == -EAGAIN) + ceph_queue_cap_reclaim_work(mdsc); +} - dout("discard_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); +void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; - - /* requeue completed messages */ - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { + doutc(cl, "caps reclaim work queued\n"); + } else { + doutc(cl, "failed to queue caps release work\n"); + } +} - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, - num); - session->s_num_cap_releases += num; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - list_add(&msg->list_head, &session->s_cap_releases); +void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) +{ + int val; + if (!nr) + return; + val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { + atomic_set(&mdsc->cap_reclaim_pending, 0); + ceph_queue_cap_reclaim_work(mdsc); } +} - spin_unlock(&session->s_cap_lock); +void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { + doutc(cl, "caps unlink work queued\n"); + } else { + doutc(cl, "failed to queue caps unlink work\n"); + } +} + +static void ceph_cap_unlink_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_unlink_work); + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "begin\n"); + spin_lock(&mdsc->cap_delay_lock); + while (!list_empty(&mdsc->cap_unlink_delay_list)) { + struct ceph_inode_info *ci; + struct inode *inode; + + ci = list_first_entry(&mdsc->cap_unlink_delay_list, + struct ceph_inode_info, + i_cap_delay_list); + list_del_init(&ci->i_cap_delay_list); + + inode 
= igrab(&ci->netfs.inode); + if (inode) { + spin_unlock(&mdsc->cap_delay_lock); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); + ceph_check_caps(ci, CHECK_CAPS_FLUSH); + iput(inode); + spin_lock(&mdsc->cap_delay_lock); + } + } + spin_unlock(&mdsc->cap_delay_lock); + doutc(cl, "done\n"); } /* * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(struct ceph_mds_reply_dir_entry); + unsigned int num_entries; + u64 bytes_count; + int order; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1U); + num_entries = min(num_entries, opt->max_readdir); + + bytes_count = (u64)size * num_entries; + if (unlikely(bytes_count > ULONG_MAX)) + bytes_count = ULONG_MAX; + + order = get_order((unsigned long)bytes_count); + while (order >= 0) { + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN | + __GFP_ZERO, + order); + if (rinfo->dir_entries) + break; + order--; + } + if (!rinfo->dir_entries || unlikely(order < 0)) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { - struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct ceph_mds_request *req; + req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); req->r_mdsc = mdsc; req->r_started = jiffies; + req->r_start_latency = ktime_get(); req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); + INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + ktime_get_coarse_real_ts64(&req->r_stamp); + req->r_op = op; req->r_direct_mode = mode; return req; @@ -1458,145 +2617,275 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) struct ceph_mds_request, r_node); } -static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { - struct ceph_mds_request *req = __get_oldest_req(mdsc); - - if (req) - return req->r_tid; - return 0; + return mdsc->oldest_tid; } -/* - * Build a dentry's path. Allocate on heap; caller must kfree. Based - * on build_path_from_dentry in fs/cifs/dir.c. 
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + struct inode *dir = req->r_parent; + struct dentry *dentry = req->r_dentry; + const struct qstr *name = req->r_dname; + u8 *cryptbuf = NULL; + u32 len = 0; + int ret = 0; + + /* only encode if we have parent and dentry */ + if (!dir || !dentry) + goto success; + + /* No-op unless this is encrypted */ + if (!IS_ENCRYPTED(dir)) + goto success; + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret < 0) + return ERR_PTR(ret); + + /* No key? Just ignore it. */ + if (!fscrypt_has_encryption_key(dir)) + goto success; + + if (!name) + name = &dentry->d_name; + + if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { + WARN_ON_ONCE(1); + return ERR_PTR(-ENAMETOOLONG); + } + + /* No need to append altname if name is short enough */ + if (len <= CEPH_NOHASH_NAME_MAX) { + len = 0; + goto success; + } + + cryptbuf = kmalloc(len, GFP_KERNEL); + if (!cryptbuf) + return ERR_PTR(-ENOMEM); + + ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); + if (ret) { + kfree(cryptbuf); + return ERR_PTR(ret); + } +success: + *plen = len; + return cryptbuf; +} +#else +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + *plen = 0; + return NULL; +} +#endif + +/** + * ceph_mdsc_build_path - build a path string to a given dentry + * @mdsc: mds client + * @dentry: dentry to which path should be built + * @path_info: output path, length, base ino+snap, and freepath ownership flag + * @for_wire: is this path going to be sent to the MDS? * - * If @stop_on_nosnap, generate path relative to the first non-snapped - * inode. + * Build a string that represents the path to the dentry. This is mostly called + * for two different purposes: + * + * 1) we need to build a path string to send to the MDS (for_wire == true) + * 2) we need a path string for local presentation (e.g. debugfs) + * (for_wire == false) + * + * The path is built in reverse, starting with the dentry. Walk back up toward + * the root, building the path until the first non-snapped inode is reached + * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. 
* foo/.snap/bar -> foo//bar */ -char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap) +char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, + struct ceph_path_info *path_info, int for_wire) { - struct dentry *temp; + struct ceph_client *cl = mdsc->fsc->client; + struct dentry *cur; + struct inode *inode; char *path; - int len, pos; + int pos; unsigned seq; + u64 base; - if (dentry == NULL) + if (!dentry) return ERR_PTR(-EINVAL); + path = __getname(); + if (!path) + return ERR_PTR(-ENOMEM); retry: - len = 0; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = temp->d_inode; - if (inode && ceph_snap(inode) == CEPH_SNAPDIR) - len++; /* slash only */ - else if (stop_on_nosnap && inode && - ceph_snap(inode) == CEPH_NOSNAP) - break; - else - len += 1 + temp->d_name.len; - temp = temp->d_parent; - } - rcu_read_unlock(); - if (len) - len--; /* no leading '/' */ + pos = PATH_MAX - 1; + path[pos] = '\0'; - path = kmalloc(len+1, GFP_NOFS); - if (path == NULL) - return ERR_PTR(-ENOMEM); - pos = len; - path[pos] = 0; /* trailing null */ - rcu_read_lock(); - for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { - struct inode *inode; + seq = read_seqbegin(&rename_lock); + cur = dget(dentry); + for (;;) { + struct dentry *parent; - spin_lock(&temp->d_lock); - inode = temp->d_inode; + spin_lock(&cur->d_lock); + inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path path+%d: %p SNAPDIR\n", - pos, temp); - } else if (stop_on_nosnap && inode && + doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { - spin_unlock(&temp->d_lock); + spin_unlock(&cur->d_lock); + pos++; /* get rid of any prepended '/' */ break; + } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { + pos -= cur->d_name.len; + if (pos < 0) { + spin_unlock(&cur->d_lock); + break; + } + memcpy(path + pos, cur->d_name.name, cur->d_name.len); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); } else { - pos -= temp->d_name.len; + int len, ret; + char buf[NAME_MAX]; + + /* + * Proactively copy name into buf, in case we need to + * present it as-is. + */ + memcpy(buf, cur->d_name.name, cur->d_name.len); + len = cur->d_name.len; + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + + ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); + if (ret < 0) { + dput(parent); + dput(cur); + return ERR_PTR(ret); + } + + if (fscrypt_has_encryption_key(d_inode(parent))) { + len = ceph_encode_encrypted_dname(d_inode(parent), + buf, len); + if (len < 0) { + dput(parent); + dput(cur); + return ERR_PTR(len); + } + } + pos -= len; if (pos < 0) { - spin_unlock(&temp->d_lock); + dput(parent); break; } - strncpy(path + pos, temp->d_name.name, - temp->d_name.len); + memcpy(path + pos, buf, len); } - spin_unlock(&temp->d_lock); - if (pos) - path[--pos] = '/'; - temp = temp->d_parent; + dput(cur); + cur = parent; + + /* Are we at the root? */ + if (IS_ROOT(cur)) + break; + + /* Are we out of buffer? 
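+ * Note: pos counts down from PATH_MAX - 1 as components are + * prepended, so going negative here means the full path can't fit.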
*/ + if (--pos < 0) + break; + + path[pos] = '/'; } - rcu_read_unlock(); - if (pos != 0 || read_seqretry(&rename_lock, seq)) { - pr_err("build_path did not end path lookup where " - "expected, namelen is %d, pos is %d\n", len, pos); - /* presumably this is only possible if racing with a - rename of one of the parent directories (we can not - lock the dentries above us to prevent this, but - retrying should be harmless) */ - kfree(path); + inode = d_inode(cur); + base = inode ? ceph_ino(inode) : 0; + dput(cur); + + if (read_seqretry(&rename_lock, seq)) goto retry; + + if (pos < 0) { + /* + * The path is longer than PATH_MAX and this function + * cannot ever succeed. Creating paths that long is + * possible with Ceph, but Linux cannot use them. + */ + return ERR_PTR(-ENAMETOOLONG); } - *base = ceph_ino(temp->d_inode); - *plen = len; - dout("build_path on %p %d built %llx '%.*s'\n", - dentry, d_count(dentry), *base, len, path); - return path; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), + base, PATH_MAX - 1 - pos, path + pos); + return path + pos; } -static int build_dentry_path(struct dentry *dentry, - const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) +static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { - *pino = ceph_ino(dentry->d_parent->d_inode); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + rcu_read_lock(); + if (!dir) + dir = d_inode_rcu(dentry->d_parent); + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && + !IS_ENCRYPTED(dir)) { + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); + rcu_read_unlock(); + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } - path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + rcu_read_unlock(); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = 1; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = 1; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. 
+ * Override with inode's snap since that's what this function is for. + */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -1604,79 +2893,228 @@ static int build_inode_path(struct inode *inode, * request arguments may be specified via an inode *, a dentry *, or * an explicit ino+path. */ -static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, - const char *rpath, u64 rino, - const char **ppath, int *pathlen, - u64 *ino, int *freepath) +static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, + struct dentry *rdentry, struct inode *rdiri, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, + bool parent_locked) { + struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); - dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), - ceph_snap(rinode)); + r = build_inode_path(rinode, path_info); + doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); - dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, - *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - dout(" path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; } +static void encode_mclientrequest_tail(void **p, + const struct ceph_mds_request *req) +{ + struct ceph_timespec ts; + int i; + + ceph_encode_timespec64(&ts, &req->r_stamp); + ceph_encode_copy(p, &ts, sizeof(ts)); + + /* v4: gid_list */ + ceph_encode_32(p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); + + /* v5: altname */ + ceph_encode_32(p, req->r_altname_len); + ceph_encode_copy(p, req->r_altname, req->r_altname_len); + + /* v6: fscrypt_auth and fscrypt_file */ + if (req->r_fscrypt_auth) { + u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + ceph_encode_32(p, authlen); + ceph_encode_copy(p, req->r_fscrypt_auth, authlen); + } else { + ceph_encode_32(p, 0); + } + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { + ceph_encode_32(p, sizeof(__le64)); + ceph_encode_64(p, req->r_fscrypt_file); + } else { + ceph_encode_32(p, 0); + } +} + +static inline u16 mds_supported_head_version(struct ceph_mds_session *session) +{ + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features)) + return 1; + + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) + return 2; + + return CEPH_MDS_REQUEST_HEAD_VERSION; +} + +static struct ceph_mds_request_head_legacy * +find_legacy_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head *head; + + if (legacy) + return (struct ceph_mds_request_head_legacy *)p; + head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; +} + /* * called under 
mdsc->mutex */ -static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, +static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds) + bool drop_cap_releases) { + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; - struct ceph_mds_request_head *head; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; + struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; int ret; - - ret = set_request_path_attr(req->r_inode, req->r_dentry, - req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); + u16 request_head_version = mds_supported_head_version(session); + kuid_t caller_fsuid = req->r_cred->fsuid; + kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + + ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } - ret = set_request_path_attr(NULL, req->r_old_dentry, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. + */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + + /* If r_old_dentry is set, then assume that its parent is locked */ + if (req->r_old_dentry && + !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) + old_dentry = req->r_old_dentry; + ret = set_request_path_attr(mdsc, NULL, old_dentry, + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; } - len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); + if (IS_ERR(req->r_altname)) { + msg = ERR_CAST(req->r_altname); + req->r_altname = NULL; + goto out_free2; + } - /* calculate (max) length for cap releases */ + /* + * Old cephs that don't support the 32bit retry/fwd feature + * copy the raw memory directly when decoding the + * requests.
Newer cephs decode the head according to the + * version member, so we need to make sure the encoding is + * compatible with both. + */ + if (legacy) + len = sizeof(struct ceph_mds_request_head_legacy); + else if (request_head_version == 1) + len = offsetofend(struct ceph_mds_request_head, args); + else if (request_head_version == 2) + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); + else + len = sizeof(struct ceph_mds_request_head); + + /* filepaths */ + len += 2 * (1 + sizeof(u32) + sizeof(u64)); + len += path_info1.pathlen + path_info2.pathlen; + + /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) - len += req->r_dentry->d_name.len; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += req->r_old_dentry->d_name.len; + len += path_info2.pathlen; + + /* MClientRequest tail */ + + /* req->r_stamp */ + len += sizeof(struct ceph_timespec); - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); + /* gid list */ + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + + /* alternate name */ + len += sizeof(u32) + req->r_altname_len; + + /* fscrypt_auth */ + len += sizeof(u32); // fscrypt_auth + if (req->r_fscrypt_auth) + len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + /* fscrypt_file */ + len += sizeof(u32); + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) + len += sizeof(__le64); + + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; @@ -1684,18 +3122,90 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->hdr.tid = cpu_to_le64(req->r_tid); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + + if ((req->r_mnt_idmap != &nop_mnt_idmap) && + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) { + WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op)); + + if (enable_unsafe_idmap) { + pr_warn_once_client(cl, + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" + " is not supported by MDS. UID/GID-based restrictions may" + " not work properly.\n"); + + caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, + VFSUIDT_INIT(req->r_cred->fsuid)); + caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, + VFSGIDT_INIT(req->r_cred->fsgid)); + } else { + pr_err_ratelimited_client(cl, + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" + " is not supported by MDS. Fail request with -EIO.\n"); + + ret = -EIO; + goto out_err; + } + } + + /* + * The ceph_mds_request_head_legacy didn't contain a version field, and + * one was added when we moved the message version from 3->4.
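+ * The encoding below mirrors the length computation above: a v1 + * head ends at the args field, a v2 head at ext_num_fwd, and the + * full struct additionally carries the owner uid/gid.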
+ */ + if (legacy) { + msg->hdr.version = cpu_to_le16(3); + p = msg->front.iov_base + sizeof(*lhead); + } else if (request_head_version == 1) { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + nhead->version = cpu_to_le16(1); + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args); + } else if (request_head_version == 2) { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(2); + + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd); + } else { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + kuid_t owner_fsuid; + kgid_t owner_fsgid; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head)); + + if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) { + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, + VFSUIDT_INIT(req->r_cred->fsuid)); + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, + VFSGIDT_INIT(req->r_cred->fsgid)); + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid)); + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid)); + } else { + nhead->owner_uid = cpu_to_le32(-1); + nhead->owner_gid = cpu_to_le32(-1); + } + + p = msg->front.iov_base + sizeof(*nhead); + } + end = msg->front.iov_base + msg->front.iov_len; - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); - head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); - head->args = req->r_args; + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + lhead->op = cpu_to_le32(req->r_op); + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + caller_fsuid)); + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + caller_fsgid)); + lhead->ino = cpu_to_le64(req->r_deleg_ino); + lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -1704,41 +3214,69 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_inode ? req->r_inode : req->r_dentry->d_inode, - mds, req->r_inode_drop, req->r_inode_unless, 0); - if (req->r_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_dentry, - mds, req->r_dentry_drop, req->r_dentry_unless); - if (req->r_old_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_old_dentry, - mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + req->r_inode ? 
req->r_inode : d_inode(req->r_dentry), + mds, req->r_inode_drop, req->r_inode_unless, + req->r_op == CEPH_MDS_OP_READDIR); + if (req->r_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_dentry, + req->r_parent, mds, req->r_dentry_drop, + req->r_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } + if (req->r_old_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_old_dentry, + req->r_old_dentry_dir, mds, + req->r_old_dentry_drop, + req->r_old_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_old_dentry->d_inode, + d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); - head->num_releases = cpu_to_le16(releases); - BUG_ON(p > end); + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + + lhead->num_releases = cpu_to_le16(releases); + + encode_mclientrequest_tail(&p, req); + + if (WARN_ON_ONCE(p > end)) { + ceph_msg_put(msg); + msg = ERR_PTR(-ERANGE); + goto out_free2; + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - if (req->r_data_len) { - /* outbound data set only by ceph_sync_setxattr() */ - BUG_ON(!req->r_pages); - ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + if (req->r_pagelist) { + struct ceph_pagelist *pagelist = req->r_pagelist; + ceph_msg_data_add_pagelist(msg, pagelist); + msg->hdr.data_len = cpu_to_le32(pagelist->length); + } else { + msg->hdr.data_len = 0; } - msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - kfree((char *)path2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - kfree((char *)path1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; +out_err: + ceph_msg_put(msg); + msg = ERR_PTR(ret); + goto out_free2; } /* @@ -1748,22 +3286,46 @@ out: static void complete_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + req->r_end_latency = ktime_get(); + if (req->r_callback) req->r_callback(mdsc, req); - else - complete_all(&req->r_completion); + complete_all(&req->r_completion); } /* * called under mdsc->mutex */ -static int __prepare_send_request(struct ceph_mds_client *mdsc, +static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds) + bool drop_cap_releases) { - struct ceph_mds_request_head *rhead; + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_mds_request_head *nhead; struct ceph_msg *msg; - int flags = 0; + int flags = 0, old_max_retry; + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); + + /* + * Avoid infinite retrying after overflow. The client keeps + * increasing the retry count, and if the MDS is an old version, + * we limit it to at most 256 retries.
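+ * (The num_retry field in the old request head is a single byte, + * which is where the 256-attempt ceiling computed below comes from.)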
+ */ + if (req->r_attempts) { + old_max_retry = sizeof_field(struct ceph_mds_request_head, + num_retry); + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); + if ((old_version && req->r_attempts >= old_max_retry) || + ((uint32_t)req->r_attempts >= U32_MAX)) { + pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", + req->r_tid); + return -EMULTIHOP; + } + } req->r_attempts++; if (req->r_inode) { @@ -1775,10 +3337,12 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, else req->r_sent_on_mseq = -1; } - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, - req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid, + ceph_mds_op_name(req->r_op), req->r_attempts); + + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + void *p; - if (req->r_got_unsafe) { /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -1786,21 +3350,30 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, * d_move mangles the src name. */ msg = req->r_request; - rhead = msg->front.iov_base; + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); - flags = le32_to_cpu(rhead->flags); + flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; - rhead->flags = cpu_to_le32(flags); + lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - rhead->num_retry = req->r_attempts - 1; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } /* remove cap/dentry releases from message */ - rhead->num_releases = 0; - msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); - msg->front.iov_len = req->r_request_release_offset; + lhead->num_releases = 0; + + p = msg->front.iov_base + req->r_request_release_offset; + encode_mclientrequest_tail(&p, req); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return 0; } @@ -1808,57 +3381,118 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds); + msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); - complete_request(mdsc, req); return PTR_ERR(msg); } req->r_request = msg; - rhead = msg->front.iov_base; - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); - if (req->r_got_unsafe) + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; - if (req->r_locked_dir) + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) + flags |= CEPH_MDS_FLAG_ASYNC; + if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; - rhead->flags = cpu_to_le32(flags); - rhead->num_fwd = req->r_num_fwd; - rhead->num_retry = req->r_attempts - 1; - rhead->ino = 0; + lhead->flags = cpu_to_le32(flags); + lhead->num_fwd = req->r_num_fwd; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); + nhead->ext_num_retry = 
cpu_to_le32(req->r_attempts - 1); + } - dout(" r_locked_dir = %p\n", req->r_locked_dir); + doutc(cl, " r_parent = %p\n", req->r_parent); return 0; } /* + * called under mdsc->mutex + */ +static int __send_request(struct ceph_mds_session *session, + struct ceph_mds_request *req, + bool drop_cap_releases) +{ + int err; + + err = __prepare_send_request(session, req, drop_cap_releases); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + + return err; +} + +/* * send request, or put it on the appropriate wait list. */ -static int __do_request(struct ceph_mds_client *mdsc, +static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session = NULL; int mds = -1; - int err = -EAGAIN; + int err = 0; + bool random; - if (req->r_err || req->r_got_result) - goto out; + if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) + __unregister_request(mdsc, req); + return; + } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { + doutc(cl, "metadata corrupted\n"); + err = -EIO; + goto finish; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { - dout("do_request timed out\n"); + doutc(cl, "timed out\n"); + err = -ETIMEDOUT; + goto finish; + } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + doutc(cl, "forced umount\n"); err = -EIO; goto finish; } + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + doutc(cl, "mdsmap err %d\n", err); + goto finish; + } + if (mdsc->mdsmap->m_epoch == 0) { + doutc(cl, "no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + return; + } + if (!(mdsc->fsc->mount_options->flags & + CEPH_MOUNT_OPT_MOUNTWAIT) && + !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { + err = -EHOSTUNREACH; + goto finish; + } + } put_request_session(req); - mds = __choose_mds(mdsc, req); + mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { - dout("do_request no mds or not active, waiting for map\n"); + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto finish; + } + doutc(cl, "no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); - goto out; + return; } /* get, open session */ @@ -1870,15 +3504,54 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } } - req->r_session = get_session(session); + req->r_session = ceph_get_mds_session(session); + + doutc(cl, "mds%d session %p state %s\n", mds, session, + ceph_session_state_name(session->s_state)); + + /* + * Old ceph versions will crash the MDSs when they see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } - dout("do_request mds%d session %p state %s\n", mds, session, - session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { + /* + * We cannot queue async requests since the caps and delegated + * inodes are bound to the session. Just return -EJUKEBOX and + * let the caller retry a sync request in that case.
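+ * (Callers are expected to treat -EJUKEBOX as "retry synchronously"; + * the async create path does exactly that.)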
+ */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto out_session; + } + + /* + * If the session has been REJECTED, then return a hard error, + * unless it's a CLEANRECOVER mount, in which case we'll queue + * it to the mdsc queue. + */ + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) + list_add(&req->r_wait, &mdsc->waiting_for_map); + else + err = -EACCES; + goto out_session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); + session->s_state == CEPH_MDS_SESSION_CLOSING) { + err = __open_session(mdsc, session); + if (err) + goto out_session; + /* retry the same mds later */ + if (random) + req->r_resend_mds = mds; + } list_add(&req->r_wait, &session->s_waiting); goto out_session; } @@ -1889,21 +3562,76 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds); - if (!err) { - ceph_msg_get(req->r_request); - ceph_con_send(&session->s_con, req->r_request); + /* + * For an async create we choose the auth MDS of the frag in the + * parent directory to send the request to, and usually this works + * fine, but if the directory is migrated to another MDS before it + * can handle the request, the request will be forwarded. + * + * And then the auth cap will be changed. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + /* + * The request may be handled very quickly, and the new inode + * may not have been linked to the dentry yet. When forwarding + * the request we need to wait for ceph_finish_async_create(), + * which in theory shouldn't get stuck for long or fail, to + * finish.
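+ * (The wait below is on CEPH_DENTRY_ASYNC_CREATE_BIT, which is + * cleared once the async create reply has been fully processed.)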
+ */ + if (!d_inode(req->r_dentry)) { + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, + TASK_KILLABLE); + if (err) { + mutex_lock(&req->r_fill_mutex); + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + goto out_session; + } + } + + ci = ceph_inode(d_inode(req->r_dentry)); + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { + doutc(cl, "session changed for auth cap %d -> %d\n", + cap->session->s_mds, session->s_mds); + + /* Remove the auth cap from old session */ + spin_lock(&cap->session->s_cap_lock); + cap->session->s_nr_caps--; + list_del_init(&cap->session_caps); + spin_unlock(&cap->session->s_cap_lock); + + /* Add the auth cap to the new session */ + cap->mds = mds; + cap->session = session; + spin_lock(&session->s_cap_lock); + session->s_nr_caps++; + list_add_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + + change_auth_cap_ses(ci, session); + } + spin_unlock(&ci->i_ceph_lock); } + err = __send_request(session, req, false); + out_session: ceph_put_mds_session(session); -out: - return err; - finish: - req->r_err = err; - complete_request(mdsc, req); - goto out; + if (err) { + doutc(cl, "early error %d\n", err); + req->r_err = err; + complete_request(mdsc, req); + __unregister_request(mdsc, req); + } + return; } /* @@ -1912,6 +3640,7 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; LIST_HEAD(tmp_list); @@ -1921,7 +3650,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc, req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); - dout(" wake request %p tid %llu\n", req, req->r_tid); + doutc(cl, " wake request %p tid %llu\n", req, + req->r_tid); __do_request(mdsc, req); } } @@ -1932,84 +3662,104 @@ static void __wake_requests(struct ceph_mds_client *mdsc, */ static void kick_requests(struct ceph_mds_client *mdsc, int mds) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; - struct rb_node *p; + struct rb_node *p = rb_first(&mdsc->request_tree); - dout("kick_requests mds%d\n", mds); - for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + doutc(cl, "kick_requests mds%d\n", mds); + while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); - if (req->r_got_unsafe) + p = rb_next(p); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; + if (req->r_attempts > 0) + continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { - dout(" kicking tid %llu\n", req->r_tid); + doutc(cl, " kicking tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); __do_request(mdsc, req); } } } -void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, +int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { - dout("submit_request on %p\n", req); - mutex_lock(&mdsc->mutex); - __register_request(mdsc, req, NULL); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); -} - -/* - * Synchrously perform an mds request. Take care of all of the - * session setup, forwarding, retry details. 
- */ -int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, - struct inode *dir, - struct ceph_mds_request *req) -{ - int err; - - dout("do_request on %p\n", req); + struct ceph_client *cl = mdsc->fsc->client; + int err = 0; - /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ + /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_locked_dir) - ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) + if (req->r_parent) { + struct ceph_inode_info *ci = ceph_inode(req->r_parent); + int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? + CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; + spin_lock(&ci->i_ceph_lock); + ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); + } + if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - /* issue */ + if (req->r_inode) { + err = ceph_wait_on_async_create(req->r_inode); + if (err) { + doutc(cl, "wait for async create returned: %d\n", err); + return err; + } + } + + if (!err && req->r_old_inode) { + err = ceph_wait_on_async_create(req->r_old_inode); + if (err) { + doutc(cl, "wait for async create returned: %d\n", err); + return err; + } + } + + doutc(cl, "submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); __do_request(mdsc, req); + err = req->r_err; + mutex_unlock(&mdsc->mutex); + return err; +} - if (req->r_err) { - err = req->r_err; - __unregister_request(mdsc, req); - dout("do_request early error %d\n", err); - goto out; - } +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func) +{ + struct ceph_client *cl = mdsc->fsc->client; + int err; /* wait */ - mutex_unlock(&mdsc->mutex); - dout("do_request waiting\n"); - if (req->r_timeout) { - err = (long)wait_for_completion_killable_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - err = -EIO; + doutc(cl, "do_request waiting\n"); + if (wait_func) { + err = wait_func(mdsc, req); } else { - err = wait_for_completion_killable(&req->r_completion); + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -ETIMEDOUT; /* timed out */ + else + err = timeleft; /* killed */ } - dout("do_request waited, got %d\n", err); + doutc(cl, "do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ - if (req->r_got_result) { + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { - dout("aborted request %lld with %d\n", req->r_tid, err); + doutc(cl, "aborted request %lld with %d\n", req->r_tid, err); /* * ensure we aren't running concurrently with @@ -2018,19 +3768,38 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, */ mutex_lock(&req->r_fill_mutex); req->r_err = err; - req->r_aborted = true; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); - if (req->r_locked_dir && + if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { err = req->r_err; } -out: mutex_unlock(&mdsc->mutex); - dout("do_request %p done, result %d\n", req, err); + return err; +} + +/* + * Synchronously perform an mds request.
Take care of all of the + * session setup, forwarding, retry details. + */ +int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req) +{ + struct ceph_client *cl = mdsc->fsc->client; + int err; + + doutc(cl, "do_request on %p\n", req); + + /* issue */ + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, NULL); + doutc(cl, "do_request %p done, result %d\n", req, err); return err; } @@ -2040,11 +3809,16 @@ out: */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { - struct inode *inode = req->r_locked_dir; + struct inode *dir = req->r_parent; + struct inode *old_dir = req->r_old_dentry_dir; + struct ceph_client *cl = req->r_mdsc->fsc->client; - dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n", + dir, old_dir); - ceph_dir_clear_complete(inode); + ceph_dir_clear_complete(dir); + if (old_dir) + ceph_dir_clear_complete(old_dir); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2061,15 +3835,18 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; + bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { - pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + pr_err_client(cl, "got corrupt (short) reply\n"); ceph_msg_dump(msg); return; } @@ -2077,84 +3854,49 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { - dout("handle_reply on unknown tid %llu\n", tid); + doutc(cl, "on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); return; } - dout("handle_reply %p\n", req); + doutc(cl, "handle_reply %p\n", req); /* correct session? */ if (req->r_session != session) { - pr_err("mdsc_handle_reply got %llu on session mds%d" - " not mds%d\n", tid, session->s_mds, - req->r_session ? req->r_session->s_mds : -1); + pr_err_client(cl, "got %llu on session mds%d not mds%d\n", + tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); mutex_unlock(&mdsc->mutex); goto out; } /* dup? */ - if ((req->r_got_unsafe && !head->safe) || - (req->r_got_safe && head->safe)) { - pr_warning("got a dup %s reply on %llu from mds%d\n", - head->safe ? "safe" : "unsafe", tid, mds); + if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || + (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { + pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n", + head->safe ? 
"safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } - if (req->r_got_safe && !head->safe) { - pr_warning("got unsafe after safe on %llu from mds%d\n", - tid, mds); + if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { + pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n", + tid, mds); mutex_unlock(&mdsc->mutex); goto out; } result = le32_to_cpu(head->result); - /* - * Handle an ESTALE - * if we're not talking to the authority, send to them - * if the authority has changed while we weren't looking, - * send to new authority - * Otherwise we just have to return an ESTALE - */ - if (result == -ESTALE) { - dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now"); - req->r_direct_mode = USE_AUTH_MDS; - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } - } - dout("have to return ESTALE on request %llu", req->r_tid); - } - - if (head->safe) { - req->r_got_safe = true; + set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); - complete_all(&req->r_safe_completion); - if (req->r_got_unsafe) { + /* last request during umount? */ + if (mdsc->stopping && !__get_oldest_req(mdsc)) + complete_all(&mdsc->safe_umount_waiters); + + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS @@ -2162,38 +3904,77 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * response. And even if it did, there is nothing * useful we could do with a revised return value. */ - dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); + doutc(cl, "got safe reply %llu, mds%d\n", tid, mds); - /* last unsafe request during umount? 
*/ - if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } } else { - req->r_got_unsafe = true; + set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } - dout("handle_reply tid %lld result %d\n", tid, result); - rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + doutc(cl, "tid %lld result %d\n", tid, result); + if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) + err = parse_reply_info(session, msg, req, (u64)-1); + else + err = parse_reply_info(session, msg, req, + session->s_con.peer_features); mutex_unlock(&mdsc->mutex); + /* Must find target inode outside of mutexes to avoid deadlocks */ + rinfo = &req->r_reply_info; + if ((err >= 0) && rinfo->head->is_target) { + struct inode *in = xchg(&req->r_new_inode, NULL); + struct ceph_vino tvino = { + .ino = le64_to_cpu(rinfo->targeti.in->ino), + .snap = le64_to_cpu(rinfo->targeti.in->snapid) + }; + + /* + * If we ended up opening an existing inode, discard + * r_new_inode + */ + if (req->r_op == CEPH_MDS_OP_CREATE && + !req->r_reply_info.has_create_ino) { + /* This should never happen on an async create */ + WARN_ON_ONCE(req->r_deleg_ino); + iput(in); + in = NULL; + } + + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); + if (IS_ERR(in)) { + err = PTR_ERR(in); + mutex_lock(&session->s_mutex); + goto out_err; + } + req->r_target_inode = in; + } + mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n", + mds, tid); ceph_msg_dump(msg); goto out_err; } /* snap trace */ + realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, rinfo->snapblob, - rinfo->snapblob + rinfo->snapblob_len, - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + err = ceph_update_snap_trace(mdsc, rinfo->snapblob, + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); + if (err) { + up_write(&mdsc->snap_rwsem); + close_sessions = true; + if (err == -EIO) + ceph_msg_dump(msg); + goto out_err; + } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -2201,39 +3982,60 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); - err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); + current->journal_info = req; + err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || - req->r_op == CEPH_MDS_OP_LSSNAP) && - rinfo->dir_nr) - ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + req->r_op == CEPH_MDS_OP_LSSNAP)) + err = ceph_readdir_prepopulate(req, req->r_session); } + current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); + + if (err == 0) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, + &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } + + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); + } out_err: mutex_lock(&mdsc->mutex); - if (!req->r_aborted) { + if 
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { - req->r_reply = msg; - ceph_msg_get(msg); - req->r_got_result = true; + req->r_reply = ceph_msg_get(msg); + set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { - dout("reply arrived after request %lld was aborted\n", tid); + doutc(cl, "reply arrived after request %lld was aborted\n", tid); } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ complete_request(mdsc, req); + + ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, + req->r_end_latency, err); out: ceph_mdsc_put_request(req); + + /* Defer closing the sessions until after the s_mutex lock is released */ + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } /* @@ -2246,6 +4048,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; u64 tid = le64_to_cpu(msg->hdr.tid); u32 next_mds; @@ -2253,41 +4056,89 @@ static void handle_forward(struct ceph_mds_client *mdsc, int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; + bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { - dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); - goto out; /* dup reply? */ + mutex_unlock(&mdsc->mutex); + doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds); + return; /* dup reply? */ } - if (req->r_aborted) { - dout("forward tid %llu aborted, unregistering\n", tid); + if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + doutc(cl, "forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); - } else if (fwd_seq <= req->r_num_fwd) { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { + /* + * Avoid infinite retrying after overflow. + * + * The MDS will increase the fwd count; on the client side, if + * the num_fwd is less than the one saved in the request, that + * means the MDS is an old version whose 8-bit counter has + * overflowed. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", + tid); } else { /* resend.
forward race not possible; mds would drop */ - dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); - BUG_ON(req->r_got_result); + BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); + req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); __do_request(mdsc, req); } - ceph_mdsc_put_request(req); -out: mutex_unlock(&mdsc->mutex); + + /* kick calling process */ + if (aborted) + complete_request(mdsc, req); + ceph_mdsc_put_request(req); return; bad: - pr_err("mdsc_handle_forward decode error err=%d\n", err); + pr_err_client(cl, "decode error err=%d\n", err); + ceph_msg_dump(msg); +} + +static int __decode_session_metadata(void **p, void *end, + bool *blocklisted) +{ + /* map<string,string> */ + u32 n; + bool err_str; + ceph_decode_32_safe(p, end, n, bad); + while (n-- > 0) { + u32 len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); + *p += len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + /* + * Match "blocklisted (blacklisted)" from newer MDSes, + * or "blacklisted" from older MDSes. + */ + if (err_str && strnstr(*p, "blacklisted", len)) + *blocklisted = true; + *p += len; + } + return 0; +bad: + return -1; } /* @@ -2297,42 +4148,203 @@ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - u32 op; - u64 seq; + struct ceph_client *cl = mdsc->fsc->client; int mds = session->s_mds; - struct ceph_mds_session_head *h = msg->front.iov_base; + int msg_version = le16_to_cpu(msg->hdr.version); + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mds_session_head *h; + struct ceph_mds_cap_auth *cap_auths = NULL; + u32 op, cap_auths_num = 0; + u64 seq, features = 0; int wake = 0; + bool blocklisted = false; + u32 i; + /* decode */ - if (msg->front.iov_len != sizeof(*h)) - goto bad; + ceph_decode_need(&p, end, sizeof(*h), bad); + h = p; + p += sizeof(*h); + op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); + if (msg_version >= 3) { + u32 len; + /* version >= 2 and < 5, decode metadata, skip otherwise + * as it's handled via flags. 
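+ * (Descriptive note: the metadata is an encoded map<string,string>; v5+ MDSes report blocklisting via the flags word decoded below, so the map can be skipped wholesale, while for older MDSes __decode_session_metadata() scans it for an "error_string" entry mentioning blocklisting.)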
+ */ + if (msg_version >= 5) + ceph_decode_skip_map(&p, end, string, string, bad); + else if (__decode_session_metadata(&p, end, &blocklisted) < 0) + goto bad; + + /* version >= 3, feature bits */ + ceph_decode_32_safe(&p, end, len, bad); + if (len) { + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); + } + } + + if (msg_version >= 5) { + u32 flags, len; + + /* version >= 4 */ + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ + ceph_decode_32_safe(&p, end, len, bad); /* len */ + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ + + /* version >= 5, flags */ + ceph_decode_32_safe(&p, end, flags, bad); + if (flags & CEPH_SESSION_BLOCKLISTED) { + pr_warn_client(cl, "mds%d session blocklisted\n", + session->s_mds); + blocklisted = true; + } + } + + if (msg_version >= 6) { + ceph_decode_32_safe(&p, end, cap_auths_num, bad); + doutc(cl, "cap_auths_num %d\n", cap_auths_num); + + if (cap_auths_num && op != CEPH_SESSION_OPEN) { + WARN_ON_ONCE(op != CEPH_SESSION_OPEN); + goto skip_cap_auths; + } + + cap_auths = kcalloc(cap_auths_num, + sizeof(struct ceph_mds_cap_auth), + GFP_KERNEL); + if (!cap_auths) { + pr_err_client(cl, "No memory for cap_auths\n"); + return; + } + + for (i = 0; i < cap_auths_num; i++) { + u32 _len, j; + + /* struct_v, struct_compat, and struct_len in MDSCapAuth */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); + + /* struct_v, struct_compat, and struct_len in MDSCapMatch */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); + ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad); + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.gids = kcalloc(_len, sizeof(u32), + GFP_KERNEL); + if (!cap_auths[i].match.gids) { + pr_err_client(cl, "No memory for gids\n"); + goto fail; + } + + cap_auths[i].match.num_gids = _len; + for (j = 0; j < _len; j++) + ceph_decode_32_safe(&p, end, + cap_auths[i].match.gids[j], + bad); + } + + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char), + GFP_KERNEL); + if (!cap_auths[i].match.path) { + pr_err_client(cl, "No memory for path\n"); + goto fail; + } + ceph_decode_copy(&p, cap_auths[i].match.path, _len); + + /* Remove the trailing '/' */ + while (_len && cap_auths[i].match.path[_len - 1] == '/') { + cap_auths[i].match.path[_len - 1] = '\0'; + _len -= 1; + } + } + + ceph_decode_32_safe(&p, end, _len, bad); + if (_len) { + cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char), + GFP_KERNEL); + if (!cap_auths[i].match.fs_name) { + pr_err_client(cl, "No memory for fs_name\n"); + goto fail; + } + ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len); + } + + ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad); + ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad); + ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad); + doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n", + cap_auths[i].match.uid, cap_auths[i].match.num_gids, + cap_auths[i].match.path, cap_auths[i].match.fs_name, + cap_auths[i].match.root_squash, + cap_auths[i].readable, cap_auths[i].writeable); + } + } + +skip_cap_auths: mutex_lock(&mdsc->mutex); - if (op == CEPH_SESSION_CLOSE) + if (op == CEPH_SESSION_OPEN) { + if (mdsc->s_cap_auths) { + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + kfree(mdsc->s_cap_auths[i].match.gids); + kfree(mdsc->s_cap_auths[i].match.path); + kfree(mdsc->s_cap_auths[i].match.fs_name); + } + kfree(mdsc->s_cap_auths); + } + mdsc->s_cap_auths_num =
cap_auths_num; + mdsc->s_cap_auths = cap_auths; + } + if (op == CEPH_SESSION_CLOSE) { + ceph_get_mds_session(session); __unregister_session(mdsc, session); + } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - dout("handle_session mds%d %s %p state %s seq %llu\n", - mds, ceph_session_op_name(op), session, - session_state_name(session->s_state), seq); + doutc(cl, "mds%d %s %p state %s seq %llu\n", mds, + ceph_session_op_name(op), session, + ceph_session_state_name(session->s_state), seq); if (session->s_state == CEPH_MDS_SESSION_HUNG) { session->s_state = CEPH_MDS_SESSION_OPEN; - pr_info("mds%d came back\n", session->s_mds); + pr_info_client(cl, "mds%d came back\n", session->s_mds); } switch (op) { case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) - pr_info("mds%d reconnect success\n", session->s_mds); - session->s_state = CEPH_MDS_SESSION_OPEN; - renewed_caps(mdsc, session, 0); + pr_info_client(cl, "mds%d reconnect success\n", + session->s_mds); + + session->s_features = features; + if (session->s_state == CEPH_MDS_SESSION_OPEN) { + pr_notice_client(cl, "mds%d is already opened\n", + session->s_mds); + } else { + session->s_state = CEPH_MDS_SESSION_OPEN; + renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, + &session->s_features)) + metric_schedule_delayed(&mdsc->metric); + } + + /* + * The connection may be broken and the session on the + * client side may have been reinitialized, so update + * the seq anyway. + */ + if (!session->s_seq && seq) + session->s_seq = seq; + wake = 1; if (mdsc->stopping) __close_session(mdsc, session); @@ -2345,29 +4357,59 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) - pr_info("mds%d reconnect denied\n", session->s_mds); + pr_info_client(cl, "mds%d reconnect denied\n", + session->s_mds); + session->s_state = CEPH_MDS_SESSION_CLOSED; + cleanup_session_requests(mdsc, session); remove_session_caps(session); - wake = 1; /* for good measure */ + wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); - kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: - pr_info("mds%d caps went stale, renewing\n", - session->s_mds); - spin_lock(&session->s_gen_ttl_lock); - session->s_cap_gen++; + pr_info_client(cl, "mds%d caps went stale, renewing\n", + session->s_mds); + atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; - spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; case CEPH_SESSION_RECALL_STATE: - trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); + break; + + case CEPH_SESSION_FLUSHMSG: + /* flush cap releases */ + spin_lock(&session->s_cap_lock); + if (session->s_num_cap_releases) + ceph_flush_session_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); + + send_flushmsg_ack(mdsc, session, seq); + break; + + case CEPH_SESSION_FORCE_RO: + doutc(cl, "force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, FORCE_RO); + break; + + case CEPH_SESSION_REJECT: + WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); + pr_info_client(cl, "mds%d rejected session\n", + session->s_mds); + session->s_state = CEPH_MDS_SESSION_REJECTED; +
cleanup_session_requests(mdsc, session); + remove_session_caps(session); + if (blocklisted) + mdsc->fsc->blocklisted = true; + wake = 2; /* for good measure */ break; default: - pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + pr_err_client(cl, "bad op %d mds%d\n", op, mds); WARN_ON(1); } @@ -2375,17 +4417,51 @@ static void handle_session(struct ceph_mds_session *session, if (wake) { mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); + if (wake == 2) + kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } + if (op == CEPH_SESSION_CLOSE) + ceph_put_mds_session(session); return; bad: - pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, - (int)msg->front.iov_len); + pr_err_client(cl, "corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); ceph_msg_dump(msg); +fail: + for (i = 0; i < cap_auths_num; i++) { + kfree(cap_auths[i].match.gids); + kfree(cap_auths[i].match.path); + kfree(cap_auths[i].match.fs_name); + } + kfree(cap_auths); return; } +void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) +{ + struct ceph_client *cl = req->r_mdsc->fsc->client; + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); + } +} + +void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) +{ + struct ceph_client *cl = req->r_mdsc->fsc->client; + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); + } +} /* * called under session->mutex. @@ -2394,134 +4470,394 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; - int err; + struct rb_node *p; - dout("replay_unsafe_requests mds%d\n", session->s_mds); + doutc(mdsc->fsc->client, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); - list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds); - if (!err) { - ceph_msg_get(req->r_request); - ceph_con_send(&session->s_con, req->r_request); - } + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) + __send_request(session, req, true); + + /* + * also re-send old requests when the MDS enters the reconnect stage, so + * that the MDS can process completed requests in the clientreplay stage.
+ */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (!req->r_session) + continue; + if (req->r_session->s_mds != session->s_mds) + continue; + + ceph_mdsc_release_dir_caps_async(req); + + __send_request(session, req, true); } mutex_unlock(&mdsc->mutex); } +static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) +{ + struct ceph_msg *reply; + struct ceph_pagelist *_pagelist; + struct page *page; + __le32 *addr; + int err = -ENOMEM; + + if (!recon_state->allow_multi) + return -ENOSPC; + + /* can't handle message that contains both caps and realm */ + BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); + + /* pre-allocate new pagelist */ + _pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!_pagelist) + return -ENOMEM; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_msg; + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + + if (recon_state->nr_caps) { + /* currently encoding caps */ + err = ceph_pagelist_encode_32(recon_state->pagelist, 0); + if (err) + goto fail; + } else { + /* placeholder for nr_realms (currently encoding realms) */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + } + + err = ceph_pagelist_encode_8(recon_state->pagelist, 1); + if (err) + goto fail; + + page = list_first_entry(&recon_state->pagelist->head, struct page, lru); + addr = kmap_atomic(page); + if (recon_state->nr_caps) { + /* currently encoding caps */ + *addr = cpu_to_le32(recon_state->nr_caps); + } else { + /* currently encoding realms */ + *(addr + 1) = cpu_to_le32(recon_state->nr_realms); + } + kunmap_atomic(addr); + + reply->hdr.version = cpu_to_le16(5); + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state->pagelist); + + ceph_con_send(&recon_state->session->s_con, reply); + ceph_pagelist_release(recon_state->pagelist); + + recon_state->pagelist = _pagelist; + recon_state->nr_caps = 0; + recon_state->nr_realms = 0; + recon_state->msg_version = 5; + return 0; +fail: + ceph_msg_put(reply); +fail_msg: + ceph_pagelist_release(_pagelist); + return err; +} + +static struct dentry* d_find_primary(struct inode *inode) +{ + struct dentry *alias, *dn = NULL; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + + spin_lock(&inode->i_lock); + if (hlist_empty(&inode->i_dentry)) + goto out_unlock; + + if (S_ISDIR(inode->i_mode)) { + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + if (!IS_ROOT(alias)) + dn = dget(alias); + goto out_unlock; + } + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + if (!d_unhashed(alias) && + (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { + dn = dget_dlock(alias); + } + spin_unlock(&alias->d_lock); + if (dn) + break; + } +out_unlock: + spin_unlock(&inode->i_lock); + return dn; +} + /* * Encode information about a cap for a reconnect with the MDS.
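+ * (For orientation, a sketch of what one record holds, as built below: the inode number, the primary-dentry path string, and a v1 or v2 ceph_mds_cap_reconnect body; v2+ records additionally append the encoded file-lock state and, for struct_v >= 2, a snap_follows value.)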
*/ -static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - size_t reclen; - struct ceph_inode_info *ci; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; - char *path; - int pathlen, err; - u64 pathbase; struct dentry *dentry; + struct ceph_cap *cap; + struct ceph_path_info path_info = {0}; + int err; + u64 snap_follows; - ci = cap->ci; - - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", - inode, ceph_vinop(inode), cap, cap->cap_id, - ceph_cap_string(cap->issued)); - err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - if (err) - return err; - - dentry = d_find_alias(inode); + dentry = d_find_primary(inode); if (dentry) { - path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); + /* set pathbase to parent dir when msg_version >= 2 */ + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, + recon_state->msg_version >= 2); + dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); - goto out_dput; + goto out_err; } - } else { - path = NULL; - pathlen = 0; } - err = ceph_pagelist_encode_string(pagelist, path, pathlen); - if (err) - goto out_free; spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + err = 0; + goto out_err; + } + doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode, + ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = atomic_read(&cap->session->s_cap_gen); + + /* These are lost when the session goes away */ + if (S_ISDIR(inode->i_mode)) { + if (cap->issued & CEPH_CAP_DIR_CREATE) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } + cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; + } - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = 0; - reclen = sizeof(rec.v2); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { + struct timespec64 ts; + rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); - rec.v1.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.size = cpu_to_le64(i_size_read(inode)); + ts = inode_get_mtime(inode); + ceph_encode_timespec64(&rec.v1.mtime, &ts); + ts = inode_get_atime(inode); + ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); - reclen = sizeof(rec.v1); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); + } + + if (list_empty(&ci->i_cap_snaps)) { + snap_follows = ci->i_head_snapc ? 
ci->i_head_snapc->seq : 0; + } else { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + snap_follows = capsnap->follows; } spin_unlock(&ci->i_ceph_lock); - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; - struct ceph_filelock *flocks; + struct ceph_filelock *flocks = NULL; + size_t struct_len, total_len = sizeof(u64); + u8 struct_v = 0; encode_again: - spin_lock(&inode->i_lock); - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - spin_unlock(&inode->i_lock); - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock), GFP_NOFS); - if (!flocks) { - err = -ENOMEM; - goto out_free; - } - spin_lock(&inode->i_lock); - err = ceph_encode_locks_to_buffer(inode, flocks, - num_fcntl_locks, - num_flock_locks); - spin_unlock(&inode->i_lock); - if (err) { + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; + } + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc_array(num_fcntl_locks + num_flock_locks, + sizeof(struct ceph_filelock), + GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_err; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_err; + } + } else { kfree(flocks); - if (err == -ENOSPC) - goto encode_again; - goto out_free; + flocks = NULL; + } + + if (recon_state->msg_version >= 3) { + /* version, compat_version and struct_len */ + total_len += 2 * sizeof(u8) + sizeof(u32); + struct_v = 2; } /* * number of encoded locks is stable, so copy to pagelist */ - rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_locks_to_pagelist(flocks, pagelist, - num_fcntl_locks, - num_flock_locks); + struct_len = 2 * sizeof(u32) + + (num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock); + rec.v2.flock_len = cpu_to_le32(struct_len); + + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); + + if (struct_v >= 2) + struct_len += sizeof(u64); /* snap_follows */ + + total_len += struct_len; + + if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto out_freeflocks; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, total_len); + if (err) + goto out_freeflocks; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + if (recon_state->msg_version >= 3) { + ceph_pagelist_encode_8(pagelist, struct_v); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, struct_len); + } + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); + ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, num_flock_locks); + if (struct_v >= 2) + ceph_pagelist_encode_64(pagelist, snap_follows); +out_freeflocks: kfree(flocks); } else { - err = ceph_pagelist_append(pagelist, &rec, reclen); + err = ceph_pagelist_reserve(pagelist, + sizeof(u64) + sizeof(u32) + + path_info.pathlen + sizeof(rec.v1)); + if (err) + goto out_err; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); + 
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } -out_free: - kfree(path); -out_dput: - dput(dentry); + +out_err: + ceph_mdsc_free_path_info(&path_info); + if (!err) + recon_state->nr_caps++; + return err; +} + +static int encode_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_reconnect_state *recon_state) +{ + struct rb_node *p; + struct ceph_pagelist *pagelist = recon_state->pagelist; + struct ceph_client *cl = mdsc->fsc->client; + int err = 0; + + if (recon_state->msg_version >= 4) { + err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); + if (err < 0) + goto fail; + } + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + if (recon_state->msg_version >= 4) { + size_t need = sizeof(u8) * 2 + sizeof(u32) + + sizeof(sr_rec); + + if (pagelist->length + need > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto fail; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, need); + if (err) + goto fail; + + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); + } + + doutc(cl, " adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + + recon_state->nr_realms++; + } +fail: return err; } @@ -2535,34 +4871,58 @@ out_dput: * recovering MDS might have. * * This is a relatively heavyweight operation, but it's rare. - * - * called with mdsc->mutex held. */ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *reply; - struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; - struct ceph_pagelist *pagelist; - struct ceph_reconnect_state recon_state; + struct ceph_reconnect_state recon_state = { + .session = session, + }; + LIST_HEAD(dispose); - pr_info("mds%d reconnect start\n", mds); + pr_info_client(cl, "mds%d reconnect start\n", mds); - pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); - if (!pagelist) + recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!recon_state.pagelist) goto fail_nopagelist; - ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_nomsg; + xa_destroy(&session->s_delegated_inos); + mutex_lock(&session->s_mutex); session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; + doutc(cl, "session %p state %s\n", session, + ceph_session_state_name(session->s_state)); + + atomic_inc(&session->s_cap_gen); + + spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap gets released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release.
+ */ + session->s_cap_reconnect = 1; + /* drop old cap expires; we're about to reestablish that state */ + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); + + /* trim unused caps to reduce MDS's cache rejoin time */ + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); + ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, CEPH_ENTITY_TYPE_MDS, mds, @@ -2571,52 +4931,91 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); - down_read(&mdsc->snap_rwsem); - - dout("session %p state %s\n", session, - session_state_name(session->s_state)); + ceph_early_kick_flushing_caps(mdsc, session); - /* drop old cap expires; we're about to reestablish that state */ - discard_cap_releases(mdsc, session); + down_read(&mdsc->snap_rwsem); - /* traverse this session's caps */ - err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; - recon_state.pagelist = pagelist; - recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; - err = iterate_session_caps(session, encode_caps_cb, &recon_state); + if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { + recon_state.msg_version = 3; + recon_state.allow_multi = true; + } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { + recon_state.msg_version = 3; + } else { + recon_state.msg_version = 2; + } + /* traverse this session's caps */ + err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); + + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + if (err < 0) goto fail; - /* - * snaprealms. we provide mds with the ino, seq (version), and - * parent for all of our realms. If the mds has any newer info, - * it will tell us. 
- */ - for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { - struct ceph_snap_realm *realm = - rb_entry(p, struct ceph_snap_realm, node); - struct ceph_mds_snaprealm_reconnect sr_rec; + /* check if all realms can be encoded into current message */ + if (mdsc->num_snap_realms) { + size_t total_len = + recon_state.pagelist->length + + mdsc->num_snap_realms * + sizeof(struct ceph_mds_snaprealm_reconnect); + if (recon_state.msg_version >= 4) { + /* number of realms */ + total_len += sizeof(u32); + /* version, compat_version and struct_len */ + total_len += mdsc->num_snap_realms * + (2 * sizeof(u8) + sizeof(u32)); + } + if (total_len > RECONNECT_MAX_SIZE) { + if (!recon_state.allow_multi) { + err = -ENOSPC; + goto fail; + } + if (recon_state.nr_caps) { + err = send_reconnect_partial(&recon_state); + if (err) + goto fail; + } + recon_state.msg_version = 5; + } + } - dout(" adding snap realm %llx seq %lld parent %llx\n", - realm->ino, realm->seq, realm->parent_ino); - sr_rec.ino = cpu_to_le64(realm->ino); - sr_rec.seq = cpu_to_le64(realm->seq); - sr_rec.parent = cpu_to_le64(realm->parent_ino); - err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); - if (err) + err = encode_snap_realms(mdsc, &recon_state); + if (err < 0) + goto fail; + + if (recon_state.msg_version >= 5) { + err = ceph_pagelist_encode_8(recon_state.pagelist, 0); + if (err < 0) goto fail; } - if (recon_state.flock) - reply->hdr.version = cpu_to_le16(2); - if (pagelist->length) { - /* set up outbound data if we have any */ - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + if (recon_state.nr_caps || recon_state.nr_realms) { + struct page *page = + list_first_entry(&recon_state.pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + if (recon_state.nr_caps) { + WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); + *addr = cpu_to_le32(recon_state.nr_caps); + } else if (recon_state.msg_version >= 4) { + *(addr + 1) = cpu_to_le32(recon_state.nr_realms); + } + kunmap_atomic(addr); } + + reply->hdr.version = cpu_to_le16(recon_state.msg_version); + if (recon_state.msg_version >= 4) + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state.pagelist); + ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -2626,6 +5025,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); + ceph_pagelist_release(recon_state.pagelist); return; fail: @@ -2633,10 +5033,10 @@ fail: up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: - ceph_pagelist_release(pagelist); - kfree(pagelist); + ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: - pr_err("error %d preparing reconnect for mds%d\n", err, mds); + pr_err_client(cl, "error %d preparing reconnect for mds%d\n", + err, mds); return; } @@ -2651,48 +5051,64 @@ static void check_new_map(struct ceph_mds_client *mdsc, struct ceph_mdsmap *newmap, struct ceph_mdsmap *oldmap) { - int i; + int i, j, err; int oldstate, newstate; struct ceph_mds_session *s; + unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - dout("check_new_map new %u old %u\n", - newmap->m_epoch, oldmap->m_epoch); + if (newmap->m_info) { + for (i = 0; i < newmap->possible_max_rank; i++) { + for 
(j = 0; j < newmap->m_info[i].num_export_targets; j++) + set_bit(newmap->m_info[i].export_targets[j], targets); + } + } - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { - if (mdsc->sessions[i] == NULL) + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { + if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", - i, ceph_mds_state_name(oldstate), - ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", - ceph_mds_state_name(newstate), - ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", - session_state_name(s->s_state)); + doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + ceph_session_state_name(s->s_state)); + + if (i >= newmap->possible_max_rank) { + /* force close session for stopped mds */ + ceph_get_mds_session(s); + __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); - if (i >= newmap->m_max_mds || - memcmp(ceph_mdsmap_get_addr(oldmap, i), - ceph_mdsmap_get_addr(newmap, i), - sizeof(struct ceph_entity_addr))) { - if (s->s_state == CEPH_MDS_SESSION_OPENING) { - /* the session never opened, just close it - * out now */ - __wake_requests(mdsc, &s->s_waiting); - __unregister_session(mdsc, s); - } else { - /* just close it */ - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_lock(&mdsc->mutex); - ceph_con_close(&s->s_con); - mutex_unlock(&s->s_mutex); - s->s_state = CEPH_MDS_SESSION_RESTARTING; - } + ceph_put_mds_session(s); - /* kick any requests waiting on the recovering mds */ + mutex_lock(&mdsc->mutex); kick_requests(mdsc, i); + continue; + } + + if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + ceph_mdsmap_get_addr(newmap, i), + sizeof(struct ceph_entity_addr))) { + /* just close it */ + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); + ceph_con_close(&s->s_con); + mutex_unlock(&s->s_mutex); + s->s_state = CEPH_MDS_SESSION_RESTARTING; } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -2703,6 +5119,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, if (s->s_state == CEPH_MDS_SESSION_RESTARTING && newstate >= CEPH_MDS_STATE_RECONNECT) { mutex_unlock(&mdsc->mutex); + clear_bit(i, targets); send_mds_reconnect(mdsc, s); mutex_lock(&mdsc->mutex); } @@ -2714,14 +5131,65 @@ static void check_new_map(struct ceph_mds_client *mdsc, newstate >= CEPH_MDS_STATE_ACTIVE) { if (oldstate != CEPH_MDS_STATE_CREATING && oldstate != CEPH_MDS_STATE_STARTING) - pr_info("mds%d recovery completed\n", s->s_mds); + pr_info_client(cl, "mds%d recovery completed\n", + s->s_mds); kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); ceph_kick_flushing_caps(mdsc, s); - wake_up_session_caps(s, 1); + mutex_unlock(&s->s_mutex); + wake_up_session_caps(s, RECONNECT); + } + } + + /* + * Only open and reconnect sessions that don't exist yet. 
+ */ + for (i = 0; i < newmap->possible_max_rank; i++) { + /* + * If the importing MDS crashes just after the + * EImportStart journal is flushed, then when a + * standby MDS takes over and replays the + * EImportStart journal, the new MDS daemon will + * wait for the client to reconnect, but the + * client may never have registered/opened the + * session yet. + * + * So try to reconnect to that MDS daemon if its + * rank number is in the export targets array and + * it is in the up:reconnect state. + */ + newstate = ceph_mdsmap_get_state(newmap, i); + if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT) + continue; + + /* + * In rare cases the session may already have been + * registered and opened by requests that picked a + * random MDS during the mdsc->mutex unlock/lock gap + * below. But the related MDS daemon will just queue + * those requests and keep waiting for the client's + * reconnection request in the up:reconnect state. + */ + s = __ceph_lookup_mds_session(mdsc, i); + if (likely(!s)) { + s = __open_export_target_session(mdsc, i); + if (IS_ERR(s)) { + err = PTR_ERR(s); + pr_err_client(cl, + "failed to open export target session, err %d\n", + err); + continue; + } } + doutc(cl, "send reconnect to export target mds.%d\n", i); + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; @@ -2730,8 +5198,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG || s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout(" connecting to export targets of laggy mds%d\n", - i); + doutc(cl, " connecting to export targets of laggy mds%d\n", i); __open_export_target_sessions(mdsc, s); } } @@ -2758,6 +5225,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct dentry *parent, *dentry; @@ -2769,7 +5237,10 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct qstr dname; int release = 0; - dout("handle_lease from mds%d\n", mds); + doutc(cl, "from mds%d\n", mds); + + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) @@ -2777,32 +5248,30 @@ static void handle_lease(struct ceph_mds_client *mdsc, vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); - dname.name = (void *)h + sizeof(*h) + sizeof(u32); - dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); - if (dname.len != get_unaligned_le32(h+1)) + dname.len = get_unaligned_le32(h + 1); + if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len) goto bad; - - mutex_lock(&session->s_mutex); - session->s_seq++; + dname.name = (void *)(h + 1) + sizeof(u32); /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), vino.ino, inode, - dname.len, dname.name); - if (inode == NULL) { - dout("handle_lease no inode %llx\n", vino.ino); + doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action), + vino.ino, inode, dname.len, dname.name); + + mutex_lock(&session->s_mutex); + if (!inode) { + doutc(cl, "no inode %llx\n", vino.ino); goto release; } /* dentry */
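+ /* (The lease names a directory entry: take an alias of the directory inode, hash the decoded name against it, then look up the child dentry, mirroring a normal VFS lookup.) */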
parent = d_find_alias(inode); if (!parent) { - dout("no parent dentry on inode %p\n", inode); + doutc(cl, "no parent dentry on inode %p\n", inode); WARN_ON(1); goto release; /* hrm... */ } - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); dentry = d_lookup(parent, &dname); dput(parent); if (!dentry) @@ -2822,14 +5291,14 @@ static void handle_lease(struct ceph_mds_client *mdsc, case CEPH_MDS_LEASE_RENEW: if (di->lease_session == session && - di->lease_gen == session->s_cap_gen && + di->lease_gen == atomic_read(&session->s_cap_gen) && di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = - le32_to_cpu(h->duration_ms) * HZ / 1000; + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; - dentry->d_time = di->lease_renew_from + duration; + di->time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); di->lease_renew_from = 0; @@ -2849,136 +5318,153 @@ release: ceph_con_send(&session->s_con, msg); out: - iput(inode); mutex_unlock(&session->s_mutex); + iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: - pr_err("corrupt lease message\n"); + ceph_dec_mds_stopping_blocker(mdsc); + + pr_err_client(cl, "corrupt lease message\n"); ceph_msg_dump(msg); } void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, - struct inode *inode, struct dentry *dentry, char action, u32 seq) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_lease *lease; - int len = sizeof(*lease) + sizeof(u32); - int dnamelen = 0; + struct inode *dir; + int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; - dout("lease_send_msg inode %p dentry %p %s to mds%d\n", - inode, dentry, ceph_lease_op_name(action), session->s_mds); - dnamelen = dentry->d_name.len; - len += dnamelen; + doutc(cl, "dentry %p %s to mds%d\n", dentry, ceph_lease_op_name(action), + session->s_mds); msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; - lease->ino = cpu_to_le64(ceph_vino(inode).ino); - lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); - put_unaligned_le32(dnamelen, lease + 1); - memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen); - /* - * if this is a preemptive lease RELEASE, no need to - * flush request stream, since the actual request will - * soon follow. - */ - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); + spin_lock(&dentry->d_lock); + dir = d_inode(dentry->d_parent); + lease->ino = cpu_to_le64(ceph_ino(dir)); + lease->first = lease->last = cpu_to_le64(ceph_snap(dir)); + + put_unaligned_le32(dentry->d_name.len, lease + 1); + memcpy((void *)(lease + 1) + 4, + dentry->d_name.name, dentry->d_name.len); + spin_unlock(&dentry->d_lock); ceph_con_send(&session->s_con, msg); } /* - * Preemptively release a lease we expect to invalidate anyway. - * Pass @inode always, @dentry is optional.
+ * lock and unlock the session, to wait for ongoing session activities */ -void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry) +static void lock_unlock_session(struct ceph_mds_session *s) { - struct ceph_dentry_info *di; - struct ceph_mds_session *session; - u32 seq; + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); +} - BUG_ON(inode == NULL); - BUG_ON(dentry == NULL); +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_fs_client *fsc = mdsc->fsc; - /* is dentry lease valid? */ - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (!di || !di->lease_session || - di->lease_session->s_mds < 0 || - di->lease_gen != di->lease_session->s_cap_gen || - !time_before(jiffies, dentry->d_time)) { - dout("lease_release inode %p dentry %p -- " - "no lease\n", - inode, dentry); - spin_unlock(&dentry->d_lock); + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) return; - } - /* we do have a lease on this dentry; note mds and seq */ - session = ceph_get_mds_session(di->lease_session); - seq = di->lease_seq; - __ceph_mdsc_drop_dentry_lease(dentry); - spin_unlock(&dentry->d_lock); + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; - dout("lease_release inode %p dentry %p to mds%d\n", - inode, dentry, session->s_mds); - ceph_mdsc_lease_send_msg(session, inode, dentry, - CEPH_MDS_LEASE_RELEASE, seq); - ceph_put_mds_session(session); + if (!READ_ONCE(fsc->blocklisted)) + return; + + pr_info_client(cl, "auto reconnect after blocklisted\n"); + ceph_force_reconnect(fsc->sb); } -/* - * drop all leases (and dentry refs) in preparation for umount - */ -static void drop_leases(struct ceph_mds_client *mdsc) +bool check_session_state(struct ceph_mds_session *s) { - int i; + struct ceph_client *cl = s->s_mdsc->fsc->client; - dout("drop_leases\n"); - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_unlock(&s->s_mutex); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); + switch (s->s_state) { + case CEPH_MDS_SESSION_OPEN: + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info_client(cl, "mds%d hung\n", s->s_mds); + } + break; + case CEPH_MDS_SESSION_CLOSING: + case CEPH_MDS_SESSION_NEW: + case CEPH_MDS_SESSION_RESTARTING: + case CEPH_MDS_SESSION_CLOSED: + case CEPH_MDS_SESSION_REJECTED: + return false; } - mutex_unlock(&mdsc->mutex); + + return true; } +/* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. + */ +void inc_session_sequence(struct ceph_mds_session *s) +{ + struct ceph_client *cl = s->s_mdsc->fsc->client; + + lockdep_assert_held(&s->s_mutex); + + s->s_seq++; + + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + int ret; + doutc(cl, "resending session close request for mds%d\n", s->s_mds); + ret = request_close_session(s); + if (ret < 0) + pr_err_client(cl, "unable to close session to mds%d: %d\n", + s->s_mds, ret); + } +} /* - * delayed work -- periodically trim expired leases, renew caps with mds + * delayed work -- periodically trim expired leases, renew caps with mds. If + * the @delay parameter is set to 0 or if it's more than 5 secs, the default + * workqueue delay value of 5 secs will be used.
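+ * For example (illustrative): schedule_delayed(mdsc, 0) and schedule_delayed(mdsc, 60 * HZ) both arm the work about 5 secs out, while schedule_delayed(mdsc, HZ) arms it after roughly one second.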
*/ -static void schedule_delayed(struct ceph_mds_client *mdsc) +static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay) { - int delay = 5; - unsigned hz = round_jiffies_relative(HZ * delay); - schedule_delayed_work(&mdsc->delayed_work, hz); + unsigned long max_delay = HZ * 5; + + /* 5 secs default delay */ + if (!delay || (delay > max_delay)) + delay = max_delay; + schedule_delayed_work(&mdsc->delayed_work, + round_jiffies_relative(delay)); } static void delayed_work(struct work_struct *work) { - int i; struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, delayed_work.work); + unsigned long delay; int renew_interval; int renew_caps; + int i; - dout("mdsc delayed_work\n"); - ceph_check_delayed_caps(mdsc); + doutc(mdsc->fsc->client, "mdsc delayed_work\n"); + + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) + return; mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; @@ -2989,34 +5475,22 @@ static void delayed_work(struct work_struct *work) for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (s == NULL) - continue; - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(mdsc, s); - ceph_put_mds_session(s); + if (!s) continue; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { - s->s_state = CEPH_MDS_SESSION_HUNG; - pr_info("mds%d hung\n", s->s_mds); - } - } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { - /* this mds is failed or recovering, just wait */ + + if (!check_session_state(s)) { ceph_put_mds_session(s); continue; } mutex_unlock(&mdsc->mutex); + ceph_flush_session_cap_releases(mdsc, s); + mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -3027,57 +5501,95 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); - schedule_delayed(mdsc); + delay = ceph_check_delayed_caps(mdsc); + + ceph_queue_cap_reclaim_work(mdsc); + + ceph_trim_snapid_map(mdsc); + + maybe_recover_session(mdsc); + + schedule_delayed(mdsc, delay); } int ceph_mdsc_init(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc; + int err; mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); if (!mdsc) return -ENOMEM; mdsc->fsc = fsc; - fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) { - kfree(mdsc); - return -ENOMEM; + if (!mdsc->mdsmap) { + err = -ENOMEM; + goto err_mdsc; } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); + atomic64_set(&mdsc->dirty_folios, 0); + init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); - mdsc->sessions = NULL; - mdsc->max_sessions = 0; - mdsc->stopping = 0; + mdsc->quotarealms_inodes = RB_ROOT; + mutex_init(&mdsc->quotarealms_inodes_mutex); init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); - mdsc->last_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; 
INIT_LIST_HEAD(&mdsc->cap_delay_list); +#ifdef CONFIG_DEBUG_FS + INIT_LIST_HEAD(&mdsc->cap_wait_list); +#endif spin_lock_init(&mdsc->cap_delay_lock); + INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); - mdsc->cap_flush_seq = 0; - INIT_LIST_HEAD(&mdsc->cap_dirty); + mdsc->last_cap_flush_tid = 1; + INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); - mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); - spin_lock_init(&mdsc->dentry_lru_lock); - INIT_LIST_HEAD(&mdsc->dentry_lru); + INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); + err = ceph_metric_init(&mdsc->metric); + if (err) + goto err_mdsmap; + + spin_lock_init(&mdsc->dentry_list_lock); + INIT_LIST_HEAD(&mdsc->dentry_leases); + INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, fsc->min_caps); + ceph_adjust_caps_max_min(mdsc, fsc->mount_options); + + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); + + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + + strscpy(mdsc->nodename, utsname()->nodename, + sizeof(mdsc->nodename)); + fsc->mdsc = mdsc; return 0; + +err_mdsmap: + kfree(mdsc->mdsmap); +err_mdsc: + kfree(mdsc); + return err; } /* @@ -3086,27 +5598,228 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; - struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { mutex_unlock(&mdsc->mutex); - dout("wait_requests waiting for requests\n"); + doutc(cl, "waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - fsc->client->options->mount_timeout * HZ); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); while ((req = __get_oldest_req(mdsc))) { - dout("wait_requests timed out on tid %llu\n", - req->r_tid); + doutc(cl, "timed out on tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); __unregister_request(mdsc, req); } } mutex_unlock(&mdsc->mutex); - dout("wait_requests done\n"); + doutc(cl, "done\n"); +} + +void send_flush_mdlog(struct ceph_mds_session *s) +{ + struct ceph_client *cl = s->s_mdsc->fsc->client; + struct ceph_msg *msg; + + /* + * Pre-luminous MDS crashes when it sees an unknown session request + */ + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) + return; + + mutex_lock(&s->s_mutex); + doutc(cl, "request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, + s->s_seq); + if (!msg) { + pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + } else { + ceph_con_send(&s->s_con, msg); + } + mutex_unlock(&s->s_mutex); +} + +static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, + struct ceph_mds_cap_auth *auth, + const struct cred *cred, + char *tpath) +{ + u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); + u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); + struct ceph_client *cl = mdsc->fsc->client; + const char *fs_name =
mdsc->fsc->mount_options->mds_namespace; + const char *spath = mdsc->fsc->mount_options->server_path; + bool gid_matched = false; + u32 gid, tlen, len; + int i, j; + + doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n", + fs_name, auth->match.fs_name ? auth->match.fs_name : ""); + if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) { + /* fsname mismatch, try next one */ + return 0; + } + + doutc(cl, "match.uid %lld\n", auth->match.uid); + if (auth->match.uid != MDS_AUTH_UID_ANY) { + if (auth->match.uid != caller_uid) + return 0; + if (auth->match.num_gids) { + for (i = 0; i < auth->match.num_gids; i++) { + if (caller_gid == auth->match.gids[i]) + gid_matched = true; + } + if (!gid_matched && cred->group_info->ngroups) { + for (i = 0; i < cred->group_info->ngroups; i++) { + gid = from_kgid(&init_user_ns, + cred->group_info->gid[i]); + for (j = 0; j < auth->match.num_gids; j++) { + if (gid == auth->match.gids[j]) { + gid_matched = true; + break; + } + } + if (gid_matched) + break; + } + } + if (!gid_matched) + return 0; + } + } + + /* path match */ + if (auth->match.path) { + if (!tpath) + return 0; + + tlen = strlen(tpath); + len = strlen(auth->match.path); + if (len) { + char *_tpath = tpath; + bool free_tpath = false; + int m, n; + + doutc(cl, "server path %s, tpath %s, match.path %s\n", + spath, tpath, auth->match.path); + if (spath && (m = strlen(spath)) != 1) { + /* mount path + '/' + tpath + an extra space */ + n = m + 1 + tlen + 1; + _tpath = kmalloc(n, GFP_NOFS); + if (!_tpath) + return -ENOMEM; + /* remove the leading '/' */ + snprintf(_tpath, n, "%s/%s", spath + 1, tpath); + free_tpath = true; + tlen = strlen(_tpath); + } + + /* + * Please note the trailing '/' for match.path has already + * been removed when parsing. + * + * Remove the trailing '/' for the target path.
+ */ + while (tlen && _tpath[tlen - 1] == '/') { + _tpath[tlen - 1] = '\0'; + tlen -= 1; + } + doutc(cl, "_tpath %s\n", _tpath); + + /* + * In case first == _tpath && tlen == len: + * match.path=/foo --> /foo _path=/foo --> match + * match.path=/foo/ --> /foo _path=/foo --> match + * + * In case first == _tpath && tlen > len: + * match.path=/foo/ --> /foo _path=/foo/ --> match + * match.path=/foo --> /foo _path=/foo/ --> match + * match.path=/foo/ --> /foo _path=/foo/d --> match + * match.path=/foo --> /foo _path=/food --> mismatch + * + * All the other cases --> mismatch + */ + bool path_matched = true; + char *first = strstr(_tpath, auth->match.path); + if (first != _tpath || + (tlen > len && _tpath[len] != '/')) { + path_matched = false; + } + + if (free_tpath) + kfree(_tpath); + + if (!path_matched) + return 0; + } + } + + doutc(cl, "matched\n"); + return 1; +} + +int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) +{ + const struct cred *cred = get_current_cred(); + u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); + u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); + struct ceph_mds_cap_auth *rw_perms_s = NULL; + struct ceph_client *cl = mdsc->fsc->client; + bool root_squash_perms = true; + int i, err; + + doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n", + tpath, mask, caller_uid, caller_gid); + + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i]; + + err = ceph_mds_auth_match(mdsc, s, cred, tpath); + if (err < 0) { + put_cred(cred); + return err; + } else if (err > 0) { + /* always follow the last auth caps' permission */ + root_squash_perms = true; + rw_perms_s = NULL; + if ((mask & MAY_WRITE) && s->writeable && + s->match.root_squash && (!caller_uid || !caller_gid)) + root_squash_perms = false; + + if (((mask & MAY_WRITE) && !s->writeable) || + ((mask & MAY_READ) && !s->readable)) + rw_perms_s = s; + } + } + + put_cred(cred); + + doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms, + rw_perms_s); + if (root_squash_perms && rw_perms_s == NULL) { + doutc(cl, "access allowed\n"); + return 0; + } + + if (!root_squash_perms) { + doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write", + caller_uid, caller_gid); + } + if (rw_perms_s) { + doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d", + rw_perms_s->readable, rw_perms_s->writeable, + !!(mask & MAY_READ), !!(mask & MAY_WRITE)); + } + doutc(cl, "access denied\n"); + return -EACCES; } /* @@ -3115,10 +5828,11 @@ static void wait_requests(struct ceph_mds_client *mdsc) */ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { - dout("pre_umount\n"); - mdsc->stopping = 1; + doutc(mdsc->fsc->client, "begin\n"); + mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; - drop_leases(mdsc); + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); @@ -3127,18 +5841,24 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) * their inode/dcache refs */ ceph_msgr_flush(); + + ceph_cleanup_quotarealms_inodes(mdsc); + doutc(mdsc->fsc->client, "done\n"); } /* - * wait for all write mds requests to flush. + * flush the mdlog and wait for all write mds requests to flush.
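+ * (Flushing the MDS journal first is a latency optimization: it prompts each MDS to commit pending metadata and issue its safe replies sooner, shortening the r_safe_completion waits below.)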
 */
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
 {
+ struct ceph_client *cl = mdsc->fsc->client;
 struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
 struct rb_node *n;
 
 mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ doutc(cl, "want %lld\n", want_tid);
restart:
 req = __get_oldest_req(mdsc);
 while (req && req->r_tid <= want_tid) {
@@ -3148,15 +5868,34 @@ restart:
 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
 else
 nextreq = NULL;
- if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+ if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+ (req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
 /* write op */
 ceph_mdsc_get_request(req);
 if (nextreq)
 ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
 mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
- req->r_tid, want_tid);
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ doutc(cl, "wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
 wait_for_completion(&req->r_safe_completion);
+
 mutex_lock(&mdsc->mutex);
 ceph_mdsc_put_request(req);
 if (!nextreq)
@@ -3171,58 +5910,63 @@ restart:
 req = nextreq;
 }
 mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ doutc(cl, "done\n");
 }
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
+ struct ceph_client *cl = mdsc->fsc->client;
 u64 want_tid, want_flush;
 
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
 return;
 
- dout("sync\n");
+ doutc(cl, "sync\n");
 mutex_lock(&mdsc->mutex);
 want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
 mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
 
 ceph_flush_dirty_caps(mdsc);
+ ceph_flush_cap_releases(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
 
- wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
+ wait_caps_flush(mdsc, want_flush);
 }
 
 /*
 * true if all sessions are closed, or we force unmount
 */
-static bool done_closing_sessions(struct ceph_mds_client *mdsc)
+static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
 {
- int i, n = 0;
-
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
 return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
+ return atomic_read(&mdsc->num_sessions) <= skipped;
 }
 
 /*
- * called after sb is ro.
+ * called after sb is ro or when the metadata is corrupted.
 */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+ struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_client *cl = mdsc->fsc->client;
 struct ceph_mds_session *session;
 int i;
- struct ceph_fs_client *fsc = mdsc->fsc;
- unsigned long timeout = fsc->client->options->mount_timeout * HZ;
+ int skipped = 0;
 
- dout("close_sessions\n");
+ doutc(cl, "begin\n");
 
 /* close sessions */
 mutex_lock(&mdsc->mutex);
@@ -3232,22 +5976,24 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 continue;
 mutex_unlock(&mdsc->mutex);
 mutex_lock(&session->s_mutex);
- __close_session(mdsc, session);
+ if (__close_session(mdsc, session) <= 0)
+ skipped++;
 mutex_unlock(&session->s_mutex);
 ceph_put_mds_session(session);
 mutex_lock(&mdsc->mutex);
 }
 mutex_unlock(&mdsc->mutex);
 
- dout("waiting for sessions to close\n");
- wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
- timeout);
+ doutc(cl, "waiting for sessions to close\n");
+ wait_event_timeout(mdsc->session_close_wq,
+ done_closing_sessions(mdsc, skipped),
+ ceph_timeout_jiffies(opts->mount_timeout));
 
 /* tear down remaining sessions */
 mutex_lock(&mdsc->mutex);
 for (i = 0; i < mdsc->max_sessions; i++) {
 if (mdsc->sessions[i]) {
- session = get_session(mdsc->sessions[i]);
+ session = ceph_get_mds_session(mdsc->sessions[i]);
 __unregister_session(mdsc, session);
 mutex_unlock(&mdsc->mutex);
 mutex_lock(&session->s_mutex);
@@ -3260,44 +6006,179 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 WARN_ON(!list_empty(&mdsc->cap_delay_list));
 mutex_unlock(&mdsc->mutex);
 
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_snapid_map(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
 
+ cancel_work_sync(&mdsc->cap_reclaim_work);
+ cancel_work_sync(&mdsc->cap_unlink_work);
 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
 
- dout("stopped\n");
+ doutc(cl, "done\n");
+}
+
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ doutc(mdsc->fsc->client, "force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+
+ if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+ __unregister_session(mdsc, session);
+ __wake_requests(mdsc, &session->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
 }
 
 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
- dout("stop\n");
- cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ doutc(mdsc->fsc->client, "stop\n");
+ /*
+ * Make sure the delayed work stopped before releasing
+ * the resources.
+ *
+ * cancel_delayed_work_sync() will only guarantee that
+ * the work finishes executing, but the delayed work
+ * may re-arm itself again after that.
+ */ + flush_delayed_work(&mdsc->delayed_work); + if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + + if (mdsc->s_cap_auths) { + int i; + + for (i = 0; i < mdsc->s_cap_auths_num; i++) { + kfree(mdsc->s_cap_auths[i].match.gids); + kfree(mdsc->s_cap_auths[i].match.path); + kfree(mdsc->s_cap_auths[i].match.fs_name); + } + kfree(mdsc->s_cap_auths); + } + + ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc = fsc->mdsc; + doutc(fsc->client, "%p\n", mdsc); - dout("mdsc_destroy %p\n", mdsc); - ceph_mdsc_stop(mdsc); + if (!mdsc) + return; /* flush out any connection work with references to us */ ceph_msgr_flush(); + ceph_mdsc_stop(mdsc); + + ceph_metric_destroy(&mdsc->metric); + fsc->mdsc = NULL; kfree(mdsc); - dout("mdsc_destroy %p done\n", mdsc); + doutc(fsc->client, "%p done\n", mdsc); } +void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + struct ceph_client *cl = fsc->client; + const char *mds_namespace = fsc->mount_options->mds_namespace; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + u32 epoch; + u32 num_fs; + u32 mount_fscid = (u32)-1; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(u32), bad); + epoch = ceph_decode_32(&p); + + doutc(cl, "epoch %u\n", epoch); + + /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); + + ceph_decode_32_safe(&p, end, num_fs, bad); + while (num_fs-- > 0) { + void *info_p, *info_end; + u32 info_len; + u32 fscid, namelen; + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + p += 2; // info_v, info_cv + info_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, info_len, bad); + info_p = p; + info_end = p + info_len; + p = info_end; + + ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); + fscid = ceph_decode_32(&info_p); + namelen = ceph_decode_32(&info_p); + ceph_decode_need(&info_p, info_end, namelen, bad); + + if (mds_namespace && + strlen(mds_namespace) == namelen && + !strncmp(mds_namespace, (char *)info_p, namelen)) { + mount_fscid = fscid; + break; + } + } + + ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); + if (mount_fscid != (u32)-1) { + fsc->client->monc.fs_cluster_id = mount_fscid; + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + ceph_monc_renew_subs(&fsc->client->monc); + } else { + err = -ENOENT; + goto err_out; + } + return; + +bad: + pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n", + err); + ceph_umount_begin(mdsc->fsc->sb); + ceph_msg_dump(msg); +err_out: + mutex_lock(&mdsc->mutex); + mdsc->mdsmap_err = err; + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); +} /* * handle mds map update. */ -void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; u32 epoch; u32 maplen; void *p = msg->front.iov_base; @@ -3312,19 +6193,17 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); - dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + doutc(cl, "epoch %u len %d\n", epoch, (int)maplen); /* do we need it? 
*/ - ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { - dout("handle_map epoch %u <= our %u\n", - epoch, mdsc->mdsmap->m_epoch); + doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); return; } - newmap = ceph_mdsmap_decode(&p, end); + newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -3339,38 +6218,40 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size, + MAX_LFS_FILESIZE); __wake_requests(mdsc, &mdsc->waiting_for_map); + ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, + mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); - schedule_delayed(mdsc); + schedule_delayed(mdsc, 0); return; bad_unlock: mutex_unlock(&mdsc->mutex); bad: - pr_err("error decoding mdsmap %d\n", err); + pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n", + err); + ceph_umount_begin(mdsc->fsc->sb); + ceph_msg_dump(msg); return; } -static struct ceph_connection *con_get(struct ceph_connection *con) +static struct ceph_connection *mds_get_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - if (get_session(s)) { - dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + if (ceph_get_mds_session(s)) return con; - } - dout("mdsc con_get %p FAIL\n", s); return NULL; } -static void con_put(struct ceph_connection *con) +static void mds_put_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); ceph_put_mds_session(s); } @@ -3378,19 +6259,23 @@ static void con_put(struct ceph_connection *con) * if the client is unresponsive for long enough, the mds will kill * the session entirely. 
*/ -static void peer_reset(struct ceph_connection *con) +static void mds_peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warning("mds%d closed our session\n", s->s_mds); - send_mds_reconnect(mdsc, s); + pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", + s->s_mds); + if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && + ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) + send_mds_reconnect(mdsc, s); } -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; int type = le16_to_cpu(msg->hdr.type); mutex_lock(&mdsc->mutex); @@ -3402,7 +6287,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(mdsc, msg); + ceph_mdsc_handle_mdsmap(mdsc, msg); + break; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(mdsc, msg); break; case CEPH_MSG_CLIENT_SESSION: handle_session(s, msg); @@ -3422,10 +6310,13 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_CLIENT_LEASE: handle_lease(mdsc, s, msg); break; + case CEPH_MSG_CLIENT_QUOTA: + ceph_handle_quota(mdsc, s, msg); + break; default: - pr_err("received unknown message type %d %s\n", type, - ceph_msg_type_name(type)); + pr_err_client(cl, "received unknown message type %d %s\n", + type, ceph_msg_type_name(type)); } out: ceph_msg_put(msg); @@ -3439,45 +6330,47 @@ out: * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
*/ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } +static int mds_add_authorizer_challenge(struct ceph_connection *con, + void *challenge_buf, int challenge_buf_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; -static int verify_authorizer_reply(struct ceph_connection *con, int len) + return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, + challenge_buf, challenge_buf_len); +} + +static int mds_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int mds_invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -3488,6 +6381,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static int mds_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int 
*con_secret_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int mds_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { @@ -3509,15 +6476,38 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, return msg; } +static int mds_sign_message(struct ceph_msg *msg) +{ + struct ceph_mds_session *s = msg->con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_sign_message(auth, msg); +} + +static int mds_check_message_signature(struct ceph_msg *msg) +{ + struct ceph_mds_session *s = msg->con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_check_message_signature(auth, msg); +} + static const struct ceph_connection_operations mds_con_ops = { - .get = con_get, - .put = con_put, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .peer_reset = peer_reset, + .get = mds_get_con, + .put = mds_put_con, .alloc_msg = mds_alloc_msg, + .dispatch = mds_dispatch, + .peer_reset = mds_peer_reset, + .get_authorizer = mds_get_authorizer, + .add_authorizer_challenge = mds_add_authorizer_challenge, + .verify_authorizer_reply = mds_verify_authorizer_reply, + .invalidate_authorizer = mds_invalidate_authorizer, + .sign_message = mds_sign_message, + .check_message_signature = mds_check_message_signature, + .get_auth_request = mds_get_auth_request, + .handle_auth_reply_more = mds_handle_auth_reply_more, + .handle_auth_done = mds_handle_auth_done, + .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index c2a19fbbe517..0428a5eaf28c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MDS_CLIENT_H #define _FS_CEPH_MDS_CLIENT_H @@ -7,12 +8,54 @@ #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/spinlock.h> +#include <linux/refcount.h> +#include <linux/utsname.h> +#include <linux/ktime.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> -#include <linux/ceph/mdsmap.h> #include <linux/ceph/auth.h> +#include "mdsmap.h" +#include "metric.h" +#include "super.h" + +/* The first 8 bits are reserved for old ceph releases */ +enum ceph_feature_type { + CEPHFS_FEATURE_MIMIC = 8, + CEPHFS_FEATURE_REPLY_ENCODING, + CEPHFS_FEATURE_RECLAIM_CLIENT, + CEPHFS_FEATURE_LAZY_CAP_WANTED, + CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_METRIC_COLLECT, + CEPHFS_FEATURE_ALTERNATE_NAME, + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_32BITS_RETRY_FWD, + 
CEPHFS_FEATURE_NEW_SNAPREALM_INFO, + CEPHFS_FEATURE_HAS_OWNER_UIDGID, + CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, + + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, +}; + +#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ + CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ + CEPHFS_FEATURE_DELEG_INO, \ + CEPHFS_FEATURE_METRIC_COLLECT, \ + CEPHFS_FEATURE_ALTERNATE_NAME, \ + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ + CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ +} + /* * Some lock dependencies: * @@ -30,6 +73,24 @@ struct ceph_fs_client; struct ceph_cap; +#define MDS_AUTH_UID_ANY -1 + +struct ceph_mds_cap_match { + s64 uid; /* default to MDS_AUTH_UID_ANY */ + u32 num_gids; + u32 *gids; /* use these GIDs */ + char *path; /* require path to be child of this + (may be "" or "/" for any) */ + char *fs_name; + bool root_squash; /* default to false */ +}; + +struct ceph_mds_cap_auth { + struct ceph_mds_cap_match match; + bool readable; + bool writeable; +}; + /* * parsed info about a single inode. pointers are into the encoded * on-wire structures within the mds reply message payload. @@ -41,6 +102,37 @@ struct ceph_mds_reply_info_in { char *symlink; u32 xattr_len; char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; + u32 pool_ns_len; + char *pool_ns_data; + u64 max_bytes; + u64 max_files; + s32 dir_pin; + struct ceph_timespec btime; + struct ceph_timespec snap_btime; + u8 *fscrypt_auth; + u8 *fscrypt_file; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u64 rsnaps; + u64 change_attr; +}; + +struct ceph_mds_reply_dir_entry { + bool is_nokey; + char *name; + u32 name_len; + u32 raw_hash; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + +struct ceph_mds_reply_xattr { + char *xattr_value; + size_t xattr_value_len; }; /* @@ -56,8 +148,11 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; + u8 *altname; u32 dname_len; + u32 altname_len; struct ceph_mds_reply_lease *dlease; + struct ceph_mds_reply_xattr xattr_info; /* extra */ union { @@ -67,12 +162,13 @@ struct ceph_mds_reply_info_parsed { /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + bool dir_end; + bool dir_complete; + bool hash_order; + bool offset_hash; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ @@ -91,10 +187,13 @@ struct ceph_mds_reply_info_parsed { /* * cap releases are batched and sent to the MDS en masse. + * + * Account for per-message overhead of mds_cap_release header + * and __le32 for osd epoch barrier trailing field. 
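+ * E.g. with 4 KiB pages and the packed wire structs from ceph_fs.h
+ * (4-byte ceph_mds_cap_release, 24-byte ceph_mds_cap_item) this
+ * works out to (4096 - 4 - 4) / 24 = 170 cap items per message.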
*/ -#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ - sizeof(struct ceph_mds_cap_item)) + sizeof(struct ceph_mds_cap_item)) /* @@ -105,9 +204,11 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, + CEPH_MDS_SESSION_CLOSED = 8, + CEPH_MDS_SESSION_REJECTED = 9, }; struct ceph_mds_session { @@ -115,6 +216,7 @@ struct ceph_mds_session { int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ + unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ @@ -122,29 +224,33 @@ struct ceph_mds_session { struct ceph_auth_handshake s_auth; - /* protected by s_gen_ttl_lock */ - spinlock_t s_gen_ttl_lock; - u32 s_cap_gen; /* inc each time we get mds stale msg */ - unsigned long s_cap_ttl; /* when session caps expire */ + atomic_t s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ /* protected by s_cap_lock */ spinlock_t s_cap_lock; + refcount_t s_ref; struct list_head s_caps; /* all caps issued by this session */ - int s_nr_caps, s_trim_caps; + struct ceph_cap *s_cap_iterator; + int s_nr_caps; int s_num_cap_releases; + int s_cap_reconnect; + int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct list_head s_cap_releases_done; /* ready to send */ - struct ceph_cap *s_cap_iterator; + struct work_struct s_cap_release_work; + + /* See ceph_inode_info->i_dirty_item. */ + struct list_head s_cap_dirty; /* inodes w/ dirty caps */ - /* protected by mutex */ + /* See ceph_inode_info->i_flushing_item. */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ - struct list_head s_cap_snaps_flushing; + unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - atomic_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ + struct xarray s_delegated_inos; }; /* @@ -164,6 +270,11 @@ struct ceph_mds_client; */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); /* * an in-flight mds request @@ -173,6 +284,7 @@ struct ceph_mds_request { struct rb_node r_node; struct ceph_mds_client *r_mdsc; + struct kref r_kref; int r_op; /* mds op code */ /* operation on what? 
*/ @@ -183,25 +295,45 @@ struct ceph_mds_request { char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; - struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ + struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ + struct inode *r_new_inode; /* new inode (for creates) */ + + const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ + +#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ +#define CEPH_MDS_R_ABORTED (2) /* call was aborted */ +#define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ +#define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ +#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ +#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ +#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ +#define CEPH_MDS_R_ASYNC (8) /* async request */ +#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ + unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; + + struct ceph_fscrypt_auth *r_fscrypt_auth; + u64 r_fscrypt_file; + + u8 *r_altname; /* fscrypt binary crypttext for long filenames */ + u32 r_altname_len; /* length of r_altname */ + int r_fmode; /* file mode, if expecting cap */ - kuid_t r_uid; - kgid_t r_gid; + int r_request_release_offset; + const struct cred *r_cred; + struct mnt_idmap *r_mnt_idmap; + struct timespec64 r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ - bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ - struct page **r_pages; - int r_num_pages; - int r_data_len; + struct ceph_pagelist *r_pagelist; /* what caps shall we drop? */ int r_inode_drop, r_inode_unless; @@ -211,14 +343,19 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ - int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - bool r_aborted; + u32 r_readdir_offset; + + struct page *r_locked_page; + int r_dir_caps; + int r_num_caps; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. 
jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_start_latency; /* start time to measure latency */ + unsigned long r_end_latency; /* finish time to measure latency */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ @@ -226,26 +363,77 @@ struct ceph_mds_request { struct inode *r_unsafe_dir; struct list_head r_unsafe_dir_item; + /* unsafe requests that modify the target inode */ + struct list_head r_unsafe_target_item; + struct ceph_mds_session *r_session; int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + u64 r_deleg_ino; - struct kref r_kref; struct list_head r_wait; struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe, r_got_result; - bool r_did_prepopulate; - u32 r_readdir_offset; + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; + + int r_feature_needed; struct ceph_cap_reservation r_caps_reservation; - int r_num_caps; +}; + +struct ceph_pool_perm { + struct rb_node node; + int perm; + s64 pool; + size_t pool_ns_len; + char pool_ns[]; +}; + +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + dev_t dev; + u64 snap; + unsigned long last_used; +}; + +/* + * node for list of quotarealm inodes that are not visible from the filesystem + * mountpoint, but required to handle, e.g. quotas. + */ +struct ceph_quotarealm_inode { + struct rb_node node; + u64 ino; + unsigned long timeout; /* last time a lookup failed for this inode */ + struct mutex mutex; + struct inode *inode; +}; + +#ifdef CONFIG_DEBUG_FS + +struct cap_wait { + struct list_head list; + u64 ino; + pid_t tgid; + int need; + int want; +}; + +#endif + +enum { + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -259,10 +447,27 @@ struct ceph_mds_client { struct completion safe_umount_waiters; wait_queue_head_t session_close_wq; struct list_head waiting_for_map; + int mdsmap_err; struct ceph_mds_session **sessions; /* NULL for mds if no session */ - int max_sessions; /* len of s_mds_sessions */ - int stopping; /* true if shutting down */ + atomic_t num_sessions; + int max_sessions; /* len of sessions array */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; + + atomic64_t dirty_folios; + wait_queue_head_t flush_end_wq; + + atomic64_t quotarealms_count; /* # realms with quota */ + /* + * We keep a list of inodes we don't see in the mountpoint but that we + * need to track quota realms. + */ + struct rb_root quotarealms_inodes; + struct mutex quotarealms_inodes_mutex; /* * snap_rwsem will cover cap linkage into snaprealms, and @@ -271,27 +476,37 @@ struct ceph_mds_client { * references (implying they contain no inodes with caps) that * should be destroyed. 
 */
+ u64 last_snap_seq;
 struct rw_semaphore snap_rwsem;
 struct rb_root snap_realms;
 struct list_head snap_empty;
+ int num_snap_realms;
 spinlock_t snap_empty_lock; /* protect snap_empty */
 
 u64 last_tid; /* most recent mds request */
+ u64 oldest_tid; /* oldest incomplete mds request,
+ excluding setfilelock requests */
 struct rb_root request_tree; /* pending mds requests */
 struct delayed_work delayed_work; /* delayed work */
 unsigned long last_renew_caps; /* last time we renewed our caps */
 struct list_head cap_delay_list; /* caps with delayed release */
- spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */
 struct list_head snap_flush_list; /* cap_snaps ready to flush */
 spinlock_t snap_flush_lock;
 
- u64 cap_flush_seq;
- struct list_head cap_dirty; /* inodes with dirty caps */
+ u64 last_cap_flush_tid;
+ struct list_head cap_flush_list;
 struct list_head cap_dirty_migrating; /* ...that are migrating... */
 int num_cap_flushing; /* # caps we are flushing */
 spinlock_t cap_dirty_lock; /* protects above items */
 wait_queue_head_t cap_flushing_wq;
 
+ struct work_struct cap_reclaim_work;
+ atomic_t cap_reclaim_pending;
+
+ struct work_struct cap_unlink_work;
+
 /*
 * Cap reservations
 *
@@ -306,53 +521,72 @@ struct ceph_mds_client {
 spinlock_t caps_list_lock;
 struct list_head caps_list; /* unused (reserved or
 unreserved) */
+#ifdef CONFIG_DEBUG_FS
+ struct list_head cap_wait_list;
+#endif
 int caps_total_count; /* total caps allocated */
 int caps_use_count; /* in use */
+ int caps_use_max; /* max used caps */
 int caps_reserve_count; /* unused, reserved */
 int caps_avail_count; /* unused, unreserved */
 int caps_min_count; /* keep at least this many
 (unreserved) */
- spinlock_t dentry_lru_lock;
- struct list_head dentry_lru;
- int num_dentry;
+ spinlock_t dentry_list_lock;
+ struct list_head dentry_leases; /* fifo list */
+ struct list_head dentry_dir_leases; /* lru list */
+
+ struct ceph_client_metric metric;
+
+ spinlock_t snapid_map_lock;
+ struct rb_root snapid_map_tree;
+ struct list_head snapid_map_lru;
+
+ struct rw_semaphore pool_perm_rwsem;
+ struct rb_root pool_perm_tree;
+
+ u32 s_cap_auths_num;
+ struct ceph_mds_cap_auth *s_cap_auths;
+
+ char nodename[__NEW_UTS_LEN + 1];
 };
 
 extern const char *ceph_mds_op_name(int op);
 
+extern bool check_session_state(struct ceph_mds_session *s);
+void inc_session_sequence(struct ceph_mds_session *s);
+
 extern struct ceph_mds_session *
 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
 
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- atomic_inc(&s->s_ref);
- return s;
-}
+extern const char *ceph_session_state_name(int s);
+extern struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s);
 
 extern void ceph_put_mds_session(struct ceph_mds_session *s);
 
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
 
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
 extern void ceph_invalidate_dir_request(struct 
ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); -extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req); +extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, + struct inode *dir, + struct ceph_mds_request *req); +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); @@ -363,26 +597,74 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); -extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); +extern void send_flush_mdlog(struct ceph_mds_session *s); +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state); +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); +extern void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap); +extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); +extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); +extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); +extern int ceph_iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, int mds, void *), + void *arg); +extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, + int mask); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap); +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) +{ + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); +} + +extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, + struct dentry *dentry, struct ceph_path_info *path_info, + int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, - struct inode *inode, struct dentry *dentry, char action, u32 seq); -extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, - struct ceph_msg *msg); +extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); +extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); + +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); + 
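For orientation, the request helpers declared in this header follow a create / fill-in / do_request / put lifecycle. Below is a minimal sketch of a caller, loosely modeled on the getattr path in fs/ceph/inode.c; the function name is hypothetical and error handling is trimmed:

/* Hypothetical caller, shown only to illustrate the request lifecycle. */
static int example_getattr(struct ceph_mds_client *mdsc, struct inode *inode)
{
	struct ceph_mds_request *req;
	int err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->r_inode = inode;
	ihold(inode);			/* the request holds an inode reference */
	req->r_num_caps = 1;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);

	err = ceph_mdsc_do_request(mdsc, NULL, req);	/* submit and wait */
	ceph_mdsc_put_request(req);	/* drop the caller's kref */
	return err;
}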
+extern int ceph_trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps); + +static inline int ceph_wait_on_async_create(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, + TASK_KILLABLE); +} -extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); +extern bool enable_unsafe_idmap; #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 132b64eeecd4..2c7b151a7c95 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/bug.h> @@ -6,64 +7,139 @@ #include <linux/slab.h> #include <linux/types.h> -#include <linux/ceph/mdsmap.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> +#include "mdsmap.h" +#include "mds_client.h" #include "super.h" +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) -/* - * choose a random mds that is "up" (i.e. has a state > 0), or -1. - */ -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) { int n = 0; - int i; - - /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) - return 0; + int i, j; /* count */ - for (i = 0; i < m->m_max_mds; i++) - if (m->m_info[i].state > 0) + for (i = 0; i < m->possible_max_rank; i++) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) n++; if (n == 0) return -1; /* pick */ - n = prandom_u32() % n; - i = 0; - for (i = 0; n > 0; i++, n--) - while (m->m_info[i].state <= 0) - i++; + n = get_random_u32_below(n); + for (j = 0, i = 0; i < m->possible_max_rank; i++) { + if (CEPH_MDS_IS_READY(i, ignore_laggy)) + j++; + if (j > n) + break; + } return i; } /* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int mds; + + mds = __mdsmap_get_random_mds(m, false); + if (mds == m->possible_max_rank || mds == -1) + mds = __mdsmap_get_random_mds(m, true); + + return mds == m->possible_max_rank ? 
-1 : mds;
+}
+
+#define __decode_and_drop_type(p, end, type, bad) \
+ do { \
+ if (*p + sizeof(type) > end) \
+ goto bad; \
+ *p += sizeof(type); \
+ } while (0)
+
+#define __decode_and_drop_set(p, end, type, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = sizeof(type) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+#define __decode_and_drop_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 n; \
+ size_t need; \
+ ceph_decode_32_safe(p, end, n, bad); \
+ need = (sizeof(ktype) + sizeof(vtype)) * n; \
+ ceph_decode_need(p, end, need, bad); \
+ *p += need; \
+ } while (0)
+
+
+static int __decode_and_drop_compat_set(void **p, void *end)
+{
+ int i;
+ /* compat, ro_compat, incompat */
+ for (i = 0; i < 3; i++) {
+ u32 n;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ /* mask */
+ *p += sizeof(u64);
+ /* names (map<u64, string>) */
+ n = ceph_decode_32(p);
+ while (n-- > 0) {
+ u32 len;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32),
+ bad);
+ *p += sizeof(u64);
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, bad);
+ *p += len;
+ }
+ }
+ return 0;
+bad:
+ return -1;
+}
+
+/*
 * Decode an MDS map
 *
 * Ignore any fields we don't care about (there are quite a few of
 * them).
 */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2)
 {
+ struct ceph_client *cl = mdsc->fsc->client;
 struct ceph_mdsmap *m;
 const void *start = *p;
 int i, j, n;
- int err = -EINVAL;
- u16 version;
+ int err;
+ u8 mdsmap_v;
+ u16 mdsmap_ev;
+ u32 target;
 
 m = kzalloc(sizeof(*m), GFP_NOFS);
- if (m == NULL)
+ if (!m)
 return ERR_PTR(-ENOMEM);
 
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 3) {
- pr_warning("got mdsmap version %d > 3, failing", version);
- goto bad;
+ ceph_decode_need(p, end, 1 + 1, bad);
+ mdsmap_v = ceph_decode_8(p);
+ *p += sizeof(u8); /* mdsmap_cv */
+ if (mdsmap_v >= 4) {
+ u32 mdsmap_len;
+ ceph_decode_32_safe(p, end, mdsmap_len, bad);
+ if (end < *p + mdsmap_len)
+ goto bad;
+ end = *p + mdsmap_len;
 }
 
 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -76,46 +152,77 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 m->m_max_file_size = ceph_decode_64(p);
 m->m_max_mds = ceph_decode_32(p);
 
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
- if (m->m_info == NULL)
- goto badmem;
+ /*
+ * pick out the active nodes as the m_num_active_mds. The
+ * m_num_active_mds may be larger than m_max_mds when decreasing
+ * max_mds on the cluster side; otherwise it should be less
+ * than or equal to m_max_mds.
+ */
+ m->m_num_active_mds = n = ceph_decode_32(p);
+
+ /*
+ * the possible max rank: it may be larger than m_num_active_mds.
+ * For example, if max_mds == 2 in the cluster, when MDS(0) was
+ * laggy and is being replaced by a new MDS, we will temporarily
+ * receive a new mds map with m_num_active_mds == 1 and the active
+ * MDS(1), so the mds rank can be >= m_num_active_mds.
+ */
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
+ if (!m->m_info)
+ goto nomem;
 
 /* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
 for (i = 0; i < n; i++) {
 u64 global_id;
 u32 namelen;
 s32 mds, inc, state;
- u64 state_seq;
- u8 infoversion;
+ u8 info_v;
+ void *info_end = NULL;
 struct ceph_entity_addr addr;
 u32 num_export_targets;
 void *pexport_targets = NULL;
 struct ceph_timespec laggy_since;
 struct ceph_mds_info *info;
+ bool laggy;
 
- ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ ceph_decode_need(p, end, sizeof(u64) + 1, bad);
 global_id = ceph_decode_64(p);
- infoversion = ceph_decode_8(p);
+ info_v = ceph_decode_8(p);
+ if (info_v >= 4) {
+ u32 info_len;
+ ceph_decode_need(p, end, 1 + sizeof(u32), bad);
+ *p += sizeof(u8); /* info_cv */
+ info_len = ceph_decode_32(p);
+ info_end = *p + info_len;
+ if (info_end > end)
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
 *p += sizeof(u64);
 namelen = ceph_decode_32(p); /* skip mds name */
 *p += namelen;
 
- ceph_decode_need(p, end,
- 4*sizeof(u32) + sizeof(u64) +
- sizeof(addr) + sizeof(struct ceph_timespec),
- bad);
- mds = ceph_decode_32(p);
- inc = ceph_decode_32(p);
- state = ceph_decode_32(p);
- state_seq = ceph_decode_64(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
- ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+ ceph_decode_32_safe(p, end, mds, bad);
+ ceph_decode_32_safe(p, end, inc, bad);
+ ceph_decode_32_safe(p, end, state, bad);
+ *p += sizeof(u64); /* state_seq */
+ if (info_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ err = ceph_decode_entity_addr(p, end, &addr);
+ if (err)
+ goto corrupt;
+
+ ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
+ bad);
+ laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
 *p += sizeof(u32);
 ceph_decode_32_safe(p, end, namelen, bad);
 *p += namelen;
- if (infoversion >= 2) {
+ if (info_v >= 2) {
 ceph_decode_32_safe(p, end, num_export_targets, bad);
 pexport_targets = *p;
 *p += num_export_targets * sizeof(u32);
@@ -123,29 +230,42 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 num_export_targets = 0;
 }
 
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr.in_addr),
- ceph_mds_state_name(state));
+ if (info_end && *p != info_end) {
+ if (*p > info_end)
+ goto bad;
+ *p = info_end;
+ }
+
+ doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
+ mds, inc, ceph_pr_addr(&addr),
+ ceph_mds_state_name(state), laggy ? 
"(laggy)" : ""); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || mds >= m->possible_max_rank) { + pr_warn_client(cl, "got incorrect mds(%d)\n", mds); continue; + } + + if (state <= 0) { + doutc(cl, "got incorrect state(%s)\n", + ceph_mds_state_name(state)); + continue; + } info = &m->m_info[mds]; info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); - if (info->export_targets == NULL) - goto badmem; - for (j = 0; j < num_export_targets; j++) - info->export_targets[j] = - ceph_decode_32(&pexport_targets); + if (!info->export_targets) + goto nomem; + for (j = 0; j < num_export_targets; j++) { + target = ceph_decode_32(&pexport_targets); + info->export_targets[j] = target; + } } else { info->export_targets = NULL; } @@ -156,34 +276,175 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_data_pg_pools = n; m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) - goto badmem; + goto nomem; ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) m->m_data_pg_pools[i] = ceph_decode_64(p); m->m_cas_pg_pool = ceph_decode_64(p); + m->m_enabled = m->m_epoch > 1; - /* ok, we don't care about the rest. */ - dout("mdsmap_decode success epoch %u\n", m->m_epoch); - return m; + mdsmap_ev = 1; + if (mdsmap_v >= 2) { + ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); + } + if (mdsmap_ev >= 3) { + if (__decode_and_drop_compat_set(p, end) < 0) + goto bad_ext; + } + /* metadata_pool */ + if (mdsmap_ev < 5) { + __decode_and_drop_type(p, end, u32, bad_ext); + } else { + __decode_and_drop_type(p, end, u64, bad_ext); + } + + /* created + modified + tableserver */ + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, u32, bad_ext); + + /* in */ + { + int num_laggy = 0; + ceph_decode_32_safe(p, end, n, bad_ext); + ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); + + for (i = 0; i < n; i++) { + s32 mds = ceph_decode_32(p); + if (mds >= 0 && mds < m->possible_max_rank) { + if (m->m_info[mds].laggy) + num_laggy++; + } + } + m->m_num_laggy = num_laggy; + + if (n > m->possible_max_rank) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->possible_max_rank = n; + } + + /* inc */ + __decode_and_drop_map(p, end, u32, u32, bad_ext); + /* up */ + __decode_and_drop_map(p, end, u32, u64, bad_ext); + /* failed */ + __decode_and_drop_set(p, end, u32, bad_ext); + /* stopped */ + __decode_and_drop_set(p, end, u32, bad_ext); -badmem: + if (mdsmap_ev >= 4) { + /* last_failure_osd_epoch */ + __decode_and_drop_type(p, end, u32, bad_ext); + } + if (mdsmap_ev >= 6) { + /* ever_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + /* explicitly_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 7) { + /* inline_data_enabled */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 8) { + u32 fsname_len; + /* enabled */ + ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); + /* fs_name */ + ceph_decode_32_safe(p, end, fsname_len, bad_ext); + + /* validate fsname against mds_namespace */ + if 
(!namespace_equals(mdsc->fsc->mount_options, *p,
+ fsname_len)) {
+ pr_warn_client(cl, "fsname %*pE doesn't match mds_namespace %s\n",
+ (int)fsname_len, (char *)*p,
+ mdsc->fsc->mount_options->mds_namespace);
+ goto bad;
+ }
+ /* skip fsname after validation */
+ ceph_decode_skip_n(p, end, fsname_len, bad);
+ }
+ /* damaged */
+ if (mdsmap_ev >= 9) {
+ size_t need;
+ ceph_decode_32_safe(p, end, n, bad_ext);
+ need = sizeof(u32) * n;
+ ceph_decode_need(p, end, need, bad_ext);
+ *p += need;
+ m->m_damaged = n > 0;
+ } else {
+ m->m_damaged = false;
+ }
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ /* bal_rank_mask */
+ ceph_decode_skip_string(p, end, bad_ext);
+ }
+ if (mdsmap_ev >= 18) {
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ }
+bad_ext:
+ doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
+ *p = end;
+ doutc(cl, "success epoch %u\n", m->m_epoch);
+ return m;
+nomem:
 err = -ENOMEM;
-bad:
- pr_err("corrupt mdsmap\n");
+ goto out_err;
+corrupt:
+ pr_err_client(cl, "corrupt mdsmap\n");
 print_hex_dump(KERN_DEBUG, "mdsmap: ",
 DUMP_PREFIX_OFFSET, 16, 1,
 start, end - start, true);
+out_err:
 ceph_mdsmap_destroy(m);
 return ERR_PTR(err);
+bad:
+ err = -EINVAL;
+ goto corrupt;
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
 {
 int i;
 
- for (i = 0; i < m->m_max_mds; i++)
- kfree(m->m_info[i].export_targets);
- kfree(m->m_info);
+ if (m->m_info) {
+ for (i = 0; i < m->possible_max_rank; i++)
+ kfree(m->m_info[i].export_targets);
+ kfree(m->m_info);
+ }
 kfree(m->m_data_pg_pools);
 kfree(m);
 }
+
+bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
+{
+ int i, nr_active = 0;
+
+ if (!m->m_enabled)
+ return false;
+ if (m->m_damaged)
+ return false;
+ if (m->m_num_laggy == m->m_num_active_mds)
+ return false;
+ for (i = 0; i < m->possible_max_rank; i++) {
+ if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
+ nr_active++;
+ }
+ return nr_active > 0;
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..1f2171dd01bf
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually cares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ /*
+ * maximum size for xattrs blob.
+ * Zeroed by default to force the usage of the (sync) SETXATTR Op.
+ */ + u64 m_max_xattr_size; + u32 m_max_mds; /* expected up:active mds number */ + u32 m_num_active_mds; /* actual up:active mds number */ + u32 possible_max_rank; /* possible max rank index */ + struct ceph_mds_info *m_info; + + /* which object pools file data can be stored in */ + int m_num_data_pg_pools; + u64 *m_data_pg_pools; + u64 m_cas_pg_pool; + + bool m_enabled; + bool m_damaged; + int m_num_laggy; +}; + +static inline struct ceph_entity_addr * +ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) +{ + if (w >= m->possible_max_rank) + return NULL; + return &m->m_info[w].addr; +} + +static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) +{ + BUG_ON(w < 0); + if (w >= m->possible_max_rank) + return CEPH_MDS_STATE_DNE; + return m->m_info[w].state; +} + +static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) +{ + if (w >= 0 && w < m->possible_max_rank) + return m->m_info[w].laggy; + return false; +} + +extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); +struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, + void *end, bool msgr2); +extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); +extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m); + +#endif diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c new file mode 100644 index 000000000000..871c1090e520 --- /dev/null +++ b/fs/ceph/metric.c @@ -0,0 +1,362 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/percpu_counter.h> +#include <linux/math64.h> + +#include "metric.h" +#include "mds_client.h" + +static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) +{ + struct timespec64 t = ktime_to_timespec64(val); + ceph_encode_timespec64(ts, &t); +} + +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + struct ceph_metric_head *head; + struct ceph_metric_cap *cap; + struct ceph_metric_read_latency *read; + struct ceph_metric_write_latency *write; + struct ceph_metric_metadata_latency *meta; + struct ceph_metric_dlease *dlease; + struct ceph_opened_files *files; + struct ceph_pinned_icaps *icaps; + struct ceph_opened_inodes *inodes; + struct ceph_read_io_size *rsize; + struct ceph_write_io_size *wsize; + struct ceph_client_metric *m = &mdsc->metric; + u64 nr_caps = atomic64_read(&m->total_caps); + u32 header_len = sizeof(struct ceph_metric_header); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_msg *msg; + s64 sum; + s32 items = 0; + s32 len; + + /* Do not send the metrics until the MDS rank is ready */ + mutex_lock(&mdsc->mutex); + if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) != CEPH_MDS_STATE_ACTIVE) { + mutex_unlock(&mdsc->mutex); + return false; + } + mutex_unlock(&mdsc->mutex); + + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + + sizeof(*wsize); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); + if (!msg) { + pr_err_client(cl, "to mds%d, failed to allocate message\n", + s->s_mds); + return false; + } + + head = msg->front.iov_base; + + /* encode the cap metric */ + cap = (struct ceph_metric_cap *)(head + 1); + cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->header.ver = 1; + cap->header.compat = 1; + cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len); + cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); + cap->mis = 
cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); + cap->total = cpu_to_le64(nr_caps); + items++; + + /* encode the read latency metric */ + read = (struct ceph_metric_read_latency *)(cap + 1); + read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->header.ver = 2; + read->header.compat = 1; + read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); + sum = m->metric[METRIC_READ].latency_sum; + ktime_to_ceph_timespec(&read->lat, sum); + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); + read->count = cpu_to_le64(m->metric[METRIC_READ].total); + items++; + + /* encode the write latency metric */ + write = (struct ceph_metric_write_latency *)(read + 1); + write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->header.ver = 2; + write->header.compat = 1; + write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); + sum = m->metric[METRIC_WRITE].latency_sum; + ktime_to_ceph_timespec(&write->lat, sum); + ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); + items++; + + /* encode the metadata latency metric */ + meta = (struct ceph_metric_metadata_latency *)(write + 1); + meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->header.ver = 2; + meta->header.compat = 1; + meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); + sum = m->metric[METRIC_METADATA].latency_sum; + ktime_to_ceph_timespec(&meta->lat, sum); + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); + items++; + + /* encode the dentry lease metric */ + dlease = (struct ceph_metric_dlease *)(meta + 1); + dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); + dlease->header.ver = 1; + dlease->header.compat = 1; + dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len); + dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); + dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); + dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); + items++; + + sum = percpu_counter_sum(&m->total_inodes); + + /* encode the opened files metric */ + files = (struct ceph_opened_files *)(dlease + 1); + files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->header.ver = 1; + files->header.compat = 1; + files->header.data_len = cpu_to_le32(sizeof(*files) - header_len); + files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); + files->total = cpu_to_le64(sum); + items++; + + /* encode the pinned icaps metric */ + icaps = (struct ceph_pinned_icaps *)(files + 1); + icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->header.ver = 1; + icaps->header.compat = 1; + icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len); + icaps->pinned_icaps = cpu_to_le64(nr_caps); + icaps->total = cpu_to_le64(sum); + items++; + + /* encode the opened inodes metric */ + inodes = (struct ceph_opened_inodes *)(icaps + 1); + inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->header.ver = 1; + inodes->header.compat = 1; + inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len); + inodes->opened_inodes = 
cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); + inodes->total = cpu_to_le64(sum); + items++; + + /* encode the read io size metric */ + rsize = (struct ceph_read_io_size *)(inodes + 1); + rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES); + rsize->header.ver = 1; + rsize->header.compat = 1; + rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); + rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total); + rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum); + items++; + + /* encode the write io size metric */ + wsize = (struct ceph_write_io_size *)(rsize + 1); + wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES); + wsize->header.ver = 1; + wsize->header.compat = 1; + wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); + wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total); + wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); + items++; + + put_unaligned_le32(items, &head->num); + msg->front.iov_len = len; + msg->hdr.version = cpu_to_le16(1); + msg->hdr.compat_version = cpu_to_le16(1); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_con_send(&s->s_con, msg); + + return true; +} + + +static void metric_get_session(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *s; + int i; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + + /* + * Skip it if the MDS doesn't support metric collection; + * otherwise the MDS will close the session's socket connection
 + * directly when it gets this message. + */ + if (check_session_state(s) && + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { + mdsc->metric.session = s; + break; + } + + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); +} + +static void metric_delayed_work(struct work_struct *work) +{ + struct ceph_client_metric *m = + container_of(work, struct ceph_client_metric, delayed_work.work); + struct ceph_mds_client *mdsc = + container_of(m, struct ceph_mds_client, metric); + + if (mdsc->stopping || disable_send_metrics) + return; + + if (!m->session || !check_session_state(m->session)) { + if (m->session) { + ceph_put_mds_session(m->session); + m->session = NULL; + } + metric_get_session(mdsc); + } + if (m->session) { + ceph_mdsc_send_metrics(mdsc, m->session); + metric_schedule_delayed(m); + } +} + +int ceph_metric_init(struct ceph_client_metric *m) +{ + struct ceph_metric *metric; + int ret, i; + + if (!m) + return -EINVAL; + + atomic64_set(&m->total_dentries, 0); + ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); + if (ret) + return ret; + + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); + if (ret) + goto err_d_lease_mis; + + atomic64_set(&m->total_caps, 0); + ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_hit; + + ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_mis; + + for (i = 0; i < METRIC_MAX; i++) { + metric = &m->metric[i]; + spin_lock_init(&metric->lock); + metric->size_sum = 0; + metric->size_min = U64_MAX; + metric->size_max = 0; + metric->total = 0; + metric->latency_sum = 0; + metric->latency_avg = 0; + metric->latency_sq_sum = 0; + metric->latency_min = KTIME_MAX; + metric->latency_max = 0; + } + + atomic64_set(&m->opened_files, 0); + ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); + if (ret) + goto err_opened_inodes; + ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); + if 
(ret) + goto err_total_inodes; + + m->session = NULL; + INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); + + return 0; + +err_total_inodes: + percpu_counter_destroy(&m->opened_inodes); +err_opened_inodes: + percpu_counter_destroy(&m->i_caps_mis); +err_i_caps_mis: + percpu_counter_destroy(&m->i_caps_hit); +err_i_caps_hit: + percpu_counter_destroy(&m->d_lease_mis); +err_d_lease_mis: + percpu_counter_destroy(&m->d_lease_hit); + + return ret; +} + +void ceph_metric_destroy(struct ceph_client_metric *m) +{ + if (!m) + return; + + cancel_delayed_work_sync(&m->delayed_work); + + percpu_counter_destroy(&m->total_inodes); + percpu_counter_destroy(&m->opened_inodes); + percpu_counter_destroy(&m->i_caps_mis); + percpu_counter_destroy(&m->i_caps_hit); + percpu_counter_destroy(&m->d_lease_mis); + percpu_counter_destroy(&m->d_lease_hit); + + ceph_put_mds_session(m->session); +} + +#define METRIC_UPDATE_MIN_MAX(min, max, new) \ +{ \ + if (unlikely(new < min)) \ + min = new; \ + if (unlikely(new > max)) \ + max = new; \ +} + +static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t avg; + + if (unlikely(total == 1)) { + *lavg = lat; + } else { + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = *lavg + div64_s64(lat - *lavg, total); + *sq_sump += (lat - *lavg)*(lat - avg); + *lavg = avg; + } +} + +void ceph_update_metrics(struct ceph_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; + + if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->lock); + total = ++m->total; + m->size_sum += size; + METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); + m->latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, + lat); + spin_unlock(&m->lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h new file mode 100644 index 000000000000..0d0c44bd3332 --- /dev/null +++ b/fs/ceph/metric.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDS_METRIC_H +#define _FS_CEPH_MDS_METRIC_H + +#include <linux/ceph/types.h> +#include <linux/percpu_counter.h> +#include <linux/ktime.h> + +extern bool disable_send_metrics; + +enum ceph_metric_type { + CLIENT_METRIC_TYPE_CAP_INFO, + CLIENT_METRIC_TYPE_READ_LATENCY, + CLIENT_METRIC_TYPE_WRITE_LATENCY, + CLIENT_METRIC_TYPE_METADATA_LATENCY, + CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_OPENED_FILES, + CLIENT_METRIC_TYPE_PINNED_ICAPS, + CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_READ_IO_SIZES, + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, +}; + +/* + * This will always have the highest metric bit value + * as the last element of the array. 
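+ * (CLIENT_METRIC_TYPE_MAX is defined as an alias of the last real
+ * metric type and is appended as the terminating sentinel of
+ * CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED below.)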
+ */ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ +} + +struct ceph_metric_header { + __le32 type; /* ceph metric type */ + __u8 ver; + __u8 compat; + __le32 data_len; /* length of sizeof(hit + mis + total) */ +} __packed; + +/* metric caps header */ +struct ceph_metric_cap { + struct ceph_metric_header header; + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + +/* metric read latency header */ +struct ceph_metric_read_latency { + struct ceph_metric_header header; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; +} __packed; + +/* metric write latency header */ +struct ceph_metric_write_latency { + struct ceph_metric_header header; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; +} __packed; + +/* metric metadata latency header */ +struct ceph_metric_metadata_latency { + struct ceph_metric_header header; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; +} __packed; + +/* metric dentry lease header */ +struct ceph_metric_dlease { + struct ceph_metric_header header; + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + +/* metric opened files header */ +struct ceph_opened_files { + struct ceph_metric_header header; + __le64 opened_files; + __le64 total; +} __packed; + +/* metric pinned i_caps header */ +struct ceph_pinned_icaps { + struct ceph_metric_header header; + __le64 pinned_icaps; + __le64 total; +} __packed; + +/* metric opened inodes header */ +struct ceph_opened_inodes { + struct ceph_metric_header header; + __le64 opened_inodes; + __le64 total; +} __packed; + +/* metric read io size header */ +struct ceph_read_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + +/* metric write io size header */ +struct ceph_write_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + +struct ceph_metric_head { + __le32 num; /* the number of metrics that will be sent */ +} __packed; + +enum metric_type { + METRIC_READ, + METRIC_WRITE, + METRIC_METADATA, + METRIC_COPYFROM, + METRIC_MAX +}; + +struct ceph_metric { + spinlock_t lock; + u64 total; + u64 size_sum; + u64 size_min; + u64 size_max; + ktime_t latency_sum; + ktime_t latency_avg; + ktime_t latency_sq_sum; + ktime_t latency_min; + ktime_t latency_max; +}; + +/* These are the global metrics */ +struct ceph_client_metric { + atomic64_t total_dentries; + struct percpu_counter d_lease_hit; + struct percpu_counter d_lease_mis; + + atomic64_t total_caps; + struct percpu_counter i_caps_hit; + struct percpu_counter i_caps_mis; + + struct ceph_metric metric[METRIC_MAX]; + + /* The total number of directories and files that are opened */ + atomic64_t opened_files; + + /* The total number of inodes that have opened files or directories */ + struct percpu_counter 
opened_inodes; + struct percpu_counter total_inodes; + + struct ceph_mds_session *session; + struct delayed_work delayed_work; /* delayed work */ +}; + +static inline void metric_schedule_delayed(struct ceph_client_metric *m) +{ + if (disable_send_metrics) + return; + + /* per second */ + schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ)); +} + +extern int ceph_metric_init(struct ceph_client_metric *m); +extern void ceph_metric_destroy(struct ceph_client_metric *m); + +static inline void ceph_update_cap_hit(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_hit); +} + +static inline void ceph_update_cap_mis(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_mis); +} + +extern void ceph_update_metrics(struct ceph_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc); + +static inline void ceph_update_read_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_READ], + r_start, r_end, size, rc); +} +static inline void ceph_update_write_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_WRITE], + r_start, r_end, size, rc); +} +static inline void ceph_update_metadata_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ceph_update_metrics(&m->metric[METRIC_METADATA], + r_start, r_end, 0, rc); +} +static inline void ceph_update_copyfrom_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_COPYFROM], + r_start, r_end, size, rc); +} +#endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c new file mode 100644 index 000000000000..d90eda19bcc4 --- /dev/null +++ b/fs/ceph/quota.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * quota.c - CephFS quota + * + * Copyright (C) 2017-2018 SUSE + */ + +#include <linux/statfs.h> + +#include "super.h" +#include "mds_client.h" + +void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + if (inc) + atomic64_inc(&mdsc->quotarealms_count); + else + atomic64_dec(&mdsc->quotarealms_count); +} + +static inline bool ceph_has_realms_with_quotas(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct inode *root = d_inode(sb->s_root); + + if (atomic64_read(&mdsc->quotarealms_count) > 0) + return true; + /* if root is the real CephFS root, we don't have quota realms */ + if (root && ceph_ino(root) == CEPH_INO_ROOT) + return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; + /* otherwise, we can't know for sure */ + return true; +} + +void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct ceph_mds_quota *h = msg->front.iov_base; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_vino vino; + struct inode *inode; + struct ceph_inode_info *ci; + + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + + if (msg->front.iov_len < sizeof(*h)) { + pr_err_client(cl, "corrupt message mds%d len %d\n", + session->s_mds, (int)msg->front.iov_len); + ceph_msg_dump(msg); + goto out; + } + + /* lookup inode */ + vino.ino = 
le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + pr_warn_client(cl, "failed to find inode %llx\n", vino.ino); + goto out; + } + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + ci->i_rbytes = le64_to_cpu(h->rbytes); + ci->i_rfiles = le64_to_cpu(h->rfiles); + ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); + __ceph_update_quota(ci, le64_to_cpu(h->max_bytes), + le64_to_cpu(h->max_files)); + spin_unlock(&ci->i_ceph_lock); + + iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); +} + +static struct ceph_quotarealm_inode * +find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) +{ + struct ceph_quotarealm_inode *qri = NULL; + struct rb_node **node, *parent = NULL; + struct ceph_client *cl = mdsc->fsc->client; + + mutex_lock(&mdsc->quotarealms_inodes_mutex); + node = &(mdsc->quotarealms_inodes.rb_node); + while (*node) { + parent = *node; + qri = container_of(*node, struct ceph_quotarealm_inode, node); + + if (ino < qri->ino) + node = &((*node)->rb_left); + else if (ino > qri->ino) + node = &((*node)->rb_right); + else + break; + } + if (!qri || (qri->ino != ino)) { + /* Not found, create a new one and insert it */ + qri = kmalloc(sizeof(*qri), GFP_KERNEL); + if (qri) { + qri->ino = ino; + qri->inode = NULL; + qri->timeout = 0; + mutex_init(&qri->mutex); + rb_link_node(&qri->node, parent, node); + rb_insert_color(&qri->node, &mdsc->quotarealms_inodes); + } else + pr_warn_client(cl, "Failed to alloc quotarealms_inode\n"); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); + + return qri; +} + +/* + * This function will try to lookup a realm inode which isn't visible in the + * filesystem mountpoint. A list of this kind of (not visible) inode is + * maintained in the mdsc and freed only when the filesystem is umounted. + * + * Note that these inodes are kept in this list even if the lookup fails, which + * prevents useless repeated lookup requests. + */ +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, + struct super_block *sb, + struct ceph_snap_realm *realm) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_quotarealm_inode *qri; + struct inode *in; + + qri = find_quotarealm_inode(mdsc, realm->ino); + if (!qri) + return NULL; + + mutex_lock(&qri->mutex); + if (qri->inode && ceph_is_any_caps(qri->inode)) { + /* A request has already returned the inode */ + mutex_unlock(&qri->mutex); + return qri->inode; + } + /* Check if this inode lookup has failed recently */ + if (qri->timeout && + time_before_eq(jiffies, qri->timeout)) { + mutex_unlock(&qri->mutex); + return NULL; + } + if (qri->inode) { + /* get caps */ + int ret = __ceph_do_getattr(qri->inode, NULL, + CEPH_STAT_CAP_INODE, true); + if (ret >= 0) + in = qri->inode; + else + in = ERR_PTR(ret); + } else { + in = ceph_lookup_inode(sb, realm->ino); + } + + if (IS_ERR(in)) { + doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino, + PTR_ERR(in)); + qri->timeout = jiffies + secs_to_jiffies(60); /* XXX */ + } else { + qri->timeout = 0; + qri->inode = in; + } + mutex_unlock(&qri->mutex); + + return in; +} + +void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) +{ + struct ceph_quotarealm_inode *qri; + struct rb_node *node; + + /* + * It should now be safe to clean quotarealms_inode tree without holding + * mdsc->quotarealms_inodes_mutex... 
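+ * (this runs at umount time, once no new realm lookups can be
+ * started; the mutex is still taken below for consistency)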
+ */ + mutex_lock(&mdsc->quotarealms_inodes_mutex); + while (!RB_EMPTY_ROOT(&mdsc->quotarealms_inodes)) { + node = rb_first(&mdsc->quotarealms_inodes); + qri = rb_entry(node, struct ceph_quotarealm_inode, node); + rb_erase(node, &mdsc->quotarealms_inodes); + iput(qri->inode); + kfree(qri); + } + mutex_unlock(&mdsc->quotarealms_inodes_mutex); +} + +/* + * This function walks through the snaprealm hierarchy for an inode and sets + * *realmp to the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, *realmp is set to the root ceph_snap_realm instead. + * + * Note that the caller is responsible for calling ceph_put_snap_realm() on the + * returned realm. + * + * Callers of this function need to hold mdsc->snap_rwsem. However, if there's + * a need to do an inode lookup, this rwsem will be temporarily dropped. Hence + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false' + * this function will return -EAGAIN; otherwise, the snaprealms walk-through + * will be restarted. + */ +static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, + enum quota_get_realm which_quota, + struct ceph_snap_realm **realmp, bool retry) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_inode_info *ci = NULL; + struct ceph_snap_realm *realm, *next; + struct inode *in; + bool has_quota; + + if (realmp) + *realmp = NULL; + if (ceph_snap(inode) != CEPH_NOSNAP) + return 0; + +restart: + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited_client(cl, + "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + while (realm) { + bool has_inode; + + spin_lock(&realm->inodes_with_caps_lock); + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (has_inode && !in) + break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + if (!retry) + return -EAGAIN; + goto restart; + } + + ci = ceph_inode(in); + has_quota = __ceph_has_quota(ci, which_quota); + iput(in); + + next = realm->parent; + if (has_quota || !next) { + if (realmp) + *realmp = realm; + return 0; + } + + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + if (realm) + ceph_put_snap_realm(mdsc, realm); + + return 0; +} + +bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); + struct ceph_snap_realm *old_realm, *new_realm; + bool is_same; + int ret; + +restart: + /* + * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem. + * However, get_quota_realm may drop it temporarily. By setting the + * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was + * dropped and we can then restart the whole operation. 
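+ *
+ * Note the asymmetry below: the 'old' realm is looked up with
+ * retry=true (it restarts internally and never returns -EAGAIN),
+ * while the 'new' realm uses retry=false so that a dropped rwsem is
+ * reported and both lookups can be redone under the same hold.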
+ */ + down_read(&mdsc->snap_rwsem); + get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true); + ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false); + if (ret == -EAGAIN) { + up_read(&mdsc->snap_rwsem); + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + goto restart; + } + is_same = (old_realm == new_realm); + up_read(&mdsc->snap_rwsem); + + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + if (new_realm) + ceph_put_snap_realm(mdsc, new_realm); + + return is_same; +} + +enum quota_check_op { + QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */ + QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_bytes limit */ + QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_bytes + limit is approaching */ +}; + +/* + * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each + * realm, it will execute the quota check operation defined by the 'op' parameter. + * The snaprealm walk is interrupted if the quota check detects that the quota + * is exceeded or if the root inode is reached. + */ +static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, + loff_t delta) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm, *next; + struct inode *in; + u64 max, rvalue; + bool exceeded = false; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return false; + + down_read(&mdsc->snap_rwsem); +restart: + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited_client(cl, + "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + while (realm) { + bool has_inode; + + spin_lock(&realm->inodes_with_caps_lock); + has_inode = realm->inode; + in = has_inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (has_inode && !in) + break; + if (!in) { + up_read(&mdsc->snap_rwsem); + in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm); + down_read(&mdsc->snap_rwsem); + if (IS_ERR_OR_NULL(in)) + break; + ceph_put_snap_realm(mdsc, realm); + goto restart; + } + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (op == QUOTA_CHECK_MAX_FILES_OP) { + max = ci->i_max_files; + rvalue = ci->i_rfiles + ci->i_rsubdirs; + } else { + max = ci->i_max_bytes; + rvalue = ci->i_rbytes; + } + spin_unlock(&ci->i_ceph_lock); + switch (op) { + case QUOTA_CHECK_MAX_FILES_OP: + case QUOTA_CHECK_MAX_BYTES_OP: + exceeded = (max && (rvalue + delta > max)); + break; + case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP: + if (max) { + if (rvalue >= max) + exceeded = true; + else { + /* + * when we're writing more than 1/16th + * of the available space + */ + exceeded = + (((max - rvalue) >> 4) < delta); + } + } + break; + default: + /* Shouldn't happen */ + pr_warn_client(cl, "Invalid quota check op (%d)\n", op); + exceeded = true; /* Just break the loop */ + } + iput(in); + + next = realm->parent; + if (exceeded || !next) + break; + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + if (realm) + ceph_put_snap_realm(mdsc, realm); + up_read(&mdsc->snap_rwsem); + + return exceeded; +} + +/* + * ceph_quota_is_max_files_exceeded - check if we can create a new file + * @inode: directory where a new file is being created + * + * This function returns true if the max_files quota won't allow a new file to + * be created. It is necessary to walk through the snaprealm hierarchy (up to + * the FS root) to check all realms with quotas set. 
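+ *
+ * For example, with max_files=10 set on a realm and rfiles+rsubdirs
+ * already at 10, a create (delta=1) makes rvalue + delta > max, so
+ * this returns true and callers typically fail with -EDQUOT.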
+ */ +bool ceph_quota_is_max_files_exceeded(struct inode *inode) +{ + if (!ceph_has_realms_with_quotas(inode)) + return false; + + WARN_ON(!S_ISDIR(inode->i_mode)); + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1); +} + +/* + * ceph_quota_is_max_bytes_exceeded - check if we can write to a file + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This function returns true if the max_bytes quota won't allow the file size + * to reach @newsize; it returns false otherwise. + */ +bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize) +{ + loff_t size = i_size_read(inode); + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size)); +} + +/* + * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This function returns true if the new file size @newsize will be consuming + * more than 1/16th of the available quota space; it returns false otherwise. + */ +bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize) +{ + loff_t size = ceph_inode(inode)->i_reported_size; + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP, + (newsize - size)); +} + +/* + * ceph_quota_update_statfs - if root has quota update statfs with quota status + * @fsc: filesystem client instance + * @buf: statfs to update + * + * If the mounted filesystem root has max_bytes quota set, update the filesystem + * statistics with the quota status. + * + * This function returns true if the stats have been updated, false otherwise. + */ +bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm; + struct inode *in; + u64 total = 0, used, free; + bool is_updated = false; + + down_read(&mdsc->snap_rwsem); + get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, + &realm, true); + up_read(&mdsc->snap_rwsem); + if (!realm) + return false; + + spin_lock(&realm->inodes_with_caps_lock); + in = realm->inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (in) { + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (ci->i_max_bytes) { + total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } + /* It is possible for a quota to be exceeded. + * Report 'zero' in that case + */ + free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 
1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } + } + spin_unlock(&ci->i_ceph_lock); + if (total) { + buf->f_blocks = total; + buf->f_bfree = free; + buf->f_bavail = free; + is_updated = true; + } + iput(in); + } + ceph_put_snap_realm(mdsc, realm); + + return is_updated; +} + diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f01645a27752..c65f2b202b2b 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,13 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/sort.h> #include <linux/slab.h> - +#include <linux/iversion.h> #include "super.h" #include "mds_client.h" - #include <linux/ceph/decode.h> +/* unused map expires after 5 minutes */ +#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) + /* * Snapshots in ceph are driven in large part by cooperation from the * client. In contrast to local file systems or file servers that @@ -57,26 +61,26 @@ /* * increase ref count for the realm * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. */ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { - dout("get_realm %p %d -> %d\n", realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)+1); + lockdep_assert_held(&mdsc->snap_rwsem); + /* - * since we _only_ increment realm refs or empty the empty - * list with snap_rwsem held, adjusting the empty list here is - * safe. we do need to protect against concurrent empty list - * additions, however. + * The 0->1 and 1->0 transitions must take the snap_empty_lock + * atomically with the refcount change. Go ahead and bump the + * nref here, unless it's 0, in which case we take the spinlock + * and then do the increment and remove it from the list. */ - if (atomic_read(&realm->nref) == 0) { - spin_lock(&mdsc->snap_empty_lock); - list_del_init(&realm->empty_item); - spin_unlock(&mdsc->snap_empty_lock); - } + if (atomic_inc_not_zero(&realm->nref)) + return; - atomic_inc(&realm->nref); + spin_lock(&mdsc->snap_empty_lock); + if (atomic_inc_return(&realm->nref) == 1) + list_del_init(&realm->empty_item); + spin_unlock(&mdsc->snap_empty_lock); } static void __insert_snap_realm(struct rb_root *root, @@ -112,34 +116,46 @@ static struct ceph_snap_realm *ceph_create_snap_realm( { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + realm = kzalloc(sizeof(*realm), GFP_NOFS); if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 0); /* tree does not take a ref */ + /* Do not release the global dummy snaprealm until unmounting */ + if (ino == CEPH_INO_GLOBAL_SNAPREALM) + atomic_set(&realm->nref, 2); + else + atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); - dout("create_snap_realm %llx %p\n", realm->ino, realm); + mdsc->num_snap_realms++; + + doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm); + return realm; } /* * lookup the realm rooted at @ino. * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. 
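+ *
+ * __lookup_snap_realm() returns the realm without adjusting its
+ * refcount; ceph_lookup_snap_realm() below additionally takes a
+ * reference on behalf of the caller.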
*/ -struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, - u64 ino) +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) { + struct ceph_client *cl = mdsc->fsc->client; struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; + lockdep_assert_held(&mdsc->snap_rwsem); + while (n) { r = rb_entry(n, struct ceph_snap_realm, node); if (ino < r->ino) @@ -147,13 +163,23 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("lookup_snap_realm %llx %p\n", r->ino, r); + doutc(cl, "%llx %p\n", r->ino, r); return r; } } return NULL; } +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); @@ -163,9 +189,13 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, static void __destroy_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + struct ceph_client *cl = mdsc->fsc->client; + lockdep_assert_held_write(&mdsc->snap_rwsem); + + doutc(cl, "%p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); + mdsc->num_snap_realms--; if (realm->parent) { list_del_init(&realm->child_item); @@ -184,28 +214,30 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { - dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + lockdep_assert_held_write(&mdsc->snap_rwsem); + + /* + * We do not require the snap_empty_lock here, as any caller that + * increments the value must hold the snap_rwsem. + */ if (atomic_dec_and_test(&realm->nref)) __destroy_snap_realm(mdsc, realm); } /* - * caller needn't hold any locks + * See comments in ceph_get_snap_realm. Caller needn't hold any locks. 
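+ * The final reference is dropped with atomic_dec_and_lock() on
+ * snap_empty_lock, pairing with the 0->1 transition in
+ * ceph_get_snap_realm() so the empty list stays consistent.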
*/ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { - dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)-1); - if (!atomic_dec_and_test(&realm->nref)) + if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) return; if (down_write_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&mdsc->snap_empty_lock); __destroy_snap_realm(mdsc, realm); up_write(&mdsc->snap_rwsem); } else { - spin_lock(&mdsc->snap_empty_lock); list_add(&realm->empty_item, &mdsc->snap_empty); spin_unlock(&mdsc->snap_empty_lock); } @@ -222,6 +254,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + spin_lock(&mdsc->snap_empty_lock); while (!list_empty(&mdsc->snap_empty)) { realm = list_first_entry(&mdsc->snap_empty, @@ -234,9 +268,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_empty_lock); } -void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { + struct ceph_snap_realm *global_realm; + down_write(&mdsc->snap_rwsem); + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); + if (global_realm) + ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } @@ -253,8 +292,11 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, u64 parentino) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent; + lockdep_assert_held_write(&mdsc->snap_rwsem); + if (realm->parent_ino == parentino) return 0; @@ -264,16 +306,14 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", - realm->ino, realm, realm->parent_ino, realm->parent, - parentino, parent); + doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm, + realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); } realm->parent_ino = parentino; realm->parent = parent; - ceph_get_snap_realm(mdsc, parent); list_add(&realm->child_item, &parent->children); return 1; } @@ -288,11 +328,16 @@ static int cmpu64_rev(const void *a, const void *b) return 0; } + /* * build the snap context for a given realm. 
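+ * Returns 0 on success, 1 if the parent's context must be built
+ * first (the parent is then pushed onto the head of the realm
+ * queue), or a negative error code on failure.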
*/ -static int build_snap_context(struct ceph_snap_realm *realm) +static int build_snap_context(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + struct list_head *realm_queue, + struct list_head *dirty_realms) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; @@ -305,9 +350,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent); - if (err) - goto fail; + /* add to the queue head */ + list_add(&parent->rebuild_item, realm_queue); + return 1; } num += parent->cached_context->num_snaps; } @@ -320,11 +365,10 @@ realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" - " (unchanged)\n", - realm->ino, realm, realm->cached_context, - realm->cached_context->seq, - (unsigned int) realm->cached_context->num_snaps); + doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + (unsigned int)realm->cached_context->num_snaps); return 0; } @@ -361,13 +405,13 @@ sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", - realm->ino, realm, snapc, snapc->seq, - (unsigned int) snapc->num_snaps); + doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm, + snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) - ceph_put_snap_context(realm->cached_context); + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); return 0; fail: @@ -379,23 +423,63 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, - realm, err); + pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err); return err; } /* * rebuild snap context for the given realm and all of its children. */ -static void rebuild_snap_realms(struct ceph_snap_realm *realm) +static void rebuild_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, + struct list_head *dirty_realms) { - struct ceph_snap_realm *child; + struct ceph_client *cl = mdsc->fsc->client; + LIST_HEAD(realm_queue); + int last = 0; + bool skip = false; + + list_add_tail(&realm->rebuild_item, &realm_queue); + + while (!list_empty(&realm_queue)) { + struct ceph_snap_realm *_realm, *child; + + _realm = list_first_entry(&realm_queue, + struct ceph_snap_realm, + rebuild_item); + + /* + * If the last build failed due to a memory + * issue, just empty the realm_queue and return + * to avoid an infinite loop. + */ + if (last < 0) { + list_del_init(&_realm->rebuild_item); + continue; + } - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm); + last = build_snap_context(mdsc, _realm, &realm_queue, + dirty_realms); + doutc(cl, "%llx %p, %s\n", realm->ino, realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); - list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child); + /* is any child in the list ? 
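+ * (a child that is already queued must not be linked a second
+ * time, so the whole child pass is skipped in that case)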
*/ + list_for_each_entry(child, &_realm->children, child_item) { + if (!list_empty(&child->rebuild_item)) { + skip = true; + break; + } + } + + if (!skip) { + list_for_each_entry(child, &_realm->children, child_item) + list_add_tail(&child->rebuild_item, &realm_queue); + } + + /* last == 1 means need to build parent first */ + if (last <= 0) + list_del_init(&_realm->rebuild_item); + } } @@ -420,6 +504,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -435,22 +527,23 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap **pcapsnap) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap; + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_cap_snap *capsnap = *pcapsnap; + struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); - if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); - return; - } - spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -464,72 +557,98 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + doutc(cl, "%p %llx.%llx already pending\n", inode, + ceph_vinop(inode)); + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode, + ceph_vinop(inode)); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there are dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. 
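+ *
+ * has_new_snaps() above only compares the newest snap id against the
+ * old context's seq; the snaps array is kept in descending order, so
+ * that single comparison is enough to detect a new snapshot.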
+ * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n", + inode, ceph_vinop(inode)); + goto update_snapc; + } + } - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; + doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n", + inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); + ihold(inode); - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; - } + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + if (dirty & CEPH_CAP_XATTR_EXCL) { + old_blob = __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + + if (used & CEPH_CAP_FILE_WR) { + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", inode, ceph_vinop(inode), capsnap, + old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. 
*/ + __ceph_finish_cap_snap(ci, capsnap); + } + *pcapsnap = NULL; + old_snapc = NULL; + +update_snapc: + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + doutc(cl, " new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + ceph_buffer_put(old_blob); + ceph_put_snap_context(old_snapc); } /* @@ -543,30 +662,55 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { - struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; BUG_ON(capsnap->writing); - capsnap->size = inode->i_size; - capsnap->mtime = inode->i_mtime; - capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + capsnap->size = i_size_read(inode); + capsnap->mtime = inode_get_mtime(inode); + capsnap->atime = inode_get_atime(inode); + capsnap->ctime = inode_get_ctime(inode); + capsnap->btime = ci->i_btime; + capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; + capsnap->truncate_size = ci->i_truncate_size; + capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size, - capsnap->dirty_pages); + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " + "s=%llu still has %d dirty pages\n", inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, + ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); + return 0; + } + + /* + * Defer flushing the capsnap if the dirty buffers haven't been + * flushed yet, and trigger writeback immediately. + */ + if (ci->i_wrbuffer_ref) { + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " + "s=%llu used WRBUFFER, delaying\n", inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); + ceph_queue_writeback(inode); return 0; } - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", - inode, capsnap, capsnap->context, - capsnap->context->seq, ceph_cap_string(capsnap->dirty), - capsnap->size); + + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + inode, ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); spin_lock(&mdsc->snap_flush_lock); - list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + if (list_empty(&ci->i_snap_flush_item)) { + ihold(inode); + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + } spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } @@ -575,40 +719,54 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, * Queue cap_snaps for snap writeback for this realm and its children. * Called under snap_rwsem, so realm topology won't change. 
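+ *
+ * The inodes_with_caps_lock is dropped around each inode so that
+ * ceph_queue_cap_snap() can take i_ceph_lock; igrab() keeps the
+ * inode alive across the unlocked section.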
*/ -static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *lastinode = NULL; - struct ceph_snap_realm *child; + struct ceph_cap_snap *capsnap = NULL; - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + doutc(cl, "%p %llx inode\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { + struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci); + + /* + * Allocate the capsnap memory outside of ceph_queue_cap_snap() + * to avoid frequent but unnecessary memory allocation/freeing + * in this loop. + */ + if (!capsnap) { + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); + if (!capsnap) { + pr_err_client(cl, + "ENOMEM allocating ceph_cap_snap on %p\n", + inode); + return; + } + } + capsnap->cap_flush.is_capsnap = true; + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + INIT_LIST_HEAD(&capsnap->ci_item); + + ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - - list_for_each_entry(child, &realm->children, child_item) { - dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", - realm, realm->ino, child, child->ino); - list_del_init(&child->dirty_item); - list_add(&child->dirty_item, &realm->dirty_item); - } + iput(lastinode); - list_del_init(&realm->dirty_item); - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); + doutc(cl, "%p %llx done\n", realm, realm->ino); } /* @@ -619,18 +777,28 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) * Caller must hold snap_rwsem for write. 
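+ *
+ * The trace is decoded realm by realm; rebuilding of the snap
+ * contexts is deferred until the end of the trace and then done
+ * top-down from the topmost changed realm (realm_to_rebuild).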
*/ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, - void *p, void *e, bool deletion) + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm; - int invalidate = 0; + struct ceph_snap_realm *first_realm = NULL; + struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; + int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); - dout("update_snap_trace deletion=%d\n", deletion); + lockdep_assert_held_write(&mdsc->snap_rwsem); + + doutc(cl, "deletion=%d\n", deletion); more: + realm = NULL; + rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); @@ -654,11 +822,11 @@ more: err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; - invalidate += err; + rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", - realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino, + realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); @@ -676,29 +844,42 @@ more: if (err < 0) goto fail; - /* queue realm for cap_snap creation */ - list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; - invalidate = 1; + rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("update_snap_trace %llx %p seq %lld new\n", - realm->ino, realm, realm->seq); - invalidate = 1; + doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm, + realm->seq); + rebuild_snapcs = 1; } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", - realm->ino, realm, realm->seq); + doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm, + realm->seq); } - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, - realm, invalidate, p, e); + doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); + + /* + * this will always track the topmost parent realm from which + * we need to rebuild the snapshot contexts _downward_ in the + * hierarchy. + */ + if (rebuild_snapcs) + realm_to_rebuild = realm; + + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms); + + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); if (p < e) goto more; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate) - rebuild_snap_realms(realm); - /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. 
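 * (queue_realm_cap_snaps() reads each inode's
 * i_snap_realm->cached_context, so queueing before the rebuild
 * would capture stale snap contexts)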
@@ -706,16 +887,48 @@ more: while (!list_empty(&dirty_realms)) { realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); - queue_realm_cap_snaps(realm); + list_del_init(&realm->dirty_item); + queue_realm_cap_snaps(mdsc, realm); } + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + __cleanup_empty_realms(mdsc); return 0; bad: - err = -EINVAL; + err = -EIO; fail: - pr_err("update_snap_trace error %d\n", err); + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); + pr_err_client(cl, "error %d\n", err); + + /* + * When receiving a corrupted snap trace we don't know what + * exactly has happened on the MDS side, and we shouldn't continue + * writing to the OSDs, as that may corrupt the snapshot contents. + * + * Just try to blocklist this kclient; it must then be remounted + * to continue, once the corrupted metadata has been fixed on the + * MDS side. + */ + WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) + pr_err_client(cl, "failed to blocklist %s: %d\n", + ceph_pr_addr(&client->msgr.inst.addr), ret); + + WARN(1, "[client.%lld] %s %s%sdo remount to continue%s", + client->monc.auth->global_id, __func__, + ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + ret ? "" : " was blocklisted, ", + err == -EIO ? " after corrupted snaptrace is fixed" : ""); + return err; } @@ -728,33 +941,66 @@ fail: */ static void flush_snaps(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; struct ceph_mds_session *session = NULL; - dout("flush_snaps\n"); + doutc(cl, "begin\n"); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, &session, 0); - spin_unlock(&ci->i_ceph_lock); + ceph_flush_snaps(ci, &session); iput(inode); spin_lock(&mdsc->snap_flush_lock); } spin_unlock(&mdsc->snap_flush_lock); - if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } - dout("flush_snaps done\n"); + ceph_put_mds_session(session); + doutc(cl, "done\n"); } +/** + * ceph_change_snap_realm - change the snap_realm for an inode + * @inode: inode to move to new snap realm + * @realm: new realm to move inode into (may be NULL) + * + * Detach an inode from its old snaprealm (if any) and attach it to + * the new snaprealm (if any). The old snap realm reference held by + * the inode is put. If realm is non-NULL, then the caller's reference + * to it is taken over by the inode. 
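+ *
+ * Caller must hold ci->i_ceph_lock (asserted below).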
+ */ +void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_snap_realm *oldrealm = ci->i_snap_realm; + + lockdep_assert_held(&ci->i_ceph_lock); + + if (oldrealm) { + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + if (oldrealm->ino == ci->i_vino.ino) + oldrealm->inode = NULL; + spin_unlock(&oldrealm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, oldrealm); + } + + ci->i_snap_realm = realm; + + if (realm) { + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; + spin_unlock(&realm->inodes_with_caps_lock); + } +} /* * Handle a snap notification from the MDS. @@ -771,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; @@ -784,6 +1031,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, __le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; + bool close_sessions = false; + + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; /* decode */ if (msg->front.iov_len < sizeof(*h)) @@ -797,12 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, - ceph_snap_op_name(op), split, trace_len); - - mutex_lock(&session->s_mutex); - session->s_seq++; - mutex_unlock(&session->s_mutex); + doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -832,9 +1079,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, if (IS_ERR(realm)) goto out; } - ceph_get_snap_realm(mdsc, realm); - dout("splitting snap_realm %llx %p\n", realm->ino, realm); + doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { struct ceph_vino vino = { .ino = le64_to_cpu(split_inos[i]), @@ -842,7 +1088,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; - struct ceph_snap_realm *oldrealm; if (!inode) continue; @@ -860,28 +1105,17 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p in newer realm %llx %p\n", - inode, ci->i_snap_realm->ino, - ci->i_snap_realm); + doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, + ci->i_snap_realm); goto skip_inode; } - dout(" will move %p to split realm %llx %p\n", - inode, realm->ino, realm); - /* - * Move the inode to the new realm - */ - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - oldrealm = ci->i_snap_realm; - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - spin_unlock(&ci->i_ceph_lock); + doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - + ceph_change_snap_realm(inode, realm); + 
spin_unlock(&ci->i_ceph_lock); iput(inode); continue; @@ -893,20 +1127,37 @@ skip_inode: /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = - ceph_lookup_snap_realm(mdsc, + __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; adjust_snap_realm_parent(mdsc, child, realm->ino); } + } else { + /* + * In the non-split case both 'num_split_inos' and + * 'num_split_realms' should be 0, making this a no-op. + * However the MDS happens to populate 'split_realms' list + * in one of the UPDATE op cases by mistake. + * + * Skip both lists just in case to ensure that 'p' is + * positioned at the start of realm info, as expected by + * ceph_update_snap_trace(). + */ + p += sizeof(u64) * num_split_inos; + p += sizeof(u64) * num_split_realms; } /* * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps. */ - ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY); + if (ceph_update_snap_trace(mdsc, p, e, + op == CEPH_SNAP_OP_DESTROY, + NULL)) { + close_sessions = true; + goto bad; + } if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -917,16 +1168,176 @@ skip_inode: up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: - pr_err("corrupt snap message from mds%d\n", mds); + pr_err_client(cl, "corrupt snap message from mds%d\n", mds); ceph_msg_dump(msg); out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + + ceph_dec_mds_stopping_blocker(mdsc); + + if (close_sessions) + ceph_mdsc_close_sessions(mdsc); return; } +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + doutc(cl, "found snapid map %llx -> %x\n", exist->snap, + exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + doutc(cl, "found snapid map %llx -> %x\n", exist->snap, + exist->dev); + return exist; + } + + doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + 
if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct ceph_snapid_map, lru); + if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err_client(cl, "snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + kfree(sm); + } +} diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0f..e36e8948e728 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Ceph fs string constants */ @@ -41,6 +42,11 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; + case CEPH_SESSION_FORCE_RO: return "force_ro"; + case CEPH_SESSION_REJECT: return "reject"; + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog"; } return "???"; } @@ -52,7 +58,9 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; @@ -72,6 +80,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6627b26a800c..f6bf24b5c683 100644 --- a/fs/ceph/super.c +++ 
b/fs/ceph/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/ceph/ceph_debug.h> @@ -8,7 +9,8 @@ #include <linux/in6.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -17,6 +19,8 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> @@ -24,6 +28,11 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#include <uapi/linux/magic.h> + +static DEFINE_SPINLOCK(ceph_fsc_lock); +static LIST_HEAD(ceph_fsc_list); + /* * Ceph superblock operations * @@ -35,33 +44,30 @@ */ static void ceph_put_super(struct super_block *s) { - struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); - dout("put_super\n"); + doutc(fsc->client, "begin\n"); + ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); - - /* - * ensure we release the bdi before put_anon_super releases - * the device name. - */ - if (s->s_bdi == &fsc->backing_dev_info) { - bdi_unregister(&fsc->backing_dev_info); - s->s_bdi = NULL; - } - - return; + doutc(fsc->client, "done\n"); } static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); - struct ceph_monmap *monmap = fsc->client->monc.monmap; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry)); + struct ceph_mon_client *monc = &fsc->client->monc; struct ceph_statfs st; - u64 fsid; - int err; + int i, err; + u64 data_pool; + + doutc(fsc->client, "begin\n"); + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } - dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, &st); + err = ceph_monc_do_statfs(monc, data_pool, &st); if (err < 0) return err; @@ -69,48 +75,66 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ /* - * express utilization in terms of large blocks to avoid + * Express utilization in terms of large blocks to avoid * overflow on 32-bit machines. - * + */ + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; + + /* + * By default use root quota for stats; fallback to overall filesystem + * usage if using 'noquotadf' mount option or if the root dir doesn't + * have max_bytes quota set. + */ + if (ceph_test_mount_opt(fsc, NOQUOTADF) || + !ceph_quota_update_statfs(fsc, buf)) { + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + } + + /* * NOTE: for the time being, we make bsize == frsize to humor * not-yet-ancient versions of glibc that are broken. * Someday, we will probably want to report a real block * size... whatever that may mean for a network file system! 
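The shift above is straightforward unit math: st.kb arrives in KiB, and assuming CEPH_BLOCK_SHIFT is 22 (4 MiB blocks, per its definition in super.h), dividing by 1 << (CEPH_BLOCK_SHIFT - 10) turns KiB counts into 4 MiB-block counts small enough for 32-bit statfs fields. A worked example (sketch):

    /* assuming CEPH_BLOCK_SHIFT == 22, i.e. 4 MiB report blocks */
    uint64_t kb     = 1ULL << 34;       /* a 16 TiB pool, expressed in KiB */
    uint64_t blocks = kb >> (22 - 10);  /* 1 << 22 blocks of 4 MiB each */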
*/ - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; - buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; - buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bsize = buf->f_frsize; buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - /* leave fsid little-endian, regardless of host endianness */ - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); - buf->f_fsid.val[0] = fsid & 0xffffffff; - buf->f_fsid.val[1] = fsid >> 32; + /* Must convert the fsid, for consistent values across arches */ + buf->f_fsid.val[0] = 0; + mutex_lock(&monc->mutex); + for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i) + buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]); + mutex_unlock(&monc->mutex); + + /* fold the fs_cluster_id into the upper bits */ + buf->f_fsid.val[1] = monc->fs_cluster_id; + doutc(fsc->client, "done\n"); return 0; } - static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; if (!wait) { - dout("sync_fs (non-blocking)\n"); + doutc(cl, "(non-blocking)\n"); ceph_flush_dirty_caps(fsc->mdsc); - dout("sync_fs (non-blocking) done\n"); + ceph_flush_cap_releases(fsc->mdsc); + doutc(cl, "(non-blocking) done\n"); return 0; } - dout("sync_fs (blocking)\n"); + doutc(cl, "(blocking)\n"); ceph_osdc_sync(&fsc->client->osdc); ceph_mdsc_sync(fsc->mdsc); - dout("sync_fs (blocking) done\n"); + doutc(cl, "(blocking) done\n"); return 0; } @@ -123,153 +147,486 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, - Opt_cap_release_safety, + Opt_caps_max, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, - Opt_last_int, /* int args above */ Opt_snapdirname, - Opt_last_string, + Opt_mds_namespace, + Opt_recover_session, + Opt_source, + Opt_mon_addr, + Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, - Opt_nodirstat, Opt_rbytes, - Opt_norbytes, Opt_asyncreaddir, - Opt_noasyncreaddir, Opt_dcache, - Opt_nodcache, Opt_ino32, - Opt_noino32, + Opt_fscache, + Opt_poolperm, + Opt_require_active_mds, + Opt_acl, + Opt_quotadf, + Opt_copyfrom, + Opt_wsync, + Opt_pagecache, + Opt_sparseread, }; -static match_table_t fsopt_tokens = { - {Opt_wsize, "wsize=%d"}, - {Opt_rsize, "rsize=%d"}, - {Opt_rasize, "rasize=%d"}, - {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, - {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, "cap_release_safety=%d"}, - {Opt_readdir_max_entries, "readdir_max_entries=%d"}, - {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, - {Opt_congestion_kb, "write_congestion_kb=%d"}, - /* int args above */ - {Opt_snapdirname, "snapdirname=%s"}, - /* string args above */ - {Opt_dirstat, "dirstat"}, - {Opt_nodirstat, "nodirstat"}, - {Opt_rbytes, "rbytes"}, - {Opt_norbytes, "norbytes"}, - {Opt_asyncreaddir, "asyncreaddir"}, - {Opt_noasyncreaddir, "noasyncreaddir"}, - {Opt_dcache, "dcache"}, - {Opt_nodcache, "nodcache"}, - {Opt_ino32, "ino32"}, - {Opt_noino32, "noino32"}, - {-1, NULL} +enum ceph_recover_session_mode { + ceph_recover_session_no, + ceph_recover_session_clean }; -static int parse_fsopt_token(char *c, void *private) +static const struct constant_table ceph_param_recover[] = { + { "no", ceph_recover_session_no }, + { "clean", 
ceph_recover_session_clean }, + {} +}; + +static const struct fs_parameter_spec ceph_mount_parameters[] = { + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), + fsparam_s32 ("caps_max", Opt_caps_max), + fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), + fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_flag_no ("copyfrom", Opt_copyfrom), + fsparam_flag_no ("dcache", Opt_dcache), + fsparam_flag_no ("dirstat", Opt_dirstat), + fsparam_flag_no ("fsc", Opt_fscache), // fsc|nofsc + fsparam_string ("fsc", Opt_fscache), // fsc=... + fsparam_flag_no ("ino32", Opt_ino32), + fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_string ("mon_addr", Opt_mon_addr), + fsparam_flag_no ("poolperm", Opt_poolperm), + fsparam_flag_no ("quotadf", Opt_quotadf), + fsparam_u32 ("rasize", Opt_rasize), + fsparam_flag_no ("rbytes", Opt_rbytes), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_enum ("recover_session", Opt_recover_session, ceph_param_recover), + fsparam_flag_no ("require_active_mds", Opt_require_active_mds), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string ("snapdirname", Opt_snapdirname), + fsparam_string ("source", Opt_source), + fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_u32 ("wsize", Opt_wsize), + fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), + fsparam_flag_no ("sparseread", Opt_sparseread), + {} +}; + +struct ceph_parse_opts_ctx { + struct ceph_options *copts; + struct ceph_mount_options *opts; +}; + +/* + * Remove adjacent slashes and then the trailing slash, unless it is + * the only remaining character. + * + * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/". 
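The examples in the comment above are easy to verify against the implementation that follows: a slash is emitted only when the previously emitted character was not already a slash, and a trailing slash survives only when it is the whole string. The same algorithm, lifted into a standalone userspace test (a sketch; canon() mirrors the canonicalize_path() shown below):

    #include <assert.h>
    #include <string.h>

    static void canon(char *path)
    {
            int i, j = 0;

            for (i = 0; path[i] != '\0'; i++) {
                    if (path[i] != '/' || j < 1 || path[j - 1] != '/')
                            path[j++] = path[i];
            }
            if (j > 1 && path[j - 1] == '/')
                    j--;
            path[j] = '\0';
    }

    int main(void)
    {
            char a[] = "//dir1////dir2///";
            char b[] = "///";

            canon(a);
            canon(b);
            assert(!strcmp(a, "/dir1/dir2"));
            assert(!strcmp(b, "/"));
            return 0;
    }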
+ */ +static void canonicalize_path(char *path) { - struct ceph_mount_options *fsopt = private; - substring_t argstr[MAX_OPT_ARGS]; - int token, intval, ret; + int i, j = 0; - token = match_token((char *)c, fsopt_tokens, argstr); - if (token < 0) + for (i = 0; path[i] != '\0'; i++) { + if (path[i] != '/' || j < 1 || path[j - 1] != '/') + path[j++] = path[i]; + } + + if (j > 1 && path[j - 1] == '/') + j--; + path[j] = '\0'; +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_options *opts = pctx->copts; + struct ceph_mount_options *fsopt = pctx->opts; + const char *name_start = dev_name; + const char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); return -EINVAL; + } - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); - return ret; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + len = fsid_start - name_start; + kfree(opts->name); + opts->name = kstrndup(name_start, len, GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + dout("using %s entity name", opts->name); + + ++fsid_start; /* start of cluster fsid */ + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. 
+ * + * New device syntax will looks like: + * <device_spec>=/<path> + * where + * <device_spec> is name@fsid.fsname + * <path> is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) + * + * Old device syntax is: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ +static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *dev_name = param->string, *dev_name_end; + int ret; + + dout("'%s'\n", dev_name); + if (!dev_name || !*dev_name) + return invalfc(fc, "Empty source"); + + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* + * The server_path will include the whole chars from userland + * including the leading '/'. + */ + kfree(fsopt->server_path); + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; + + canonicalize_path(fsopt->server_path); } else { - dout("got token %d\n", token); + dev_name_end = dev_name + strlen(dev_name); } + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); + + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); + + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } + + fc->source = param->string; + param->string = NULL; + return 0; +} + +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + +static int ceph_parse_mount_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct fs_parse_result result; + unsigned int mode; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, fc->log.log); + if (ret != -ENOPARAM) + return ret; + + token = fs_parse(fc, ceph_mount_parameters, param, &result); + dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token); + if (token < 0) + return token; + switch (token) { case Opt_snapdirname: + if (strlen(param->string) > NAME_MAX) + return invalfc(fc, "snapdirname too long"); kfree(fsopt->snapdir_name); - fsopt->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->snapdir_name) - return -ENOMEM; + fsopt->snapdir_name = param->string; + param->string = NULL; break; - - /* misc */ + case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = param->string; + param->string = NULL; + break; + case Opt_recover_session: + mode = result.uint_32; + if (mode == ceph_recover_session_no) + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + else if (mode == ceph_recover_session_clean) + 
fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + else + BUG(); + break; + case Opt_source: + if (fc->source) + return invalfc(fc, "Multiple sources specified"); + return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: - fsopt->wsize = intval; + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_WRITE_SIZE) + goto out_of_range; + fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rsize: - fsopt->rsize = intval; + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_READ_SIZE) + goto out_of_range; + fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rasize: - fsopt->rasize = intval; + fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: - fsopt->caps_wanted_delay_min = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_min = result.uint_32; break; case Opt_caps_wanted_delay_max: - fsopt->caps_wanted_delay_max = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_max = result.uint_32; + break; + case Opt_caps_max: + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: - fsopt->max_readdir = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->max_readdir = result.uint_32; break; case Opt_readdir_max_bytes: - fsopt->max_readdir_bytes = intval; + if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) + goto out_of_range; + fsopt->max_readdir_bytes = result.uint_32; break; case Opt_congestion_kb: - fsopt->congestion_kb = intval; + if (result.uint_32 < 1024) /* at least 1M */ + goto out_of_range; + fsopt->congestion_kb = result.uint_32; break; case Opt_dirstat: - fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; - break; - case Opt_nodirstat: - fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; break; case Opt_rbytes: - fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; - break; - case Opt_norbytes: - fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; case Opt_asyncreaddir: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; - break; - case Opt_noasyncreaddir: - fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; case Opt_dcache: - fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; - break; - case Opt_nodcache: - fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; break; case Opt_ino32: - fsopt->flags |= CEPH_MOUNT_OPT_INO32; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; + + case Opt_fscache: +#ifdef CONFIG_CEPH_FSCACHE + kfree(fsopt->fscache_uniq); + fsopt->fscache_uniq = NULL; + if (result.negated) { + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + } else { + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + fsopt->fscache_uniq = param->string; + param->string = NULL; + } + break; +#else + return invalfc(fc, "fscache support is disabled"); +#endif + case Opt_poolperm: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; + case Opt_require_active_mds: + if (!result.negated) + 
fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + else + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + break; + case Opt_quotadf: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + break; + case Opt_copyfrom: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + break; + case Opt_acl: + if (!result.negated) { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalfc(fc, "POSIX ACL support is disabled"); +#endif + } else { + fc->sb_flags &= ~SB_POSIXACL; + } + break; + case Opt_wsync: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; + else + fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; - case Opt_noino32: - fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; + case Opt_sparseread: + if (result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; + else + fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); + ret = fscrypt_parse_test_dummy_encryption(param, + &fsopt->dummy_enc_policy); + if (ret == -EINVAL) { + warnfc(fc, "Value of option \"%s\" is unrecognized", + param->key); + } else if (ret == -EEXIST) { + warnfc(fc, "Conflicting test_dummy_encryption options"); + ret = -EINVAL; + } +#else + warnfc(fc, + "FS encryption not supported: test_dummy_encryption mount option ignored"); +#endif break; default: - BUG_ON(token); + BUG(); } return 0; + +out_of_range: + return invalfc(fc, "%s out of range", param->key); } static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); + if (!args) + return; + kfree(args->snapdir_name); + kfree(args->mds_namespace); + kfree(args->server_path); + kfree(args->fscache_uniq); + kfree(args->mon_addr); + fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } @@ -301,84 +658,23 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; - return ceph_compare_options(new_opt, fsc->client); -} - -static int parse_mount_options(struct ceph_mount_options **pfsopt, - struct ceph_options **popt, - int flags, char *options, - const char *dev_name, - const char **path) -{ - struct ceph_mount_options *fsopt; - const char *dev_name_end; - int err; - - if (!dev_name || !*dev_name) - return -EINVAL; - - fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); - if (!fsopt) - return -ENOMEM; - - dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - - fsopt->rsize = CEPH_RSIZE_DEFAULT; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); + if (ret) + return ret; - /* - * Distinguish the server list from the path in "dev_name". - * Internally we do not include the leading '/' in the path. 
- * - * "dev_name" will look like: - * <server_spec>[,<server_spec>...]:[<path>] - * where - * <server_spec> is <ip>[:<port>] - * <path> is optional, but if present must begin with '/' - */ - dev_name_end = strchr(dev_name, '/'); - if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; - } else { - /* path is empty */ - dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; - } - err = -EINVAL; - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') { - pr_err("device name is missing path (no : separator in %s)\n", - dev_name); - goto out; - } - dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; - *popt = ceph_parse_options(options, dev_name, dev_name_end, - parse_fsopt_token, (void *)fsopt); - if (IS_ERR(*popt)) { - err = PTR_ERR(*popt); - goto out; - } + ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); + if (ret) + return ret; - /* success */ - *pfsopt = fsopt; - return 0; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; -out: - destroy_mount_options(fsopt); - return err; + return ceph_compare_options(new_opt, fsc->client); } /** @@ -388,64 +684,93 @@ out: */ static int ceph_show_options(struct seq_file *m, struct dentry *root) { - struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; - struct ceph_options *opt = fsc->client->options; - - if (opt->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &opt->fsid); - if (opt->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (opt->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - - if (opt->name) - seq_printf(m, ",name=%s", opt->name); - if (opt->key) - seq_puts(m, ",secret=<hidden>"); - - if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); - if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - opt->osd_keepalive_timeout); + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client, false); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); - if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) - seq_puts(m, ",norbytes"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES)) + seq_puts(m, ",rbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) - seq_puts(m, ",dcache"); - else + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_INO32) + seq_puts(m, ",ino32"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { + seq_show_option(m, "fsc", fsopt->fscache_uniq); + } + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) + seq_puts(m, ",noquotadf"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (root->d_sb->s_flags & SB_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif + + if 
((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) + seq_puts(m, ",copyfrom"); - if (fsopt->wsize) - seq_printf(m, ",wsize=%d", fsopt->wsize); - if (fsopt->rsize != CEPH_RSIZE_DEFAULT) - seq_printf(m, ",rsize=%d", fsopt->rsize); + /* dump mds_namespace when old device syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) + seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + + if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) + seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + seq_puts(m, ",sparseread"); + + fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); + + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) + seq_printf(m, ",wsize=%u", fsopt->wsize); + if (fsopt->rsize != CEPH_MAX_READ_SIZE) + seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) - seq_printf(m, ",rasize=%d", fsopt->rasize); + seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); + if (fsopt->caps_max) + seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", + seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", + seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - fsopt->cap_release_safety); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); + return 0; } @@ -460,9 +785,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(fsc->mdsc, msg); + ceph_mdsc_handle_mdsmap(fsc->mdsc, msg); + return 0; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(fsc->mdsc, msg); return 0; - default: return -1; } @@ -470,106 +797,106 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) /* * create a new fs client + * + * Success or not, this function consumes @fsopt and @opt. 
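The "consumes @fsopt and @opt" note above is the ownership contract the new mount path depends on: the constructor frees both option structs itself on failure, so the caller must forget its pointers immediately after the call, success or not. That is what ceph_get_tree does further down in this patch (a sketch of the caller side):

    fsc = create_fs_client(pctx->opts, pctx->copts);
    pctx->opts  = NULL;     /* ownership transferred, success or failure */
    pctx->copts = NULL;
    if (IS_ERR(fsc))
            return PTR_ERR(fsc);   /* no cleanup here: the callee did it */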
*/ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; - int page_count; - size_t size; - int err = -ENOMEM; + int err; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); - if (!fsc) - return ERR_PTR(-ENOMEM); + if (!fsc) { + err = -ENOMEM; + goto fail; + } - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; } + opt = NULL; /* fsc->client now owns this */ + fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->monc.want_mdsmap = 1; + ceph_set_opt(fsc->client, ABORT_ON_FULL); + + if (!fsopt->mds_namespace) { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + } else { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP, + 0, false); + } fsc->mount_options = fsopt; fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; + fsc->have_copy_from2 = true; atomic_long_set(&fsc->writeback_count, 0); - - err = bdi_init(&fsc->backing_dev_info); - if (err < 0) - goto fail_client; + fsc->write_congested = false; err = -ENOMEM; /* * The number of concurrent works can be high but they don't need * to be processed in parallel, limit concurrency. */ - fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); - if (fsc->wb_wq == NULL) - goto fail_bdi; - fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); - if (fsc->pg_inv_wq == NULL) - goto fail_wb_wq; - fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); - if (fsc->trunc_wq == NULL) - goto fail_pg_inv_wq; - - /* set up mempools */ - err = -ENOMEM; - page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; - size = sizeof (struct page *) * (page_count ? 
page_count : 1); - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); - if (!fsc->wb_pagevec_pool) - goto fail_trunc_wq; + fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); + if (!fsc->inode_wq) + goto fail_client; + fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1); + if (!fsc->cap_wq) + goto fail_inode_wq; + + hash_init(fsc->async_unlink_conflict); + spin_lock_init(&fsc->async_unlink_conflict_lock); - /* caps */ - fsc->min_caps = fsopt->max_readdir; + spin_lock(&ceph_fsc_lock); + list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); + spin_unlock(&ceph_fsc_lock); return fsc; -fail_trunc_wq: - destroy_workqueue(fsc->trunc_wq); -fail_pg_inv_wq: - destroy_workqueue(fsc->pg_inv_wq); -fail_wb_wq: - destroy_workqueue(fsc->wb_wq); -fail_bdi: - bdi_destroy(&fsc->backing_dev_info); +fail_inode_wq: + destroy_workqueue(fsc->inode_wq); fail_client: ceph_destroy_client(fsc->client); fail: kfree(fsc); + if (opt) + ceph_destroy_options(opt); + destroy_mount_options(fsopt); return ERR_PTR(err); } -static void destroy_fs_client(struct ceph_fs_client *fsc) +static void flush_fs_workqueues(struct ceph_fs_client *fsc) { - dout("destroy_fs_client %p\n", fsc); + flush_workqueue(fsc->inode_wq); + flush_workqueue(fsc->cap_wq); +} - destroy_workqueue(fsc->wb_wq); - destroy_workqueue(fsc->pg_inv_wq); - destroy_workqueue(fsc->trunc_wq); +static void destroy_fs_client(struct ceph_fs_client *fsc) +{ + doutc(fsc->client, "%p\n", fsc); - bdi_destroy(&fsc->backing_dev_info); + spin_lock(&ceph_fsc_lock); + list_del(&fsc->metric_wakeup); + spin_unlock(&ceph_fsc_lock); - mempool_destroy(fsc->wb_pagevec_pool); + ceph_mdsc_destroy(fsc); + destroy_workqueue(fsc->inode_wq); + destroy_workqueue(fsc->cap_wq); destroy_mount_options(fsc->mount_options); - ceph_fs_debugfs_cleanup(fsc); - ceph_destroy_client(fsc->client); kfree(fsc); - dout("destroy_fs_client %p done\n", fsc); + dout("%s: %p done\n", __func__, fsc); } /* @@ -577,49 +904,84 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_snap_cachep; +struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; +struct kmem_cache *ceph_dir_file_cachep; +struct kmem_cache *ceph_mds_request_cachep; +mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); + inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) { + int error = -ENOMEM; + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, ceph_inode_init_once); - if (ceph_inode_cachep == NULL) + if (!ceph_inode_cachep) return -ENOMEM; - ceph_cap_cachep = KMEM_CACHE(ceph_cap, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) + ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0); + if (!ceph_cap_cachep) goto bad_cap; + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0); + if (!ceph_cap_snap_cachep) + goto bad_cap_snap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT); + if (!ceph_cap_flush_cachep) + goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) + SLAB_RECLAIM_ACCOUNT); + if (!ceph_dentry_cachep) goto bad_dentry; - ceph_file_cachep = 
KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) + ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0); + if (!ceph_file_cachep) goto bad_file; + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0); + if (!ceph_dir_file_cachep) + goto bad_dir_file; + + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0); + if (!ceph_mds_request_cachep) + goto bad_mds_req; + + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, + (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *)); + if (!ceph_wb_pagevec_pool) + goto bad_pagevec_pool; + return 0; +bad_pagevec_pool: + kmem_cache_destroy(ceph_mds_request_cachep); +bad_mds_req: + kmem_cache_destroy(ceph_dir_file_cachep); +bad_dir_file: + kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: + kmem_cache_destroy(ceph_cap_snap_cachep); +bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; + return error; } static void destroy_caches(void) @@ -629,32 +991,45 @@ static void destroy_caches(void) * destroy cache. */ rcu_barrier(); + kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_snap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + kmem_cache_destroy(ceph_dir_file_cachep); + kmem_cache_destroy(ceph_mds_request_cachep); + mempool_destroy(ceph_wb_pagevec_pool); } +static void __ceph_umount_begin(struct ceph_fs_client *fsc) +{ + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} /* - * ceph_umount_begin - initiate forced umount. Tear down down the + * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). 
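Note how __ceph_umount_begin above invalidates open files without walking them: it just bumps fsc->filp_gen. The usual generation-counter idiom, and presumably what the filp_gen consumers do, is to have each open file sample the counter at open time and compare on later operations (a sketch with illustrative types):

    struct client    { int filp_gen; };    /* bumped on forced umount */
    struct open_file { int gen; };         /* sampled when the file opens */

    static int file_still_valid(struct client *c, struct open_file *f)
    {
            return f->gen == c->filp_gen;  /* stale once the counter moves */
    }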
*/ -static void ceph_umount_begin(struct super_block *sb) +void ceph_umount_begin(struct super_block *sb) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + + doutc(fsc->client, "starting forced umount\n"); - dout("ceph_umount_begin - starting forced umount\n"); - if (!fsc) - return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; - return; + __ceph_umount_begin(fsc); } static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, + .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, + .drop_inode = inode_just_drop, + .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -670,40 +1045,40 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; /* open dir */ - dout("open_root_inode opening '%s'\n", path); + doutc(cl, "opening '%s'\n", path); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; - dout("open_root_inode success\n"); - if (ceph_ino(inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) { - root = d_make_root(inode); - if (!root) { - root = ERR_PTR(-ENOMEM); - goto out; - } - } else { - root = d_obtain_alias(inode); + doutc(cl, "success\n"); + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; } - ceph_init_dentry(root); - dout("open_root_inode success, root dentry is %p\n", root); + doutc(cl, "success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } @@ -712,127 +1087,178 @@ out: return root; } +#ifdef CONFIG_FS_ENCRYPTION +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + struct ceph_fs_client *fsc = sb->s_fs_info; + if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) + return 0; + /* No changing encryption context on remount. */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + /* Also make sure fsopt doesn't contain a conflicting value. 
*/ + if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + + fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; + memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); + + warnfc(fc, "test_dummy_encryption mode enabled"); + return 0; +} +#else +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + return 0; +} +#endif /* * mount: join the ceph cluster, and open root directory. */ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) + struct fs_context *fc) { + struct ceph_client *cl = fsc->client; int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; - int first = 0; /* first vfsmount for this super_block */ - dout("mount start\n"); + doutc(cl, "mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); - err = __ceph_open_session(fsc->client, started); - if (err < 0) - goto out; + if (!fsc->sb->s_root) { + const char *path = fsc->mount_options->server_path ? + fsc->mount_options->server_path + 1 : ""; - dout("mount opening root\n"); - root = open_root_dentry(fsc, "", started); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - if (fsc->sb->s_root) { - dput(root); - } else { - fsc->sb->s_root = root; - first = 1; - - err = ceph_fs_debugfs_init(fsc); + err = __ceph_open_session(fsc->client); if (err < 0) - goto fail; - } + goto out; + + /* setup fscache */ + if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { + err = ceph_fscache_register_fs(fsc, fc); + if (err < 0) + goto out; + } + + err = ceph_apply_test_dummy_encryption(fsc->sb, fc, + fsc->mount_options); + if (err) + goto out; + + doutc(cl, "mount opening path '%s'\n", path); + + ceph_fs_debugfs_init(fsc); - if (path[0] == 0) { - dget(root); - } else { - dout("mount opening base mountpoint\n"); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); - goto fail; + goto out; } + fsc->sb->s_root = dget(root); + } else { + root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; - dout("mount success\n"); + doutc(cl, "mount success\n"); mutex_unlock(&fsc->client->mount_mutex); return root; out: mutex_unlock(&fsc->client->mount_mutex); + ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); - -fail: - if (first) { - dput(fsc->sb->s_root); - fsc->sb->s_root = NULL; - } - goto out; } -static int ceph_set_super(struct super_block *s, void *data) +static int ceph_set_super(struct super_block *s, struct fs_context *fc) { - struct ceph_fs_client *fsc = data; + struct ceph_fs_client *fsc = s->s_fs_info; + struct ceph_client *cl = fsc->client; int ret; - dout("set_super %p data %p\n", s, data); + doutc(cl, "%p\n", s); - s->s_flags = fsc->mount_options->sb_flags; - s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + s->s_maxbytes = MAX_LFS_FILESIZE; - s->s_fs_info = fsc; + s->s_xattr = ceph_xattr_handlers; fsc->sb = s; + fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ s->s_op = &ceph_super_ops; + set_default_d_op(s, &ceph_dentry_ops); s->s_export_op = &ceph_export_ops; - s->s_time_gran = 1000; /* 1000 ns == 1 us */ + s->s_time_gran = 1; + s->s_time_min = 0; + s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; + s->s_magic = CEPH_SUPER_MAGIC; - ret = set_anon_super(s, 
NULL); /* what is that second arg for? */ - if (ret != 0) - goto fail; - - return ret; + ceph_fscrypt_set_ops(s); -fail: - s->s_fs_info = NULL; - fsc->sb = NULL; + ret = set_anon_super_fc(s, fc); + if (ret != 0) + fsc->sb = NULL; return ret; } /* * share superblock if same fs AND options */ -static int ceph_compare_super(struct super_block *sb, void *data) +static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) { - struct ceph_fs_client *new = data; + struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; - struct ceph_fs_client *other = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; - dout("ceph_compare_super %p\n", sb); + doutc(cl, "%p\n", sb); - if (compare_mount_options(fsopt, opt, other)) { - dout("monitor(s)/mount options don't match\n"); + if (compare_mount_options(fsopt, opt, fsc)) { + doutc(cl, "monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && - ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { - dout("fsid doesn't match\n"); + ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { + doutc(cl, "fsid doesn't match\n"); + return 0; + } + if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { + doutc(cl, "flags differ\n"); + return 0; + } + + if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { + doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n"); return 0; } - if (fsopt->sb_flags != other->mount_options->sb_flags) { - dout("flags differ\n"); + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + doutc(cl, "client has been forcibly unmounted\n"); return 0; } + return 1; } @@ -841,124 +1267,361 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, - struct ceph_fs_client *fsc) +static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) { int err; + err = super_setup_bdi_name(sb, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (err) + return err; + /* set ra_pages based on rasize mount option? 
*/ - if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) - fsc->backing_dev_info.ra_pages = - (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) - >> PAGE_SHIFT; - else - fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", - atomic_long_inc_return(&bdi_seq)); - if (!err) - sb->s_bdi = &fsc->backing_dev_info; - return err; + /* set io_pages based on max osd read size */ + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; + + return 0; } -static struct dentry *ceph_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ceph_get_tree(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; + int (*compare_super)(struct super_block *, struct fs_context *) = + ceph_compare_super; int err; - int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; - struct ceph_mount_options *fsopt = NULL; - struct ceph_options *opt = NULL; - - dout("ceph_mount\n"); - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); - if (err < 0) { - res = ERR_PTR(err); - goto out_final; - } + + dout("ceph_get_tree\n"); + + if (!fc->source) + return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ - fsc = create_fs_client(fsopt, opt); + fsc = create_fs_client(pctx->opts, pctx->copts); + pctx->opts = NULL; + pctx->copts = NULL; if (IS_ERR(fsc)) { - res = ERR_CAST(fsc); - destroy_mount_options(fsopt); - ceph_destroy_options(opt); + err = PTR_ERR(fsc); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out; - } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); + + fc->s_fs_info = fsc; + sb = sget_fc(fc, compare_super, ceph_set_super); + fc->s_fs_info = NULL; if (IS_ERR(sb)) { - res = ERR_CAST(sb); + err = PTR_ERR(sb); goto out; } - if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); + if (ceph_sb_to_fs_client(sb) != fsc) { destroy_fs_client(fsc); - fsc = ceph_sb_to_client(sb); + fsc = ceph_sb_to_fs_client(sb); dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); - err = ceph_register_bdi(sb, fsc); - if (err < 0) { - res = ERR_PTR(err); + err = ceph_setup_bdi(sb, fsc); + if (err < 0) goto out_splat; - } } - res = ceph_real_mount(fsc, path); - if (IS_ERR(res)) + res = ceph_real_mount(fsc, fc); + if (IS_ERR(res)) { + err = PTR_ERR(res); goto out_splat; - dout("root %p inode %p ino %llx.%llx\n", res, - res->d_inode, ceph_vinop(res->d_inode)); - return res; + } + + doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res, + d_inode(res), ceph_vinop(d_inode(res))); + fc->root = fsc->sb->s_root; + return 0; out_splat: + if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) { + pr_info("No mds server is up or the cluster is laggy\n"); + err = -EHOSTUNREACH; + } + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: - dout("ceph_mount fail %ld\n", PTR_ERR(res)); - return res; + dout("ceph_get_tree fail %d\n", err); + return err; +} + +static 
void ceph_free_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + + if (pctx) { + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + } +} + +static int ceph_reconfigure_fc(struct fs_context *fc) +{ + int err; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct super_block *sb = fc->root->d_sb; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + + err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); + if (err) + return err; + + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + ceph_set_mount_opt(fsc, ASYNC_DIROPS); + else + ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + ceph_set_mount_opt(fsc, SPARSEREAD); + else + ceph_clear_mount_opt(fsc, SPARSEREAD); + + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice_client(fsc->client, + "monitor addresses recorded, but not used for reconnection"); + } + + sync_filesystem(sb); + return 0; +} + +static const struct fs_context_operations ceph_context_ops = { + .free = ceph_free_fc, + .parse_param = ceph_parse_mount_param, + .get_tree = ceph_get_tree, + .reconfigure = ceph_reconfigure_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int ceph_init_fs_context(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx; + struct ceph_mount_options *fsopt; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return -ENOMEM; + + pctx->copts = ceph_alloc_options(); + if (!pctx->copts) + goto nomem; + + pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); + if (!pctx->opts) + goto nomem; + + fsopt = pctx->opts; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto nomem; + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + + fc->fs_private = pctx; + fc->ops = &ceph_context_ops; + return 0; + +nomem: + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + return -ENOMEM; +} + +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. 
+ */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + +/* For data IO requests */ +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); } static void ceph_kill_sb(struct super_block *s) { - struct ceph_fs_client *fsc = ceph_sb_to_client(s); - dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); - kill_anon_super(s); /* will call put_super after sb is r/o */ - ceph_mdsc_destroy(fsc); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); + struct ceph_client *cl = fsc->client; + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; + + doutc(cl, "%p\n", s); + + ceph_mdsc_pre_umount(mdsc); + flush_fs_workqueues(fsc); + + /* + * Though the kill_anon_super() will finally trigger the + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This allows us to drop + * any further messages from the MDSs, which would only bump + * the inodes' i_count reference counters and no longer make + * sense. + * + * Without this, evicting the inodes may fail in + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly a + * further crash in the ceph module when the iput() tries to + * evict the inodes later. 
+ */ + sync_filesystem(s); + + if (atomic64_read(&mdsc->dirty_folios) > 0) { + wait_queue_head_t *wq = &mdsc->flush_end_wq; + long timeleft = wait_event_killable_timeout(*wq, + atomic64_read(&mdsc->dirty_folios) <= 0, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + kill_anon_super(s); + + fsc->client->extra_mon_dispatch = NULL; + ceph_fs_debugfs_cleanup(fsc); + + ceph_fscache_unregister_fs(fsc); + destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .mount = ceph_mount, + .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, - .fs_flags = FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ceph"); -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + int err = 0; + + fsc->mount_state = CEPH_MOUNT_RECOVER; + __ceph_umount_begin(fsc); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blocklisted. 
This also resets + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blocklisted = false; + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} static int __init init_ceph(void) { @@ -966,17 +1629,16 @@ static int __init init_ceph(void) if (ret) goto out; - ceph_xattr_init(); + ceph_flock_init(); ret = register_filesystem(&ceph_fs_type); if (ret) - goto out_icache; + goto out_caches; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; -out_icache: - ceph_xattr_exit(); +out_caches: destroy_caches(); out: return ret; @@ -986,10 +1648,53 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); - ceph_xattr_exit(); destroy_caches(); } +static int param_set_metrics(const char *val, const struct kernel_param *kp) +{ + struct ceph_fs_client *fsc; + int ret; + + ret = param_set_bool(val, kp); + if (ret) { + pr_err("Failed to parse sending metrics switch value '%s'\n", + val); + return ret; + } else if (!disable_send_metrics) { + // wake up all the mds clients + spin_lock(&ceph_fsc_lock); + list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) { + metric_schedule_delayed(&fsc->mdsc->metric); + } + spin_unlock(&ceph_fsc_lock); + } + + return 0; +} + +static const struct kernel_param_ops param_ops_metrics = { + .set = param_set_metrics, + .get = param_get_bool, +}; + +bool disable_send_metrics = false; +module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644); +MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); + +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444); + +bool enable_unsafe_idmap = false; +module_param(enable_unsafe_idmap, bool, 0644); +MODULE_PARM_DESC(enable_unsafe_idmap, + "Allow using idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID"); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cbded572345e..a1f781c46b41 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,9 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/osd_client.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <linux/backing-dev.h> #include <linux/completion.h> #include <linux/exportfs.h> @@ -13,48 +15,81 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/posix_acl.h> +#include <linux/refcount.h> +#include <linux/security.h> +#include <linux/netfs.h> +#include <linux/fscache.h> +#include <linux/hashtable.h> #include <linux/ceph/libceph.h> - -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 +#include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. 
*/ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ - -#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ +#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ +#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ +#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ +#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ +#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */ + +#define CEPH_MOUNT_OPT_DEFAULT \ + (CEPH_MOUNT_OPT_DCACHE | \ + CEPH_MOUNT_OPT_NOCOPYFROM | \ + CEPH_MOUNT_OPT_ASYNC_DIROPS) #define ceph_set_mount_opt(fsc, opt) \ - (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt +#define ceph_clear_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT 0 /* max read size */ -#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +/* max size of osd read request, limited by libceph */ +#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN +/* osd has a configurable limit on the max write size. + * CEPH_MSG_MAX_DATA_LEN should be small enough. */ +#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN +#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the opportunity to + * update the mds to avoid sending another message later. 
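+ * + * (With the defaults below, for example, a cap release is delayed at + * least 5 seconds and at most 60 seconds.)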
+ */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + struct ceph_mount_options { - int flags; - int sb_flags; - - int wsize; /* max write size */ - int rsize; /* max read size */ - int rasize; /* max readahead */ - int congestion_kb; /* max writeback in flight */ - int caps_wanted_delay_min, caps_wanted_delay_max; - int cap_release_safety; - int max_readdir; /* max readdir result (entires) */ - int max_readdir_bytes; /* max readdir result (bytes) */ + unsigned int flags; + + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; + int caps_max; + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + + bool new_dev_syntax; /* * everything above this point can be memcmp'd; everything below @@ -62,36 +97,85 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *mds_namespace; /* default NULL */ + char *server_path; /* default NULL (means "/") */ + char *fscache_uniq; /* default NULL */ + char *mon_addr; + struct fscrypt_dummy_policy dummy_enc_policy; }; +/* + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static inline int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +/* mount state */ +enum { + CEPH_MOUNT_MOUNTING, + CEPH_MOUNT_MOUNTED, + CEPH_MOUNT_UNMOUNTING, + CEPH_MOUNT_UNMOUNTED, + CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, + CEPH_MOUNT_FENCE_IO, +}; + +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 + struct ceph_fs_client { struct super_block *sb; + struct list_head metric_wakeup; + struct ceph_mount_options *mount_options; struct ceph_client *client; - unsigned long mount_state; - int min_caps; /* min caps i added */ + int mount_state; + + bool blocklisted; + + bool have_copy_from2; + + u32 filp_gen; + loff_t max_file_size; struct ceph_mds_client *mdsc; - /* writeback */ - mempool_t *wb_pagevec_pool; - struct workqueue_struct *wb_wq; - struct workqueue_struct *pg_inv_wq; - struct workqueue_struct *trunc_wq; atomic_long_t writeback_count; + bool write_congested; - struct backing_dev_info backing_dev_info; + struct workqueue_struct *inode_wq; + struct workqueue_struct *cap_wq; + + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); + spinlock_t async_unlink_conflict_lock; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_status; + struct dentry *debugfs_mds_sessions; + struct dentry *debugfs_metrics_dir; #endif -}; +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_volume *fscache; +#endif +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_dummy_policy fsc_dummy_enc_policy; +#endif +}; /* * File i/o capability. 
This tracks shared state with the metadata @@ -108,20 +192,41 @@ struct ceph_cap { struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ - int mds; u64 cap_id; /* unique cap id (mds provided) */ - int issued; /* latest, from the mds */ - int implemented; /* implemented superset of issued (for revocation) */ - int mds_wanted; + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds; /* mds index for this cap */ + int mds_wanted; /* caps wanted from this mds */ + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; struct list_head caps_item; }; -#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ -#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ -#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ +#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ +#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ +#define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */ + +struct ceph_cap_flush { + u64 tid; + int caps; + bool wake; /* wake up flush waiters when finished? */ + bool is_capsnap; /* true means capsnap */ + struct list_head g_list; // global + struct list_head i_list; // per inode +}; /* * Snapped cap state that is pending flush to mds. When a snapshot occurs, @@ -129,11 +234,12 @@ struct ceph_cap { * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { - atomic_t nref; - struct ceph_inode_info *ci; - struct list_head ci_item, flushing_item; + refcount_t nref; + struct list_head ci_item; - u64 follows, flush_tid; + struct ceph_cap_flush cap_flush; + + u64 follows; int issued, dirty; struct ceph_snap_context *context; @@ -145,18 +251,23 @@ struct ceph_cap_snap { u64 xattr_version; u64 size; - struct timespec mtime, atime, ctime; + u64 change_attr; + struct timespec64 mtime, atime, ctime, btime; u64 time_warp_seq; + u64 truncate_size; + u32 truncate_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; + bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) { + if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); - kfree(capsnap); + kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } @@ -204,16 +315,28 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + struct dentry *dentry; struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; + struct list_head lease_list; + struct hlist_node hnode; + unsigned long flags; + int lease_shared_gen; + u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; + unsigned long time; u64 offset; }; +#define CEPH_DENTRY_REFERENCED (1 << 0) +#define CEPH_DENTRY_LEASE_LIST (1 << 1) +#define CEPH_DENTRY_SHRINK_LIST (1 << 2) +#define CEPH_DENTRY_PRIMARY_LINK (1 << 3) +#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) +#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) +#define CEPH_DENTRY_ASYNC_CREATE_BIT (5) +#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) + 
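The _BIT variants of the last two flags above exist so the flags word (the unsigned long flags in ceph_dentry_info) can be driven with the kernel's atomic bitops, while the mask forms suit plain bitwise tests. A minimal sketch of that pattern, using illustrative helper names that are not part of this patch:

	/* Illustrative only: mark and test an in-flight async unlink. */
	static void example_mark_async_unlink(struct dentry *dentry)
	{
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		set_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags);	/* atomic set */
	}

	static bool example_async_unlink_pending(struct dentry *dentry)
	{
		/* plain test via the mask form */
		return ceph_dentry(dentry)->flags & CEPH_DENTRY_ASYNC_UNLINK;
	}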
struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -236,28 +359,37 @@ struct ceph_inode_xattrs_info { * Ceph inode. */ struct ceph_inode_info { + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; u64 i_version; + u64 i_inline_version; u32 i_time_warp_seq; - unsigned i_ceph_flags; - atomic_t i_release_count; - atomic_t i_complete_count; + unsigned long i_ceph_flags; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + struct ceph_file_layout i_cached_layout; // for async creates char *i_symlink; /* for dirs */ - struct timespec i_rctime; - u64 i_rbytes, i_rfiles, i_rsubdirs; + struct timespec64 i_rctime; + u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ + + /* quotas */ + u64 i_max_bytes, i_max_files; + + s32 i_dir_pin; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; @@ -267,30 +399,52 @@ struct ceph_inode_info { struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ - struct list_head i_dirty_item, i_flushing_item; - u64 i_cap_flush_seq; + + /* + * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty + * is protected by the mdsc->cap_dirty_lock, but each individual item + * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty + * requires the mdsc->cap_dirty_lock. List presence for an item can + * be tested under the i_ceph_lock. Changing anything requires both. + */ + struct list_head i_dirty_item; + + /* + * Link to session's s_cap_flushing list. Protected in a similar + * fashion to i_dirty_item, but also by the s_mutex for changes. The + * s_cap_flushing list can be walked while holding either the s_mutex + * or mdsc->cap_dirty_lock. List presence can also be checked while + * holding the i_ceph_lock for this inode. + */ + struct list_head i_flushing_item; + /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ - u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + struct ceph_cap_flush *i_prealloc_cap_flush; + struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ - unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. 
*/ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ - int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + unsigned long i_last_rd; + unsigned long i_last_wr; + int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ + struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ + /* + * For the non-fscrypt case this equals i_truncate_size; with + * fscrypt it equals fscrypt_file_size + */ + u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ @@ -299,58 +453,88 @@ struct ceph_inode_info { /* held references to caps */ int i_pin_ref; - int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; - u32 i_shared_gen; /* increment each time we get FILE_SHARED */ + atomic_t i_filelock_ref; + atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ - struct list_head i_unsafe_writes; /* uncommitted sync writes */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ + struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ - int i_snap_realm_counter; /* snap realm (if caps) */ + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; + struct timespec64 i_btime; + struct timespec64 i_snap_btime; + + struct work_struct i_work; + unsigned long i_work_mask; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 *fscrypt_auth; + u8 *fscrypt_file; +#endif +}; - struct work_struct i_wb_work; /* writeback work */ - struct work_struct i_pg_inv_work; /* page invalidation work */ +struct ceph_netfs_request_data { + int caps; - struct work_struct i_vmtruncate_work; + /* + * Maximum size of a file readahead request. + * The fadvise could update the bdi's default ra_pages. 
+ */ + unsigned int file_ra_pages; - struct inode vfs_inode; /* at end */ + /* Set it if fadvise disables file readahead entirely */ + bool file_ra_disabled; }; -static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +static inline struct ceph_inode_info * +ceph_inode(const struct inode *inode) { - return container_of(inode, struct ceph_inode_info, vfs_inode); + return container_of(inode, struct ceph_inode_info, netfs.inode); } -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client * +ceph_inode_to_fs_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client * +ceph_sb_to_fs_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } -static inline struct ceph_vino ceph_vino(struct inode *inode) +static inline struct ceph_mds_client * +ceph_sb_to_mdsc(const struct super_block *sb) +{ + return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc; +} + +static inline struct ceph_client * +ceph_inode_to_client(const struct inode *inode) +{ + return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client; +} + +static inline struct ceph_vino +ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } -/* - * ino_t is <64 bits on many architectures, blech. - * - * i_ino (kernel inode) st_ino (userspace) - * i386 32 32 - * x86_64+ino32 64 32 - * x86_64 64 64 - */ -static inline u32 ceph_ino_to_ino32(__u64 vino) +static inline u32 ceph_ino_to_ino32(u64 vino) { u32 ino = vino & 0xffffffff; ino ^= vino >> 32; @@ -360,35 +544,18 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) } /* - * kernel i_ino value + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on + * some arches. We generally do not use this value inside the ceph driver, but + * we do want to set it to something, so that generic vfs code has an + * appropriate value for tracepoints and the like. */ -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) { -#if BITS_PER_LONG == 32 - return ceph_ino_to_ino32(vino.ino); -#else + if (sizeof(ino_t) == sizeof(u32)) + return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino; -#endif } -/* - * user-visible ino (stat, filldir) - */ -#if BITS_PER_LONG == 32 -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - return ino; -} -#else -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) -{ - if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) - ino = ceph_ino_to_ino32(ino); - return ino; -} -#endif - - /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -396,11 +563,34 @@ static inline u64 ceph_ino(struct inode *inode) { return ceph_inode(inode)->i_vino.ino; } + static inline u64 ceph_snap(struct inode *inode) { return ceph_inode(inode)->i_vino.snap; } +/** + * ceph_present_ino - format an inode number for presentation to userland + * @sb: superblock where the inode lives + * @ino: inode number to (possibly) convert + * + * If the user mounted with the ino32 option, then the 64-bit value needs + * to be converted to something that can fit inside 32 bits. Note that + * internal kernel code never uses this value, so this is entirely for + * userland consumption. 
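+ * + * For example, on an ino32 mount the 64-bit ino 0x100000002 is presented + * as 3: per ceph_ino_to_ino32() above, the low word 0x00000002 is XORed + * with the high word 0x00000001.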
+ */ +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) +{ + if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32))) + return ceph_ino_to_ino32(ino); + return ino; +} + +static inline u64 ceph_present_inode(struct inode *inode) +{ + return ceph_present_ino(inode->i_sb, ceph_ino(inode)); +} + static inline int ceph_ino_compare(struct inode *inode, void *data) { struct ceph_vino *pvino = (struct ceph_vino *)data; @@ -409,36 +599,138 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } +/* + * The MDS reserves a set of inodes for its own usage. These should never + * be accessible by clients, and so the MDS has no reason to ever hand these + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. + * + * These come from src/mds/mdstypes.h in the ceph sources. + */ +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 +#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) +#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) + +static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) +{ + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; +} + static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { - ino_t t = ceph_vino_to_ino(vino); - return ilookup5(sb, t, ceph_ino_compare, &vino); + if (ceph_vino_is_reserved(vino)) + return NULL; + + /* + * NB: The hashval will be run through the fs/inode.c hash function + * anyway, so there is no need to squash the inode number down to + * 32-bits first. Just use low-order bits on arches with 32-bit long. + */ + return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); } /* * Ceph inode. */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need to flush snaps */ +#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ +#define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */ +#define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT) +#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ +#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) +#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ +#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async + create finishes */ + +/* + * Masks of ceph inode work. 
+ */ +#define CEPH_I_WORK_WRITEBACK 0 +#define CEPH_I_WORK_INVALIDATE_PAGES 1 +#define CEPH_I_WORK_VMTRUNCATE 2 +#define CEPH_I_WORK_CHECK_CAPS 3 +#define CEPH_I_WORK_FLUSH_SNAPS 4 + +/* + * We set the ERROR_WRITE bit when we start seeing write errors on an inode + * and then clear it when they start succeeding. Note that we do a lockless + * check first, and only take the lock if it looks like it needs to be changed. + * The write submission code just takes this as a hint, so we're not too + * worried if a few slip through in either direction. + */ +static inline void ceph_set_error_write(struct ceph_inode_info *ci) +{ + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} + +static inline void ceph_clear_error_write(struct ceph_inode_info *ci) +{ + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; + spin_unlock(&ci->i_ceph_lock); + } +} static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count) + long long release_count, + long long ordered_count) { - atomic_set(&ci->i_complete_count, release_count); + /* + * Makes sure operations that set up the readdir cache (update page + * cache and i_size) are strongly ordered w.r.t. the following + * atomic64_set() operations. + */ + smp_mb(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - atomic_inc(&ci->i_release_count); + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { - return atomic_read(&ci->i_complete_count) == - atomic_read(&ci->i_release_count); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); +} + +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) @@ -446,11 +738,17 @@ static inline void ceph_dir_clear_complete(struct inode *inode) __ceph_dir_clear_complete(ceph_inode(inode)); } -static inline bool ceph_dir_is_complete(struct inode *inode) +static inline void ceph_dir_clear_ordered(struct inode *inode) { - return __ceph_dir_is_complete(ceph_inode(inode)); + __ceph_dir_clear_ordered(ceph_inode(inode)); } +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); + return ret; +} /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, @@ -464,16 +762,11 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -484,6 +777,8 @@ static inline bool 
__ceph_is_any_real_caps(struct ceph_inode_info *ci) extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int t); extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *cap); @@ -496,12 +791,12 @@ static inline int ceph_caps_issued(struct ceph_inode_info *ci) return issued; } -static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, - int touch) +static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, + int mask, int touch) { int r; spin_lock(&ci->i_ceph_lock); - r = __ceph_caps_issued_mask(ci, mask, touch); + r = __ceph_caps_issued_mask_metric(ci, mask, touch); spin_unlock(&ci->i_ceph_lock); return r; } @@ -510,37 +805,38 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); -extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); -extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); - -/* - * wanted, by virtue of open file modes AND cap refs (buffered/cached data) - */ -static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) { - int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (w & CEPH_CAP_FILE_BUFFER) - w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ - return w; + return ci->i_nr_by_mode[0]; } +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_wanted(struct ceph_inode_info *ci); /* what the mds thinks we want */ -extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); -extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt); +extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); -extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); +extern void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session); @@ -554,23 +850,79 @@ struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ + spinlock_t rw_contexts_lock; + struct list_head rw_contexts; + + u32 filp_gen; +}; + +struct ceph_dir_file_info { + struct ceph_file_info file_info; + /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; /* readdir: position within 
a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ - struct dentry *dentry; /* next dentry (for dcache readdir) */ - int dir_release_count; + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; +struct ceph_rw_context { + struct list_head list; + struct task_struct *thread; + int caps; +}; + +#define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ + struct ceph_rw_context _name = { \ + .thread = current, \ + .caps = _caps, \ + } + +static inline void ceph_add_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_add(&ctx->list, &cf->rw_contexts); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline void ceph_del_rw_context(struct ceph_file_info *cf, + struct ceph_rw_context *ctx) +{ + spin_lock(&cf->rw_contexts_lock); + list_del(&ctx->list); + spin_unlock(&cf->rw_contexts_lock); +} + +static inline struct ceph_rw_context* +ceph_find_rw_context(struct ceph_file_info *cf) +{ + struct ceph_rw_context *ctx, *found = NULL; + spin_lock(&cf->rw_contexts_lock); + list_for_each_entry(ctx, &cf->rw_contexts, list) { + if (ctx->thread == current) { + found = ctx; + break; + } + } + spin_unlock(&cf->rw_contexts_lock); + return found; +} +struct ceph_readdir_cache_control { + struct folio *folio; + struct dentry **dentries; + int index; +}; /* * A "snap realm" describes a subset of the file hierarchy sharing @@ -583,6 +935,7 @@ struct ceph_file_info { */ struct ceph_snap_realm { u64 ino; + struct inode *inode; atomic_t nref; struct rb_node node; @@ -603,6 +956,8 @@ struct ceph_snap_realm { struct list_head dirty_item; /* if realm needs new context */ + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; @@ -632,7 +987,7 @@ static inline int default_congestion_kb(void) * This allows larger machines to have larger/more transfers. 
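* e.g. with 4 KiB pages, a 4 GiB machine has 2^20 pages, int_sqrt() gives 1024, and (16*1024) KiB << 2 is 64 MiB.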
* Limit the default to 256M */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; @@ -640,7 +995,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); @@ -649,14 +1005,24 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, - void *p, void *e, bool deletion); + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); +void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); -extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); + +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); +void ceph_umount_begin(struct super_block *sb); + /* * a cap_snap is "pending" if it is still awaiting an in-progress @@ -665,160 +1031,335 @@ extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && - list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, - ci_item)->writing; + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; } /* inode.c */ +struct ceph_mds_reply_info_in; +struct ceph_mds_reply_dirfrag; +struct ceph_acl_sec_ctx; + extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); -extern void ceph_destroy_inode(struct inode *inode); +extern void ceph_evict_inode(struct inode *inode); +extern void ceph_free_inode(struct inode *inode); + +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx); extern struct inode *ceph_get_inode(struct super_block *sb, - struct ceph_vino vino); + struct ceph_vino vino, + struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); extern void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime); + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, + struct timespec64 *atime); +extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation 
*caps_reservation); extern int ceph_fill_trace(struct super_block *sb, - struct ceph_mds_request *req, - struct ceph_mds_session *session); + struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); -extern int ceph_inode_holds_cap(struct inode *inode, int mask); - -extern int ceph_inode_set_size(struct inode *inode, loff_t size); +extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); -extern void ceph_queue_vmtruncate(struct inode *inode); -extern void ceph_queue_invalidate(struct inode *inode); -extern void ceph_queue_writeback(struct inode *inode); +void ceph_queue_inode_work(struct inode *inode, int work_bit); -extern int ceph_do_getattr(struct inode *inode, int mask); -extern int ceph_permission(struct inode *inode, int mask); -extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); +static inline void ceph_queue_vmtruncate(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE); +} + +static inline void ceph_queue_invalidate(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES); +} + +static inline void ceph_queue_writeback(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK); +} + +static inline void ceph_queue_check_caps(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS); +} + +static inline void ceph_queue_flush_snaps(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); +} + +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} +extern int ceph_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); + +struct ceph_iattr { + struct ceph_fscrypt_auth *fscrypt_auth; +}; + +extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, + struct iattr *attr, struct ceph_iattr *cia); +extern int ceph_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +void ceph_inode_shutdown(struct inode *inode); + +static inline bool ceph_inode_is_shutdown(struct inode *inode) +{ + unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + int state = READ_ONCE(fsc->mount_state); + + return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; +} /* xattr.c */ -extern int ceph_setxattr(struct dentry *, const char *, const void *, - size_t, int); -extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); +int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); -extern int ceph_removexattr(struct dentry *, const char *); -extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); +extern struct ceph_buffer *__ceph_build_xattrs_blob(struct 
ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); -extern void __init ceph_xattr_init(void); -extern void ceph_xattr_exit(void); +extern const struct xattr_handler * const ceph_xattr_handlers[]; + +struct ceph_acl_sec_ctx { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + void *default_acl; + void *acl; +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + struct lsm_context lsmctx; +#endif +#ifdef CONFIG_FS_ENCRYPTION + struct ceph_fscrypt_auth *fscrypt_auth; +#endif + struct ceph_pagelist *pagelist; +}; + +#ifdef CONFIG_SECURITY +extern bool ceph_security_xattr_deadlock(struct inode *in); +extern bool ceph_security_xattr_wanted(struct inode *in); +#else +static inline bool ceph_security_xattr_deadlock(struct inode *in) +{ + return false; +} +static inline bool ceph_security_xattr_wanted(struct inode *in) +{ + return false; +} +#endif + +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} +#else +static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *ctx) +{ + return 0; +} +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ +} +#endif + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); + +/* acl.c */ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int, bool); +int ceph_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, struct posix_acl *acl, int type); +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acl_sec_ctx *as_ctx); +void ceph_init_inode_acls(struct inode *inode, + struct ceph_acl_sec_ctx *as_ctx); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + return 0; +} +static inline void ceph_init_inode_acls(struct inode *inode, + struct ceph_acl_sec_ctx *as_ctx) +{ +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned cap, unsigned seq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation); -extern void __ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ - spin_lock(&cap->ci->i_ceph_lock); - __ceph_remove_cap(cap); - spin_unlock(&cap->ci->i_ceph_lock); -} +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + bool queue_release); +extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); 
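The cap declarations just above imply a two-step install: storage is obtained first with ceph_get_cap(), optionally drawn from a ceph_cap_reservation, and ceph_add_cap() then consumes it through the new_cap pointer, so the install step itself never allocates. A hedged sketch of that pairing; the numeric arguments are placeholders for values a real caller decodes from the MDS message:

	/* Illustrative only: preallocate a cap, then install it. */
	static void example_install_cap(struct inode *inode,
					struct ceph_mds_session *session,
					struct ceph_mds_client *mdsc,
					struct ceph_cap_reservation *rsv)
	{
		struct ceph_cap *new_cap = ceph_get_cap(mdsc, rsv);

		/* cap_id/issued/wanted/cap/seq/realmino/flags are placeholders;
		 * ceph_add_cap() is assumed to NULL *new_cap when it keeps it. */
		ceph_add_cap(inode, session, 1, CEPH_CAP_PIN, CEPH_CAP_PIN,
			     CEPH_CAP_PIN, 1, 0, 0, &new_cap);

		if (new_cap)	/* not consumed: return it to the pool */
			ceph_put_cap(mdsc, new_cap);
	}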
+extern int ceph_is_any_caps(struct inode *inode); -extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, - u64 cap_id, u32 migrate_seq, u32 issue_seq); -extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci); +extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); +extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, + bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); -extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int again); -extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, - struct ceph_mds_session *session); -extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern void __ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession); +extern bool __ceph_should_report_size(struct ceph_inode_info *ci); +extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); +extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); - +extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc); +extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, + struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff); +extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, + int need, int want, loff_t endoff, int *got); +extern int ceph_get_caps(struct file *filp, int need, int want, + loff_t endoff, int *got); +extern int ceph_try_get_caps(struct inode *inode, + int need, int want, bool nonblock, int *got); /* for counting open files by mode */ -static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) -{ - ci->i_nr_by_mode[mode]++; -} -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode); /* addr.c */ extern const struct 
address_space_operations ceph_aops; -extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern const struct netfs_request_ops ceph_netfs_ops; +int ceph_mmap_prepare(struct vm_area_desc *desc); +extern int ceph_uninline_data(struct file *file); +extern int ceph_pool_perm_check(struct inode *inode, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); + +static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) +{ + if (ci->i_inline_version == CEPH_INLINE_NONE || + ci->i_inline_version == 1) /* initial version, no data */ + return false; + return true; +} /* file.c */ extern const struct file_operations ceph_file_fops; -extern const struct address_space_operations ceph_aops; +extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode, - int *opened); + struct file *file, unsigned flags, umode_t mode); +extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); /* dir.c */ extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; -extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, - ceph_snapdir_dentry_ops; +extern const struct inode_operations ceph_snapdir_iops; +extern const struct dentry_operations ceph_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); -extern int ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err); +extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -extern void ceph_dentry_lru_add(struct dentry *dn); -extern void ceph_dentry_lru_touch(struct dentry *dn); -extern void ceph_dentry_lru_del(struct dentry *dn); +extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); +extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); -extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); - -/* - * our d_ops vary depending on whether the inode is live, - * snapshotted (read-only), or a virtual ".snap" directory. 
- */
-int ceph_init_dentry(struct dentry *dentry);
-
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);

 /* ioctl.c */
 extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

 /* export.c */
 extern const struct export_operations ceph_export_ops;
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);

 /* locks.c */
+extern __init void ceph_flock_init(void);
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
 extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
@@ -829,10 +1370,81 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode,
 extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
				  struct ceph_pagelist *pagelist,
				  int num_fcntl_locks, int num_flock_locks);
-extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);

 /* debugfs.c */
-extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);

+/* quota.c */
+
+enum quota_get_realm {
+	QUOTA_GET_MAX_FILES,
+	QUOTA_GET_MAX_BYTES,
+	QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+				    enum quota_get_realm which)
+{
+	bool has_quota = false;
+
+	switch (which) {
+	case QUOTA_GET_MAX_BYTES:
+		has_quota = !!ci->i_max_bytes;
+		break;
+	case QUOTA_GET_MAX_FILES:
+		has_quota = !!ci->i_max_files;
+		break;
+	default:
+		has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+	}
+	return has_quota;
+}
+
+extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
+
+static inline void __ceph_update_quota(struct ceph_inode_info *ci,
+				       u64 max_bytes, u64 max_files)
+{
+	bool had_quota, has_quota;
+	had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
+	ci->i_max_bytes = max_bytes;
+	ci->i_max_files = max_files;
+	has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
+
+	if (had_quota != has_quota)
+		ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
+}
+
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+	int cnt = 0;
+
+	if (IS_ENCRYPTED(inode)) {
+		cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+		if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+			cnt = 0;
+	}
+
+	return cnt;
+}
+
+extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
+			      struct ceph_mds_session *session,
+			      struct ceph_msg *msg);
+extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
+extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
+extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
+					     loff_t newlen);
+extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
+						loff_t newlen);
+extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
+				     struct kstatfs *buf);
+extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
+
+bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session);
+void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc);
+bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc);
+void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc);

 #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/util.c b/fs/ceph/util.c
new file mode 100644
index 000000000000..2c34875675bf
--- /dev/null
+++ b/fs/ceph/util.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	__u32 su = layout->stripe_unit;
+	__u32 sc = layout->stripe_count;
+	__u32 os = layout->object_size;
+
+	/* stripe unit, object size must be non-zero, 64k increment */
+	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (os < su || os % su)
+		return 0;
+	/* stripe count must be non-zero */
+	if (!sc)
+		return 0;
+	return 1;
+}
+
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+				  struct ceph_file_layout_legacy *legacy)
+{
+	fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+	fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+	fl->object_size = le32_to_cpu(legacy->fl_object_size);
+	fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+	if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
+	    fl->stripe_count == 0 && fl->object_size == 0)
+		fl->pool_id = -1;
+}
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+				struct ceph_file_layout_legacy *legacy)
+{
+	legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+	legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+	legacy->fl_object_size = cpu_to_le32(fl->object_size);
+	if (fl->pool_id >= 0)
+		legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+	else
+		legacy->fl_pg_pool = 0;
+}
+
+int ceph_flags_to_mode(int flags)
+{
+	int mode;
+
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+
+	switch (flags & O_ACCMODE) {
+	case O_WRONLY:
+		mode = CEPH_FILE_MODE_WR;
+		break;
+	case O_RDONLY:
+		mode = CEPH_FILE_MODE_RD;
+		break;
+	case O_RDWR:
+	case O_ACCMODE: /* this is what the VFS does */
+		mode = CEPH_FILE_MODE_RDWR;
+		break;
+	}
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+	return mode;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+	int caps = CEPH_CAP_PIN;
+
+	if (mode & CEPH_FILE_MODE_RD)
+		caps |= CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	if (mode & CEPH_FILE_MODE_WR)
+		caps |= CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	if (mode & CEPH_FILE_MODE_LAZY)
+		caps |= CEPH_CAP_FILE_LAZYIO;
+
+	return caps;
+}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532a..ad1f30bea175 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,4 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/pagelist.h>

 #include "super.h"
 #include "mds_client.h"
@@ -6,16 +8,20 @@

 #include <linux/ceph/decode.h>
 #include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
 #include <linux/slab.h>

 #define XATTR_CEPH_PREFIX "ceph."
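[Note: ceph_flags_to_mode() and ceph_caps_for_mode() in fs/ceph/util.c above translate open(2) flags first into a CephFS file mode (pin/read/write/lazy) and then into the capability bits the client must hold from the MDS: read mode wants SHARED/RD/CACHE caps, write mode additionally wants EXCL/WR/BUFFER plus auth and xattr caps. A minimal userspace sketch of the flags-to-mode step follows; the FMODE_* constants are placeholders, not the kernel's real CEPH_FILE_MODE_* values, and only the mapping logic mirrors the diff. The xattr.c changes continue below.]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

/* Placeholder mode values; the logic, not the numbers, mirrors the diff. */
#define FMODE_PIN	0
#define FMODE_RD	1
#define FMODE_WR	2
#define FMODE_RDWR	3	/* RD | WR */

static int flags_to_mode(int flags)
{
	/* a pure readdir-style open only pins the inode */
	if ((flags & O_DIRECTORY) == O_DIRECTORY)
		return FMODE_PIN;

	switch (flags & O_ACCMODE) {
	case O_WRONLY:
		return FMODE_WR;
	case O_RDONLY:
		return FMODE_RD;
	case O_RDWR:
	case O_ACCMODE:	/* the VFS maps this to read/write */
	default:
		return FMODE_RDWR;
	}
}

int main(void)
{
	printf("O_RDONLY         -> mode %d\n", flags_to_mode(O_RDONLY));
	printf("O_WRONLY|O_CREAT -> mode %d\n", flags_to_mode(O_WRONLY | O_CREAT));
	printf("O_RDWR           -> mode %d\n", flags_to_mode(O_RDWR));
	printf("O_DIRECTORY      -> mode %d\n", flags_to_mode(O_DIRECTORY));
	return 0;
}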
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -27,169 +33,379 @@ static bool ceph_is_valid_xattr(const char *name) struct ceph_vxattr { char *name; size_t name_size; /* strlen(name) + 1 (for '\0') */ - size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, - size_t size); - bool readonly, hidden; + ssize_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, + size_t size); bool (*exists_cb)(struct ceph_inode_info *ci); + unsigned int flags; }; +#define VXATTR_FLAG_READONLY (1<<0) +#define VXATTR_FLAG_HIDDEN (1<<1) +#define VXATTR_FLAG_RSTAT (1<<2) +#define VXATTR_FLAG_DIRSTAT (1<<3) + /* layouts */ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) { - size_t s; - char *p = (char *)&ci->i_layout; - - for (s = 0; s < sizeof(ci->i_layout); s++, p++) - if (*p) - return true; - return false; + struct ceph_file_layout *fl = &ci->i_layout; + return (fl->stripe_unit > 0 || fl->stripe_count > 0 || + fl->object_size > 0 || fl->pool_id >= 0 || + rcu_dereference_raw(fl->pool_ns) != NULL); } -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) { - int ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); + struct ceph_client *cl = fsc->client; struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + struct ceph_string *pool_ns; + s64 pool = ci->i_layout.pool_id; const char *pool_name; + const char *ns_field = " pool_namespace="; + char buf[128]; + size_t len, total_len = 0; + ssize_t ret; - dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + + doutc(cl, "%p\n", &ci->netfs.inode); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - pool_name); - else - ret = snprintf(val, size, - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - (unsigned long long)pool); - - up_read(&osdc->map_sem); + if (pool_name) { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size); + total_len = len + strlen(pool_name); + } else { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", + 
ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size, pool); + total_len = len; + } + + if (pool_ns) + total_len += strlen(ns_field) + pool_ns->len; + + ret = total_len; + if (size >= total_len) { + memcpy(val, buf, len); + ret = len; + if (pool_name) { + len = strlen(pool_name); + memcpy(val + ret, pool_name, len); + ret += len; + } + if (pool_ns) { + len = strlen(ns_field); + memcpy(val + ret, ns_field, len); + ret += len; + memcpy(val + ret, pool_ns->str, pool_ns->len); + ret += pool_ns->len; + } + } + up_read(&osdc->lock); + ceph_put_string(pool_ns); return ret; } -static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, - char *val, size_t size) +/* + * The convention with strings in xattrs is that they should not be NULL + * terminated, since we're returning the length with them. snprintf always + * NULL terminates however, so call it on a temporary buffer and then memcpy + * the result into place. + */ +static __printf(3, 4) +int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout)); + int ret; + va_list args; + char buf[96]; /* NB: reevaluate size if new vxattrs are added */ + + va_start(args, fmt); + ret = vsnprintf(buf, size ? sizeof(buf) : 0, fmt, args); + va_end(args); + + /* Sanity check */ + if (size && ret + 1 > sizeof(buf)) { + WARN_ONCE(true, "Returned length too big (%d)", ret); + return -E2BIG; + } + + if (ret <= size) + memcpy(val, buf, ret); + return ret; } -static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, +static ssize_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_unit); } -static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.stripe_count); } -static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, - char *val, size_t size) +static ssize_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) { - int ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + return ceph_fmt_xattr(val, size, "%u", ci->i_layout.object_size); +} + +static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + ssize_t ret; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + s64 pool = ci->i_layout.pool_id; const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, "%s", pool_name); - else - ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + if (pool_name) { + ret = strlen(pool_name); + if (ret <= size) + memcpy(val, pool_name, ret); + } else { + ret = ceph_fmt_xattr(val, size, "%lld", pool); + } + up_read(&osdc->lock); + return ret; +} + +static ssize_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, + char 
*val, size_t size) +{ + ssize_t ret = 0; + struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); + + if (ns) { + ret = ns->len; + if (ret <= size) + memcpy(val, ns->str, ret); + ceph_put_string(ns); + } return ret; } /* directories */ -static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, +static ssize_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_files + ci->i_subdirs); +} + +static ssize_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_files); +} + +static ssize_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_subdirs); +} + +static ssize_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", + ci->i_rfiles + ci->i_rsubdirs); +} + +static ssize_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_files); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, +static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); +} + +static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", ci->i_subdirs); + return ceph_fmt_xattr(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); + return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_rctime); } -static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, - size_t size) +/* dir pin */ +static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) { - return snprintf(val, size, "%lld", ci->i_rfiles); + return ci->i_dir_pin != -ENODATA; } -static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) { - return snprintf(val, size, "%lld", ci->i_rsubdirs); + return ceph_fmt_xattr(val, size, "%d", (int)ci->i_dir_pin); } -static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, - size_t size) +/* quotas */ +static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { - return snprintf(val, size, "%lld", ci->i_rbytes); + bool ret = false; + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + ci->i_vino.snap == CEPH_NOSNAP && + ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino) + ret = true; + spin_unlock(&ci->i_ceph_lock); + return ret; } -static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, - size_t size) +static ssize_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, + 
size_t size) { - return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + return ceph_fmt_xattr(val, size, "max_bytes=%llu max_files=%llu", + ci->i_max_bytes, ci->i_max_files); } +static ssize_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_bytes); +} + +static ssize_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return ceph_fmt_xattr(val, size, "%llu", ci->i_max_files); +} + +/* snapshots */ +static bool ceph_vxattrcb_snap_btime_exists(struct ceph_inode_info *ci) +{ + return (ci->i_snap_btime.tv_sec != 0 || ci->i_snap_btime.tv_nsec != 0); +} + +static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%ptSp", &ci->i_snap_btime); +} + +static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); + + return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); +} + +static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); + + return ceph_fmt_xattr(val, size, "client%lld", + ceph_client_gid(fsc->client)); +} + +static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int issued; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + + return ceph_fmt_xattr(val, size, "%s/0x%x", + ceph_cap_string(issued), issued); +} + +static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = ceph_fmt_xattr(val, size, "%d", + ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1); + spin_unlock(&ci->i_ceph_lock); + return ret; +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci) +{ + return ci->fscrypt_auth_len; +} + +static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci, + char *val, size_t size) +{ + if (size) { + if (size < ci->fscrypt_auth_len) + return -ERANGE; + memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len); + } + return ci->fscrypt_auth_len; +} +#endif /* CONFIG_FS_ENCRYPTION */ #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." 
#_name2 -#define XATTR_NAME_CEPH(_type, _name) \ +#define XATTR_NAME_CEPH(_type, _name, _flags) \ { \ .name = CEPH_XATTR_NAME(_type, _name), \ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ - .readonly = true, \ - .hidden = false, \ - .exists_cb = NULL, \ + .exists_cb = NULL, \ + .flags = (VXATTR_FLAG_READONLY | _flags), \ + } +#define XATTR_RSTAT_FIELD(_type, _name) \ + XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ - .readonly = false, \ - .hidden = true, \ .exists_cb = ceph_vxattrcb_layout_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ + } +#define XATTR_QUOTA_FIELD(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = ceph_vxattrcb_quota_exists, \ + .flags = VXATTR_FLAG_HIDDEN, \ } static struct ceph_vxattr ceph_dir_vxattrs[] = { @@ -197,25 +413,55 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .name = "ceph.dir.layout", .name_size = sizeof("ceph.dir.layout"), .getxattr_cb = ceph_vxattrcb_layout, - .readonly = false, - .hidden = false, .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, }, XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), XATTR_LAYOUT_FIELD(dir, layout, stripe_count), XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), - XATTR_NAME_CEPH(dir, entries), - XATTR_NAME_CEPH(dir, files), - XATTR_NAME_CEPH(dir, subdirs), - XATTR_NAME_CEPH(dir, rentries), - XATTR_NAME_CEPH(dir, rfiles), - XATTR_NAME_CEPH(dir, rsubdirs), - XATTR_NAME_CEPH(dir, rbytes), - XATTR_NAME_CEPH(dir, rctime), + XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), + XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT), + XATTR_RSTAT_FIELD(dir, rentries), + XATTR_RSTAT_FIELD(dir, rfiles), + XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rsnaps), + XATTR_RSTAT_FIELD(dir, rbytes), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), + { + .name = "ceph.dir.pin", + .name_size = sizeof("ceph.dir.pin"), + .getxattr_cb = ceph_vxattrcb_dir_pin, + .exists_cb = ceph_vxattrcb_dir_pin_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + { + .name = "ceph.quota", + .name_size = sizeof("ceph.quota"), + .getxattr_cb = ceph_vxattrcb_quota, + .exists_cb = ceph_vxattrcb_quota_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, { .name = NULL, 0 } /* Required table terminator */ }; -static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ /* files 
*/ @@ -224,17 +470,64 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .name = "ceph.file.layout", .name_size = sizeof("ceph.file.layout"), .getxattr_cb = ceph_vxattrcb_layout, - .readonly = false, - .hidden = false, .exists_cb = ceph_vxattrcb_layout_exists, + .flags = VXATTR_FLAG_HIDDEN, }, XATTR_LAYOUT_FIELD(file, layout, stripe_unit), XATTR_LAYOUT_FIELD(file, layout, stripe_count), XATTR_LAYOUT_FIELD(file, layout, object_size), XATTR_LAYOUT_FIELD(file, layout, pool), + XATTR_LAYOUT_FIELD(file, layout, pool_namespace), + { + .name = "ceph.snap.btime", + .name_size = sizeof("ceph.snap.btime"), + .getxattr_cb = ceph_vxattrcb_snap_btime, + .exists_cb = ceph_vxattrcb_snap_btime_exists, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = NULL, 0 } /* Required table terminator */ +}; + +static struct ceph_vxattr ceph_common_vxattrs[] = { + { + .name = "ceph.cluster_fsid", + .name_size = sizeof("ceph.cluster_fsid"), + .getxattr_cb = ceph_vxattrcb_cluster_fsid, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.client_id", + .name_size = sizeof("ceph.client_id"), + .getxattr_cb = ceph_vxattrcb_client_id, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.auth_mds", + .name_size = sizeof("ceph.auth_mds"), + .getxattr_cb = ceph_vxattrcb_auth_mds, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + { + .name = "ceph.fscrypt.auth", + .name_size = sizeof("ceph.fscrypt.auth"), + .getxattr_cb = ceph_vxattrcb_fscrypt_auth, + .exists_cb = ceph_vxattrcb_fscrypt_auth_exists, + .flags = VXATTR_FLAG_READONLY, + }, +#endif /* CONFIG_FS_ENCRYPTION */ { .name = NULL, 0 } /* Required table terminator */ }; -static size_t ceph_file_vxattrs_name_size; /* total size of all names */ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { @@ -245,47 +538,6 @@ static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) -{ - if (vxattrs == ceph_dir_vxattrs) - return ceph_dir_vxattrs_name_size; - if (vxattrs == ceph_file_vxattrs) - return ceph_file_vxattrs_name_size; - BUG(); - - return 0; -} - -/* - * Compute the aggregate size (including terminating '\0') of all - * virtual extended attribute names in the given vxattr table. 
- */ -static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) -{ - struct ceph_vxattr *vxattr; - size_t size = 0; - - for (vxattr = vxattrs; vxattr->name; vxattr++) - if (!vxattr->hidden) - size += vxattr->name_size; - - return size; -} - -/* Routines called at initialization and exit time */ - -void __init ceph_xattr_init(void) -{ - ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); - ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); -} - -void ceph_xattr_exit(void) -{ - ceph_dir_vxattrs_name_size = 0; - ceph_file_vxattrs_name_size = 0; -} - static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { @@ -299,16 +551,26 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, } } + vxattr = ceph_common_vxattrs; + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + return NULL; } +#define MAX_XATTR_VAL_PRINT_LEN 256 + static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, - int dirty, - int should_free_name, int should_free_val, + int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; @@ -335,23 +597,45 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr = NULL; } + if (update_xattr) { + int err = 0; + + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + kfree(*newxattr); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + kfree(*newxattr); + return 0; + } + } + if (!xattr) { new = 1; xattr = *newxattr; xattr->name = name; xattr->name_len = name_len; - xattr->should_free_name = should_free_name; + xattr->should_free_name = update_xattr; ci->i_xattrs.count++; - dout("__set_xattr count=%d\n", ci->i_xattrs.count); + doutc(cl, "count=%d\n", ci->i_xattrs.count); } else { kfree(*newxattr); *newxattr = NULL; if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); - if (should_free_name) { - kfree((void *)name); + if (update_xattr) { + kfree(name); name = xattr->name; } ci->i_xattrs.names_size -= xattr->name_len; @@ -365,17 +649,19 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->val = ""; xattr->val_len = val_len; - xattr->dirty = dirty; - xattr->should_free_val = (val && should_free_val); + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); if (new) { rb_link_node(&xattr->node, parent, p); rb_insert_color(&xattr->node, &ci->i_xattrs.index); - dout("__set_xattr_val p=%p\n", p); + doutc(cl, "p=%p\n", p); } - dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val); + doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode, + ceph_vinop(inode), xattr, name_len, name, min(val_len, + MAX_XATTR_VAL_PRINT_LEN), val, + val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." 
: ""); return 0; } @@ -383,6 +669,7 @@ static int __set_xattr(struct ceph_inode_info *ci, static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, const char *name) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; @@ -401,13 +688,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, else if (c > 0) p = &(*p)->rb_right; else { - dout("__get_xattr %s: found %.*s\n", name, - xattr->val_len, xattr->val); + int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN); + + doutc(cl, "%s found %.*s%s\n", name, len, xattr->val, + xattr->val_len > len ? "..." : ""); return xattr; } } - dout("__get_xattr %s: not found\n", name); + doutc(cl, "%s not found\n", name); return NULL; } @@ -417,9 +706,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr) BUG_ON(!xattr); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); kfree(xattr); } @@ -428,14 +717,14 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr) { if (!xattr) - return -EOPNOTSUPP; + return -ENODATA; rb_erase(&xattr->node, &ci->i_xattrs.index); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; @@ -445,35 +734,23 @@ static int __remove_xattr(struct ceph_inode_info *ci, return 0; } -static int __remove_xattr_by_name(struct ceph_inode_info *ci, - const char *name) -{ - struct rb_node **p; - struct ceph_inode_xattr *xattr; - int err; - - p = &ci->i_xattrs.index.rb_node; - xattr = __get_xattr(ci, name); - err = __remove_xattr(ci, xattr); - return err; -} - static char *__copy_xattr_names(struct ceph_inode_info *ci, char *dest) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; p = rb_first(&ci->i_xattrs.index); - dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + doutc(cl, "count=%d\n", ci->i_xattrs.count); while (p) { xattr = rb_entry(p, struct ceph_inode_xattr, node); memcpy(dest, xattr->name, xattr->name_len); dest[xattr->name_len] = '\0'; - dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, - xattr->name_len, ci->i_xattrs.names_size); + doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); dest += xattr->name_len + 1; p = rb_next(p); @@ -484,19 +761,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci, void __ceph_destroy_xattrs(struct ceph_inode_info *ci) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node *p, *tmp; struct ceph_inode_xattr *xattr = NULL; p = rb_first(&ci->i_xattrs.index); - dout("__ceph_destroy_xattrs p=%p\n", p); + doutc(cl, "p=%p\n", p); while (p) { xattr = rb_entry(p, struct ceph_inode_xattr, node); tmp = p; p = rb_next(tmp); - dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, - xattr->name_len, xattr->name); + doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name); rb_erase(tmp, &ci->i_xattrs.index); __free_xattr(xattr); @@ -513,19 +790,20 @@ static int __build_xattrs(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { + struct ceph_client *cl = ceph_inode_to_client(inode); u32 namelen; u32 numattr = 0; void *p, 
*end; u32 len; const char *name, *val; struct ceph_inode_info *ci = ceph_inode(inode); - int xattr_version; + u64 xattr_version; struct ceph_inode_xattr **xattrs = NULL; int err = 0; int i; - dout("__build_xattrs() len=%d\n", - ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); + doutc(cl, "len=%d\n", + ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); if (ci->i_xattrs.index_version >= ci->i_xattrs.version) return 0; /* already built */ @@ -541,12 +819,12 @@ start: xattr_version = ci->i_xattrs.version; spin_unlock(&ci->i_ceph_lock); - xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), + xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *), GFP_NOFS); err = -ENOMEM; if (!xattrs) goto bad_lock; - memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); + for (i = 0; i < numattr; i++) { xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); @@ -574,7 +852,7 @@ start: p += len; err = __set_xattr(ci, name, namelen, val, len, - 0, 0, 0, &xattrs[numattr]); + 0, 0, &xattrs[numattr]); if (err < 0) goto bad; @@ -600,6 +878,8 @@ bad: static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, int val_size) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); + /* * 4 bytes for the length, and additional 4 bytes per each xattr name, * 4 bytes per each value @@ -607,9 +887,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, int size = 4 + ci->i_xattrs.count*(4 + 4) + ci->i_xattrs.names_size + ci->i_xattrs.vals_size; - dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", - ci->i_xattrs.count, ci->i_xattrs.names_size, - ci->i_xattrs.vals_size); + doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count, + ci->i_xattrs.names_size, ci->i_xattrs.vals_size); if (name_size) size += 4 + 4 + name_size + val_size; @@ -618,16 +897,21 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, } /* - * If there are dirty xattrs, reencode xattrs into the prealloc_blob - * and swap into place. + * If there are dirty xattrs, re-encode xattrs into the prealloc_blob + * and swap into place. It returns the old i_xattrs.blob (or NULL) so + * that it can be freed by the caller as the i_ceph_lock is likely to be + * held. 
*/ -void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) +struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; + struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); @@ -655,56 +939,103 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) dest - ci->i_xattrs.prealloc_blob->vec.iov_base; if (ci->i_xattrs.blob) - ceph_buffer_put(ci->i_xattrs.blob); + old_blob = ci->i_xattrs.blob; ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; ci->i_xattrs.version++; } + + return old_blob; +} + +static inline int __get_request_mask(struct inode *in) { + struct ceph_mds_request *req = current->journal_info; + int mask = 0; + if (req && req->r_target_inode == in) { + if (req->r_op == CEPH_MDS_OP_LOOKUP || + req->r_op == CEPH_MDS_OP_LOOKUPINO || + req->r_op == CEPH_MDS_OP_LOOKUPPARENT || + req->r_op == CEPH_MDS_OP_GETATTR) { + mask = le32_to_cpu(req->r_args.getattr.mask); + } else if (req->r_op == CEPH_MDS_OP_OPEN || + req->r_op == CEPH_MDS_OP_CREATE) { + mask = le32_to_cpu(req->r_args.open.mask); + } + } + return mask; } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr *vxattr = NULL; - - if (!ceph_is_valid_xattr(name)) - return -ENODATA; + struct ceph_vxattr *vxattr; + int req_mask; + ssize_t err; + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto handle_non_vxattrs; /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); - if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { - err = vxattr->getxattr_cb(ci, value, size); + if (vxattr) { + int mask = 0; + if (vxattr->flags & VXATTR_FLAG_RSTAT) + mask |= CEPH_STAT_RSTAT; + if (vxattr->flags & VXATTR_FLAG_DIRSTAT) + mask |= CEPH_CAP_FILE_SHARED; + err = ceph_do_getattr(inode, mask, true); + if (err) + return err; + err = -ENODATA; + if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + if (size && size < err) + err = -ERANGE; + } + return err; + } else { + err = ceph_do_getvxattr(inode, name, value, size); + /* this would happen with a new client and old server combo */ + if (err == -EOPNOTSUPP) + err = -ENODATA; return err; } +handle_non_vxattrs: + req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); + doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode, + ceph_vinop(inode), name, ci->i_xattrs.version, + ci->i_xattrs.index_version); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { - goto get_xattr; - } else { + if (ci->i_xattrs.version == 0 || + !((req_mask & CEPH_CAP_XATTR_SHARED) || + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); + + /* security 
module gets xattr while filling trace */ + if (current->journal_info) { + pr_warn_ratelimited_client(cl, + "sync %p %llx.%llx during filling trace\n", + inode, ceph_vinop(inode)); + return -EBUSY; + } + /* get xattrs from mds (if we don't already have them) */ - err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) return err; + spin_lock(&ci->i_ceph_lock); } - spin_lock(&ci->i_ceph_lock); - err = __build_xattrs(inode); if (err < 0) goto out; -get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); if (!xattr) @@ -720,6 +1051,10 @@ get_xattr: memcpy(value, xattr->val, xattr->val_len); + if (current->journal_info && + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) + ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); return err; @@ -727,167 +1062,147 @@ out: ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); - u32 vir_namelen = 0; + bool len_only = (size == 0); u32 namelen; int err; - u32 len; - int i; spin_lock(&ci->i_ceph_lock); - dout("listxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); + doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode, + ceph_vinop(inode), ci->i_xattrs.version, + ci->i_xattrs.index_version); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { - goto list_xattr; - } else { + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); - err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) return err; + spin_lock(&ci->i_ceph_lock); } - spin_lock(&ci->i_ceph_lock); - err = __build_xattrs(inode); if (err < 0) goto out; -list_xattr: - /* - * Start with virtual dir xattr names (if any) (including - * terminating '\0' characters for each). 
- */ - vir_namelen = ceph_vxattrs_name_size(vxattrs); - - /* adding 1 byte per each variable due to the null termination */ + /* add 1 byte for each xattr due to the null termination */ namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; - err = -ERANGE; - if (size && vir_namelen + namelen > size) - goto out; - - err = namelen + vir_namelen; - if (size == 0) - goto out; - - names = __copy_xattr_names(ci, names); - - /* virtual xattr names, too */ - err = namelen; - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - if (!vxattrs[i].hidden && - !(vxattrs[i].exists_cb && - !vxattrs[i].exists_cb(ci))) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; - err += len + 1; - } + if (!len_only) { + if (namelen > size) { + err = -ERANGE; + goto out; } + names = __copy_xattr_names(ci, names); + size -= namelen; } - + err = namelen; out: spin_unlock(&ci->i_ceph_lock); return err; } -static int ceph_sync_setxattr(struct dentry *dentry, const char *name, +static int ceph_sync_setxattr(struct inode *inode, const char *name, const char *value, size_t size, int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; - int i, nr_pages; - struct page **pages = NULL; - void *kaddr; - - /* copy value into some pages */ - nr_pages = calc_pages_for(0, size); - if (nr_pages) { - pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); - if (!pages) + + if (size > 0) { + /* copy value into pagelist */ + pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!pagelist) return -ENOMEM; - err = -ENOMEM; - for (i = 0; i < nr_pages; i++) { - pages[i] = __page_cache_alloc(GFP_NOFS); - if (!pages[i]) { - nr_pages = i; - goto out; - } - kaddr = kmap(pages[i]); - memcpy(kaddr, value + i*PAGE_CACHE_SIZE, - min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); - } + + err = ceph_pagelist_append(pagelist, value, size); + if (err) + goto out; + } else if (!value) { + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } - dout("setxattr value=%.*s\n", (int)size, value); + doutc(cl, "name %s value size %zu\n", name, size); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } + + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } + + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_args.setxattr.osdmap_epoch = + cpu_to_le32(osdc->osdmap->epoch); + req->r_pagelist = pagelist; + pagelist = NULL; + } + req->r_inode = inode; ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; - req->r_args.setxattr.flags = cpu_to_le32(flags); - req->r_path2 = kstrdup(name, GFP_NOFS); - - req->r_pages = pages; - req->r_num_pages = nr_pages; - req->r_data_len = size; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = 
ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); - dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version); out: - if (pages) { - for (i = 0; i < nr_pages; i++) - __free_page(pages[i]); - kfree(pages); - } + if (pagelist) + ceph_pagelist_release(pagelist); return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; + struct ceph_buffer *old_blob = NULL; int issued; int err; - int dirty; + int dirty = 0; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool check_realm = false; + bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - if (!ceph_is_valid_xattr(name)) - return -EOPNOTSUPP; - vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; + if (vxattr) { + if (vxattr->flags & VXATTR_FLAG_READONLY) + return -EOPNOTSUPP; + if (value && !strncmp(vxattr->name, "ceph.quota", 10)) + check_realm = true; + } /* pass any unhandled ceph.* xattrs through to the MDS */ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) @@ -909,147 +1224,245 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (!(issued & CEPH_CAP_XATTR_EXCL)) + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || + (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { + doutc(cl, "sync version: %llu size: %d max: %llu\n", + ci->i_xattrs.version, required_blob_size, + mdsc->mdsmap->m_max_xattr_size); goto do_sync; - __build_xattrs(inode); + } - required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode, + ceph_vinop(inode), name, ceph_cap_string(issued)); + __build_xattrs(inode); if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); - dout(" preaallocating new blob size=%d\n", required_blob_size); + ceph_buffer_put(old_blob); /* Shouldn't be required */ + doutc(cl, " pre-allocating new blob size=%d\n", + required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); + /* prealloc_blob can't be released while holding i_ceph_lock */ if 
(ci->i_xattrs.prealloc_blob) - ceph_buffer_put(ci->i_xattrs.prealloc_blob); + old_blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = blob; goto retry; } - err = __set_xattr(ci, newname, name_len, newval, - val_len, 1, 1, 1, &xattr); + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); + ci->i_xattrs.dirty = true; + inode_set_ctime_current(inode); + } spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: - err = ceph_sync_setxattr(dentry, name, value, size, flags); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + + /* security module set xattr while filling trace */ + if (current->journal_info) { + pr_warn_ratelimited_client(cl, + "sync %p %llx.%llx during filling trace\n", + inode, ceph_vinop(inode)); + err = -EBUSY; + } else { + err = ceph_sync_setxattr(inode, name, value, size, flags); + if (err >= 0 && check_realm) { + /* check if snaprealm was created for quota inode */ + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + !(ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino)) + err = -EOPNOTSUPP; + spin_unlock(&ci->i_ceph_lock); + } + } out: + ceph_free_cap_flush(prealloc_cf); kfree(newname); kfree(newval); kfree(xattr); return err; } -static int ceph_send_removexattr(struct dentry *dentry, const char *name) +static int ceph_get_xattr_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; - struct inode *parent_inode; - struct ceph_mds_request *req; - int err; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR, - USE_AUTH_MDS); - if (IS_ERR(req)) - return PTR_ERR(req); - req->r_inode = inode; - ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - req->r_num_caps = 1; - req->r_path2 = kstrdup(name, GFP_NOFS); - - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); - ceph_mdsc_put_request(req); - return err; + if (!ceph_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __ceph_getxattr(inode, name, value, size); } -int ceph_removexattr(struct dentry *dentry, const char *name) +static int ceph_set_xattr_handler(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = dentry->d_inode; - struct ceph_vxattr *vxattr; - struct ceph_inode_info *ci = ceph_inode(inode); - int issued; - int err; - int required_blob_size; - int dirty; - - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; + return __ceph_setxattr(inode, name, value, size, flags); +} - vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; +static const struct xattr_handler ceph_other_xattr_handler = { + .prefix = "", /* match any name => handlers called with 
full name */ + .get = ceph_get_xattr_handler, + .set = ceph_set_xattr_handler, +}; - /* pass any unhandled ceph.* xattrs through to the MDS */ - if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) - goto do_sync_unlocked; +#ifdef CONFIG_SECURITY +bool ceph_security_xattr_wanted(struct inode *in) +{ + return in->i_security != NULL; +} - err = -ENOMEM; +bool ceph_security_xattr_deadlock(struct inode *in) +{ + struct ceph_inode_info *ci; + bool ret; + if (!in->i_security) + return false; + ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); -retry: - issued = __ceph_caps_issued(ci, NULL); - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - - if (!(issued & CEPH_CAP_XATTR_EXCL)) - goto do_sync; - __build_xattrs(inode); + ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) && + !(ci->i_xattrs.version > 0 && + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)); + spin_unlock(&ci->i_ceph_lock); + return ret; +} - required_blob_size = __get_required_blob_size(ci, 0, 0); +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL +int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, + struct ceph_acl_sec_ctx *as_ctx) +{ + struct ceph_pagelist *pagelist = as_ctx->pagelist; + const char *name; + size_t name_len; + int err; - if (!ci->i_xattrs.prealloc_blob || - required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob; + err = security_dentry_init_security(dentry, mode, &dentry->d_name, + &name, &as_ctx->lsmctx); + if (err < 0) { + WARN_ON_ONCE(err != -EOPNOTSUPP); + err = 0; /* do nothing */ + goto out; + } - spin_unlock(&ci->i_ceph_lock); - dout(" preaallocating new blob size=%d\n", required_blob_size); - blob = ceph_buffer_new(required_blob_size, GFP_NOFS); - if (!blob) + err = -ENOMEM; + if (!pagelist) { + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) goto out; - spin_lock(&ci->i_ceph_lock); - if (ci->i_xattrs.prealloc_blob) - ceph_buffer_put(ci->i_xattrs.prealloc_blob); - ci->i_xattrs.prealloc_blob = blob; - goto retry; + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out; + ceph_pagelist_encode_32(pagelist, 1); } - err = __remove_xattr_by_name(ceph_inode(inode), name); + /* + * FIXME: Make security_dentry_init_security() generic. Currently + * It only supports single security module and only selinux has + * dentry_init_security hook. 
+ */ + name_len = strlen(name); + err = ceph_pagelist_reserve(pagelist, + 4 * 2 + name_len + as_ctx->lsmctx.len); + if (err) + goto out; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; - spin_unlock(&ci->i_ceph_lock); - if (dirty) - __mark_inode_dirty(inode, dirty); - return err; -do_sync: - spin_unlock(&ci->i_ceph_lock); -do_sync_unlocked: - err = ceph_send_removexattr(dentry, name); + if (as_ctx->pagelist) { + /* update count of KV pairs */ + BUG_ON(pagelist->length <= sizeof(__le32)); + if (list_is_singular(&pagelist->head)) { + le32_add_cpu((__le32*)pagelist->mapped_tail, 1); + } else { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + void *addr = kmap_atomic(page); + le32_add_cpu((__le32*)addr, 1); + kunmap_atomic(addr); + } + } else { + as_ctx->pagelist = pagelist; + } + + ceph_pagelist_encode_32(pagelist, name_len); + ceph_pagelist_append(pagelist, name, name_len); + + ceph_pagelist_encode_32(pagelist, as_ctx->lsmctx.len); + ceph_pagelist_append(pagelist, as_ctx->lsmctx.context, + as_ctx->lsmctx.len); + + err = 0; out: + if (pagelist && !as_ctx->pagelist) + ceph_pagelist_release(pagelist); return err; } +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ + +void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) +{ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + posix_acl_release(as_ctx->acl); + posix_acl_release(as_ctx->default_acl); +#endif +#ifdef CONFIG_CEPH_FS_SECURITY_LABEL + security_release_secctx(&as_ctx->lsmctx); +#endif +#ifdef CONFIG_FS_ENCRYPTION + kfree(as_ctx->fscrypt_auth); +#endif + if (as_ctx->pagelist) + ceph_pagelist_release(as_ctx->pagelist); +} +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler * const ceph_xattr_handlers[] = { + &ceph_other_xattr_handler, + NULL, +}; |
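[Note: the old_blob return value added to __ceph_build_xattrs_blob() above exists so the caller can drop the displaced buffer after releasing i_ceph_lock, since freeing a ceph_buffer while holding the spinlock is unsafe. A stand-in sketch of that detach-under-lock, free-after-unlock pattern using pthreads; the blob/obj types here are hypothetical illustrations, not the kernel's ceph_buffer, and the refcounting is simplified for a single-threaded demo.]

#include <pthread.h>
#include <stdlib.h>

struct blob {
	int refcount;		/* simplified: not atomic, demo only */
	char *data;
};

struct obj {
	pthread_mutex_t lock;
	struct blob *blob;	/* protected by lock */
};

static void blob_put(struct blob *b)
{
	/* may do heavy work; must never run under obj->lock */
	if (b && --b->refcount == 0) {
		free(b->data);
		free(b);
	}
}

/* swap in a new blob; return the old one for the caller to drop */
static struct blob *obj_swap_blob(struct obj *o, struct blob *newb)
{
	struct blob *old;

	pthread_mutex_lock(&o->lock);
	old = o->blob;		/* detach under the lock... */
	o->blob = newb;
	pthread_mutex_unlock(&o->lock);

	return old;		/* ...and let the caller free it afterwards */
}

int main(void)
{
	struct obj o = { PTHREAD_MUTEX_INITIALIZER, NULL };

	blob_put(obj_swap_blob(&o, NULL));	/* caller drops the old blob */
	return 0;
}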