summaryrefslogtreecommitdiff
path: root/fs/ceph/caps.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/caps.c')
-rw-r--r--fs/ceph/caps.c283
1 files changed, 221 insertions, 62 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 157fe59fbabe..23dbfae16156 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
@@ -153,13 +154,19 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_reserve_caps(struct ceph_mds_client *mdsc,
+/*
+ * Called under mdsc->mutex.
+ */
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
- int i;
+ int i, j;
struct ceph_cap *cap;
int have;
int alloc = 0;
+ int max_caps;
+ bool trimmed = false;
+ struct ceph_mds_session *s;
LIST_HEAD(newcaps);
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -177,17 +184,56 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc,
mdsc->caps_avail_count);
spin_unlock(&mdsc->caps_list_lock);
- for (i = have; i < need; i++) {
+ for (i = have; i < need; ) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap)
- break;
- list_add(&cap->caps_item, &newcaps);
- alloc++;
- }
- /* we didn't manage to reserve as much as we needed */
- if (have + alloc != need)
+ if (cap) {
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ i++;
+ continue;
+ }
+
+ if (!trimmed) {
+ for (j = 0; j < mdsc->max_sessions; j++) {
+ s = __ceph_lookup_mds_session(mdsc, j);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ max_caps = s->s_nr_caps - (need - i);
+ ceph_trim_caps(mdsc, s, max_caps);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ trimmed = true;
+
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ int more_have;
+ if (mdsc->caps_avail_count >= need - i)
+ more_have = need - i;
+ else
+ more_have = mdsc->caps_avail_count;
+
+ i += more_have;
+ have += more_have;
+ mdsc->caps_avail_count -= more_have;
+ mdsc->caps_reserve_count += more_have;
+
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
+ continue;
+ }
+
pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
ctx, need, have + alloc);
+ goto out_nomem;
+ }
+ BUG_ON(have + alloc != need);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -203,17 +249,61 @@ void ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ return 0;
+
+out_nomem:
+
+ spin_lock(&mdsc->caps_list_lock);
+ mdsc->caps_avail_count += have;
+ mdsc->caps_reserve_count -= have;
+
+ while (!list_empty(&newcaps)) {
+ cap = list_first_entry(&newcaps,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ /* Keep some preallocated caps around (ceph_min_count), to
+ * avoid lots of free/alloc churn. */
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ mdsc->caps_avail_count++;
+ mdsc->caps_total_count++;
+ list_add(&cap->caps_item, &mdsc->caps_list);
+ }
+ }
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count +
+ mdsc->caps_avail_count);
+ spin_unlock(&mdsc->caps_list_lock);
+ return -ENOMEM;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ int i;
+ struct ceph_cap *cap;
+
dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
if (ctx->count) {
spin_lock(&mdsc->caps_list_lock);
BUG_ON(mdsc->caps_reserve_count < ctx->count);
mdsc->caps_reserve_count -= ctx->count;
- mdsc->caps_avail_count += ctx->count;
+ if (mdsc->caps_avail_count >=
+ mdsc->caps_reserve_count + mdsc->caps_min_count) {
+ mdsc->caps_total_count -= ctx->count;
+ for (i = 0; i < ctx->count; i++) {
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ } else {
+ mdsc->caps_avail_count += ctx->count;
+ }
ctx->count = 0;
dout("unreserve caps %d = %d used + %d resv + %d avail\n",
mdsc->caps_total_count, mdsc->caps_use_count,
@@ -239,7 +329,23 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
mdsc->caps_use_count++;
mdsc->caps_total_count++;
spin_unlock(&mdsc->caps_list_lock);
+ } else {
+ spin_lock(&mdsc->caps_list_lock);
+ if (mdsc->caps_avail_count) {
+ BUG_ON(list_empty(&mdsc->caps_list));
+
+ mdsc->caps_avail_count--;
+ mdsc->caps_use_count++;
+ cap = list_first_entry(&mdsc->caps_list,
+ struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
+ mdsc->caps_reserve_count + mdsc->caps_avail_count);
+ }
+ spin_unlock(&mdsc->caps_list_lock);
}
+
return cap;
}
@@ -295,6 +401,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
{
struct ceph_mds_client *mdsc = fsc->mdsc;
+ spin_lock(&mdsc->caps_list_lock);
+
if (total)
*total = mdsc->caps_total_count;
if (avail)
@@ -305,6 +413,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc,
*reserved = mdsc->caps_reserve_count;
if (min)
*min = mdsc->caps_min_count;
+
+ spin_unlock(&mdsc->caps_list_lock);
}
/*
@@ -497,7 +607,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
- ci->i_shared_gen++;
+ atomic_inc(&ci->i_shared_gen);
if (S_ISDIR(ci->vfs_inode.i_mode)) {
dout(" marking %p NOT complete\n", &ci->vfs_inode);
__ceph_dir_clear_complete(ci);
@@ -576,18 +686,32 @@ void ceph_add_cap(struct inode *inode,
}
}
- if (!ci->i_snap_realm) {
+ if (!ci->i_snap_realm ||
+ ((flags & CEPH_CAP_FLAG_AUTH) &&
+ realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
/*
* add this inode to the appropriate snap realm
*/
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
if (realm) {
+ struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
+ if (oldrealm) {
+ spin_lock(&oldrealm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&oldrealm->inodes_with_caps_lock);
+ }
+
spin_lock(&realm->inodes_with_caps_lock);
- ci->i_snap_realm = realm;
list_add(&ci->i_snap_realm_item,
&realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ if (realm->ino == ci->i_vino.ino)
+ realm->inode = inode;
spin_unlock(&realm->inodes_with_caps_lock);
+
+ if (oldrealm)
+ ceph_put_snap_realm(mdsc, oldrealm);
} else {
pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
realmino);
@@ -889,6 +1013,11 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
/*
* called under i_ceph_lock
*/
+static int __ceph_is_single_caps(struct ceph_inode_info *ci)
+{
+ return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
+}
+
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
return !RB_EMPTY_ROOT(&ci->i_caps);
@@ -1159,7 +1288,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
struct cap_msg_args arg;
- int held, revoking, dropping;
+ int held, revoking;
int wake = 0;
int delayed = 0;
int ret;
@@ -1167,7 +1296,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dropping = cap->issued & ~retain;
dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
inode, cap, cap->session,
@@ -1703,21 +1831,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int delayed = 0, sent = 0, num;
- bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0;
+ bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
- bool force_requeue = false;
bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
- is_delayed = 1;
+ no_delay = true;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
+ if (!(flags & CHECK_CAPS_AUTHONLY) ||
+ (ci->i_auth_cap && __ceph_is_single_caps(ci)))
+ __cap_delay_cancel(mdsc, ci);
+
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1772,7 +1903,7 @@ retry_locked:
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
- if ((!is_delayed || mdsc->stopping) &&
+ if ((!no_delay || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
@@ -1781,27 +1912,16 @@ retry_locked:
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
- if (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO)) {
- dout("check_caps queuing invalidate\n");
- queue_invalidate = true;
- ci->i_rdcache_revoking = ci->i_rdcache_gen;
- } else {
- dout("check_caps failed to invalidate pages\n");
- /* we failed to invalidate pages. check these
- caps again later. */
- force_requeue = true;
- __cap_set_timeouts(mdsc, ci);
- }
+ dout("check_caps queuing invalidate\n");
+ queue_invalidate = true;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
tried_invalidate = true;
goto retry_locked;
}
- num = 0;
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- num++;
/* avoid looping forever */
if (mds >= cap->mds ||
@@ -1864,7 +1984,7 @@ retry_locked:
cap->mds_wanted == want)
continue; /* nope, all good */
- if (is_delayed)
+ if (no_delay)
goto ack;
/* delay? */
@@ -1955,15 +2075,8 @@ ack:
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
- /*
- * Reschedule delayed caps release if we delayed anything,
- * otherwise cancel.
- */
- if (delayed && is_delayed)
- force_requeue = true; /* __send_cap delayed release; requeue */
- if (!delayed && !is_delayed)
- __cap_delay_cancel(mdsc, ci);
- else if (!is_delayed || force_requeue)
+ /* Reschedule delayed caps release if we delayed anything */
+ if (delayed)
__cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
@@ -1991,6 +2104,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ spin_unlock(&ci->i_ceph_lock);
dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
goto out;
}
@@ -2008,8 +2122,10 @@ retry:
mutex_lock(&session->s_mutex);
goto retry;
}
- if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
+ spin_unlock(&ci->i_ceph_lock);
goto out;
+ }
flushing = __mark_caps_flushing(inode, session, true,
&flush_tid, &oldest_flush_tid);
@@ -2157,7 +2273,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
u64 flush_tid;
int err = 0;
int dirty;
- int wait = wbc->sync_mode == WB_SYNC_ALL;
+ int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
dout("write_inode %p wait=%d\n", inode, wait);
if (wait) {
@@ -3185,8 +3301,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
bool drop = false;
- bool wake_ci = 0;
- bool wake_mdsc = 0;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
@@ -3423,7 +3539,14 @@ retry:
*/
issued = cap->issued;
- WARN_ON(issued != cap->implemented);
+ if (issued != cap->implemented)
+ pr_err_ratelimited("handle_cap_export: issued != implemented: "
+ "ino (%llx.%llx) mds%d seq %d mseq %d "
+ "issued %s implemented %s\n",
+ ceph_vinop(inode), mds, cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
+
tcap = __get_cap_for_mds(ci, target);
if (tcap) {
@@ -3569,12 +3692,13 @@ retry:
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
(ocap->seq != le32_to_cpu(ph->seq) ||
ocap->mseq != le32_to_cpu(ph->mseq))) {
- pr_err("handle_cap_import: mismatched seq/mseq: "
- "ino (%llx.%llx) mds%d seq %d mseq %d "
- "importer mds%d has peer seq %d mseq %d\n",
- ceph_vinop(inode), peer, ocap->seq,
- ocap->mseq, mds, le32_to_cpu(ph->seq),
- le32_to_cpu(ph->mseq));
+ pr_err_ratelimited("handle_cap_import: "
+ "mismatched seq/mseq: ino (%llx.%llx) "
+ "mds%d seq %d mseq %d importer mds%d "
+ "has peer seq %d mseq %d\n",
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
+ le32_to_cpu(ph->mseq));
}
__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
@@ -3907,6 +4031,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
}
/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+int ceph_drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ if (__ceph_caps_dirty(ci)) {
+ struct ceph_mds_client *mdsc =
+ ceph_inode_to_client(inode)->mdsc;
+ __cap_delay_requeue_front(mdsc, ci);
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ return drop;
+}
+
+/*
* Helpers for embedding cap and dentry lease releases into mds
* requests.
*
@@ -3936,11 +4086,20 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
cap = __get_cap_for_mds(ci, mds);
if (cap && __cap_is_valid(cap)) {
- if (force ||
- ((cap->issued & drop) &&
- (cap->issued & unless) == 0)) {
- if ((cap->issued & drop) &&
- (cap->issued & unless) == 0) {
+ unless &= cap->issued;
+ if (unless) {
+ if (unless & CEPH_CAP_AUTH_EXCL)
+ drop &= ~CEPH_CAP_AUTH_SHARED;
+ if (unless & CEPH_CAP_LINK_EXCL)
+ drop &= ~CEPH_CAP_LINK_SHARED;
+ if (unless & CEPH_CAP_XATTR_EXCL)
+ drop &= ~CEPH_CAP_XATTR_SHARED;
+ if (unless & CEPH_CAP_FILE_EXCL)
+ drop &= ~CEPH_CAP_FILE_SHARED;
+ }
+
+ if (force || (cap->issued & drop)) {
+ if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
wanted |= cap->mds_wanted;
@@ -3972,7 +4131,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
*p += sizeof(*rel);
ret = 1;
} else {
- dout("encode_inode_release %p cap %p %s\n",
+ dout("encode_inode_release %p cap %p %s (noop)\n",
inode, cap, ceph_cap_string(cap->issued));
}
}