summaryrefslogtreecommitdiff
path: root/fs/ceph/caps.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/caps.c')
-rw-r--r--fs/ceph/caps.c213
1 files changed, 129 insertions, 84 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9c02f328c966..a8d8b56cf9d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/iversion.h>
#include <linux/filelock.h>
+#include <linux/jiffies.h>
#include "super.h"
#include "mds_client.h"
@@ -977,20 +978,6 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
return 0;
}
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
-{
- struct inode *inode = &ci->netfs.inode;
- struct ceph_client *cl = ceph_inode_to_client(inode);
- int ret;
-
- spin_lock(&ci->i_ceph_lock);
- ret = __ceph_caps_revoking_other(ci, NULL, mask);
- spin_unlock(&ci->i_ceph_lock);
- doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
- ceph_cap_string(mask), ret);
- return ret;
-}
-
int __ceph_caps_used(struct ceph_inode_info *ci)
{
int used = 0;
@@ -1452,7 +1439,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
if (flushing & CEPH_CAP_XATTR_EXCL) {
arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
arg->xattr_version = ci->i_xattrs.version;
- arg->xattr_buf = ci->i_xattrs.blob;
+ arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
} else {
arg->xattr_buf = NULL;
arg->old_xattr_buf = NULL;
@@ -1553,6 +1540,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
encode_cap_msg(msg, arg);
ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
+ ceph_buffer_put(arg->xattr_buf);
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -2015,6 +2003,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
+ * CHECK_CAPS_FLUSH_FORCE - we should flush any caps immediately, without
+ * further delay.
*/
void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
@@ -2096,7 +2086,7 @@ retry:
}
doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
- "flushing %s issued %s revoking %s retain %s %s%s%s\n",
+ "flushing %s issued %s revoking %s retain %s %s%s%s%s\n",
inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
@@ -2104,7 +2094,8 @@ retry:
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
- (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "",
+ (flags & CHECK_CAPS_FLUSH_FORCE) ? " FLUSH_FORCE" : "");
/*
* If we no longer need to hold onto old our caps, and we may
@@ -2155,6 +2146,35 @@ retry:
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
+ if (flags & CHECK_CAPS_FLUSH_FORCE) {
+ doutc(cl, "force to flush caps\n");
+ goto ack;
+ }
+
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
@@ -2182,30 +2202,6 @@ retry:
}
}
- /* completed revocation? going down and there are no caps? */
- if (revoking) {
- if ((revoking & cap_used) == 0) {
- doutc(cl, "completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /*
- * If the "i_wrbuffer_ref" was increased by mmap or generic
- * cache write just before the ceph_check_caps() is called,
- * the Fb capability revoking will fail this time. Then we
- * must wait for the BDI's delayed work to flush the dirty
- * pages and to release the "i_wrbuffer_ref", which will cost
- * at most 5 seconds. That means the MDS needs to wait at
- * most 5 seconds to finished the Fb capability's revocation.
- *
- * Let's queue a writeback for it.
- */
- if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
- (revoking & CEPH_CAP_FILE_BUFFER))
- queue_writeback = true;
- }
-
/* want more caps from mds? */
if (want & ~cap->mds_wanted) {
if (want & ~(cap->mds_wanted | cap->issued))
@@ -2803,7 +2799,7 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
* requested from the MDS.
*
* Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
- * or a negative error code. There are 3 speical error codes:
+ * or a negative error code. There are 3 special error codes:
* -EAGAIN: need to sleep but non-blocking is specified
* -EFBIG: ask caller to call check_max_size() and try again.
* -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
@@ -3066,10 +3062,13 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
flags, &_got);
WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
+#ifdef CONFIG_DEBUG_FS
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
+#endif
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+#ifdef CONFIG_DEBUG_FS
cw.ino = ceph_ino(inode);
cw.tgid = current->tgid;
cw.need = need;
@@ -3078,6 +3077,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
spin_lock(&mdsc->caps_list_lock);
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
+#endif
/* make sure used fmode not timeout */
ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
@@ -3096,9 +3096,11 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
remove_wait_queue(&ci->i_cap_wq, &wait);
ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
+#ifdef CONFIG_DEBUG_FS
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
spin_unlock(&mdsc->caps_list_lock);
+#endif
if (ret == -EAGAIN)
continue;
@@ -3215,7 +3217,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
enum put_cap_refs_mode {
PUT_CAP_REFS_SYNC = 0,
- PUT_CAP_REFS_NO_CHECK,
PUT_CAP_REFS_ASYNC,
};
@@ -3331,11 +3332,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
}
-void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
-{
- __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
-}
-
/*
* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
* context. Adjust per-snap dirty page accounting as appropriate.
@@ -3509,6 +3505,8 @@ static void handle_cap_grant(struct inode *inode,
bool queue_invalidate = false;
bool deleted_inode = false;
bool fill_inline = false;
+ bool revoke_wait = false;
+ int flags = 0;
/*
* If there is at least one crypto block then we'll trust
@@ -3704,16 +3702,18 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
ceph_cap_string(revoking));
if (S_ISREG(inode->i_mode) &&
- (revoking & used & CEPH_CAP_FILE_BUFFER))
+ (revoking & used & CEPH_CAP_FILE_BUFFER)) {
writeback = true; /* initiate writeback; will delay ack */
- else if (queue_invalidate &&
+ revoke_wait = true;
+ } else if (queue_invalidate &&
revoking == CEPH_CAP_FILE_CACHE &&
- (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
- ; /* do nothing yet, invalidation will be queued */
- else if (cap == ci->i_auth_cap)
+ (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) {
+ revoke_wait = true; /* do nothing yet, invalidation will be queued */
+ } else if (cap == ci->i_auth_cap) {
check_caps = 1; /* check auth cap only */
- else
+ } else {
check_caps = 2; /* check all caps */
+ }
/* If there is new caps, try to wake up the waiters */
if (~cap->issued & newcaps)
wake = true;
@@ -3740,8 +3740,9 @@ static void handle_cap_grant(struct inode *inode,
BUG_ON(cap->issued & ~cap->implemented);
/* don't let check_caps skip sending a response to MDS for revoke msgs */
- if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
+ if (!revoke_wait && le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
cap->mds_wanted = 0;
+ flags |= CHECK_CAPS_FLUSH_FORCE;
if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
else
@@ -3797,9 +3798,9 @@ static void handle_cap_grant(struct inode *inode,
mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
+ ceph_check_caps(ci, flags | CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
+ ceph_check_caps(ci, flags | CHECK_CAPS_NOINVAL);
}
/*
@@ -4070,23 +4071,22 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 t_cap_id;
- unsigned mseq = le32_to_cpu(ex->migrate_seq);
- unsigned t_seq, t_mseq;
+ u32 t_issue_seq, t_mseq;
int target, issued;
int mds = session->s_mds;
if (ph) {
t_cap_id = le64_to_cpu(ph->cap_id);
- t_seq = le32_to_cpu(ph->seq);
+ t_issue_seq = le32_to_cpu(ph->issue_seq);
t_mseq = le32_to_cpu(ph->mseq);
target = le32_to_cpu(ph->mds);
} else {
- t_cap_id = t_seq = t_mseq = 0;
+ t_cap_id = t_issue_seq = t_mseq = 0;
target = -1;
}
- doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
- inode, ceph_vinop(inode), ci, mds, mseq, target);
+ doutc(cl, " cap %llx.%llx export to peer %d piseq %u pmseq %u\n",
+ ceph_vinop(inode), target, t_issue_seq, t_mseq);
retry:
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
@@ -4119,12 +4119,12 @@ retry:
if (tcap) {
/* already have caps from the target */
if (tcap->cap_id == t_cap_id &&
- ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+ ceph_seq_cmp(tcap->seq, t_issue_seq) < 0) {
doutc(cl, " updating import cap %p mds%d\n", tcap,
target);
tcap->cap_id = t_cap_id;
- tcap->seq = t_seq - 1;
- tcap->issue_seq = t_seq - 1;
+ tcap->seq = t_issue_seq - 1;
+ tcap->issue_seq = t_issue_seq - 1;
tcap->issued |= issued;
tcap->implemented |= issued;
if (cap == ci->i_auth_cap) {
@@ -4135,11 +4135,11 @@ retry:
ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
- /* add placeholder for the export tagert */
+ /* add placeholder for the export target */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
- t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+ t_issue_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
ci->i_auth_cap == tcap) {
@@ -4213,18 +4213,22 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
u64 realmino = le64_to_cpu(im->realm);
u64 cap_id = le64_to_cpu(im->cap_id);
u64 p_cap_id;
+ u32 piseq = 0;
+ u32 pmseq = 0;
int peer;
if (ph) {
p_cap_id = le64_to_cpu(ph->cap_id);
peer = le32_to_cpu(ph->mds);
+ piseq = le32_to_cpu(ph->issue_seq);
+ pmseq = le32_to_cpu(ph->mseq);
} else {
p_cap_id = 0;
peer = -1;
}
- doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
- inode, ceph_vinop(inode), ci, mds, mseq, peer);
+ doutc(cl, " cap %llx.%llx import from peer %d piseq %u pmseq %u\n",
+ ceph_vinop(inode), peer, piseq, pmseq);
retry:
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -4253,15 +4257,13 @@ retry:
doutc(cl, " remove export cap %p mds%d flags %d\n",
ocap, peer, ph->flags);
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
- (ocap->seq != le32_to_cpu(ph->seq) ||
- ocap->mseq != le32_to_cpu(ph->mseq))) {
+ (ocap->seq != piseq ||
+ ocap->mseq != pmseq)) {
pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
"%p %llx.%llx mds%d seq %d mseq %d"
" importer mds%d has peer seq %d mseq %d\n",
inode, ceph_vinop(inode), peer,
- ocap->seq, ocap->mseq, mds,
- le32_to_cpu(ph->seq),
- le32_to_cpu(ph->mseq));
+ ocap->seq, ocap->mseq, mds, piseq, pmseq);
}
ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
@@ -4335,7 +4337,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_snap_realm *realm = NULL;
int op;
int msg_version = le16_to_cpu(msg->hdr.version);
- u32 seq, mseq;
+ u32 seq, mseq, issue_seq;
struct ceph_vino vino;
void *snaptrace;
size_t snaptrace_len;
@@ -4345,8 +4347,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
bool close_sessions = false;
bool do_cap_release = false;
- doutc(cl, "from mds%d\n", session->s_mds);
-
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4360,6 +4360,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
vino.snap = CEPH_NOSNAP;
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
+ issue_seq = le32_to_cpu(h->issue_seq);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -4447,12 +4448,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
- vino.ino, vino.snap, inode);
+ doutc(cl, " caps mds%d op %s ino %llx.%llx inode %p seq %u iseq %u mseq %u\n",
+ session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode,
+ seq, issue_seq, mseq);
mutex_lock(&session->s_mutex);
- doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
- session->s_seq, (unsigned)seq);
if (!inode) {
doutc(cl, " i don't have ino %llx\n", vino.ino);
@@ -4588,7 +4588,7 @@ flush_cap_releases:
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
goto done;
bad:
@@ -4639,6 +4639,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
+
+ /*
+ * Make sure too many dirty caps or general
+ * slowness doesn't block mdsc delayed work,
+ * preventing send_renew_caps() from running.
+ */
+ if (time_after_eq(jiffies, loop_start + 5 * HZ))
+ break;
}
spin_unlock(&mdsc->cap_delay_lock);
doutc(cl, "done\n");
@@ -4679,6 +4687,28 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
+/*
+ * Flush all cap releases to the mds
+ */
+static void flush_cap_releases(struct ceph_mds_session *s)
+{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&s->s_cap_lock);
+ if (s->s_num_cap_releases)
+ ceph_flush_session_cap_releases(mdsc, s);
+ spin_unlock(&s->s_cap_lock);
+ doutc(cl, "done\n");
+
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true);
+}
+
void __ceph_touch_fmode(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc, int fmode)
{
@@ -4777,7 +4807,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_fs_client(inode)->mdsc;
- __cap_delay_requeue_front(mdsc, ci);
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add_tail(&ci->i_cap_delay_list,
+ &mdsc->cap_unlink_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+
+ /*
+ * Fire the work immediately, because the MDS maybe
+ * waiting for caps release.
+ */
+ ceph_queue_cap_unlink_work(mdsc);
}
}
spin_unlock(&ci->i_ceph_lock);