From f1a79365a7416c5046f88d0db025e1d84c32b252 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sun, 18 Nov 2018 21:35:59 -0500
Subject: bcachefs: Don't block on journal reservation with btree locks held

Fixes a deadlock between the allocator thread, when it first starts up,
and journal replay

Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_update_interior.c |  6 +--
 fs/bcachefs/btree_update_leaf.c     | 38 ++++++++++++-----
 fs/bcachefs/journal.c               | 84 ++++++++++++++++++++++++-------------
 fs/bcachefs/journal.h               | 39 +++++++++--------
 4 files changed, 108 insertions(+), 59 deletions(-)

diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 01e476d72595..af31819c88c7 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -638,12 +638,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
 	int ret;
 
 	ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-	if (ret < 0)
-		goto err;
-	if (!ret) {
+	if (ret == -EAGAIN) {
 		continue_at(cl, btree_update_wait_on_journal, system_wq);
 		return;
 	}
+	if (ret < 0)
+		goto err;
 
 	bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
 err:
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 093e480977c7..41691bebf679 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -344,19 +344,35 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
 	trans_for_each_entry(trans, i)
 		BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
-	u64s = 0;
-	trans_for_each_entry(trans, i)
-		u64s += jset_u64s(i->k->k.u64s);
-
 	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-	ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
-		? bch2_journal_res_get(&c->journal,
-				       &trans->journal_res,
-				       u64s, u64s)
-		: 0;
-	if (ret)
-		return ret;
+	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+		u64s = 0;
+		trans_for_each_entry(trans, i)
+			u64s += jset_u64s(i->k->k.u64s);
+
+		while ((ret = bch2_journal_res_get(&c->journal,
+					&trans->journal_res, u64s,
+					JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
+			struct btree_iter *iter = trans->entries[0].iter;
+
+			bch2_btree_iter_unlock(iter);
+
+			ret = bch2_journal_res_get(&c->journal,
+					&trans->journal_res, u64s,
+					JOURNAL_RES_GET_CHECK);
+			if (ret)
+				return ret;
+
+			if (!bch2_btree_iter_relock(iter)) {
+				trans_restart(" (iter relock after journal res get blocked)");
+				return -EINTR;
+			}
+		}
+
+		if (ret)
+			return ret;
+	}
 
 	multi_lock_write(c, trans);
 
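
The btree_update_leaf.c hunk above is the heart of the fix: journal
reservations are now taken with JOURNAL_RES_GET_NONBLOCK while btree locks
are held, and the transaction only sleeps (via JOURNAL_RES_GET_CHECK) after
those locks have been dropped, relocking and retrying afterwards. What
follows is a minimal userspace sketch of that rule, using pthreads and
invented names in place of the btree locks and the journal; it illustrates
the pattern, it is not bcachefs code:

/* build with: cc -o sketch sketch.c -lpthread */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t btree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t j_lock	  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  j_wait	  = PTHREAD_COND_INITIALIZER;
static int journal_space;	/* free u64s in the open journal entry */

/* the JOURNAL_RES_GET_NONBLOCK case: fail with -EAGAIN instead of sleeping */
static int res_get_nonblock(int u64s)
{
	int ret = -EAGAIN;

	pthread_mutex_lock(&j_lock);
	if (journal_space >= u64s) {
		journal_space -= u64s;
		ret = 0;
	}
	pthread_mutex_unlock(&j_lock);
	return ret;
}

/*
 * the JOURNAL_RES_GET_CHECK case: sleep until the reservation would
 * succeed, but don't take it - only safe because no other locks are held
 */
static void res_get_check(int u64s)
{
	pthread_mutex_lock(&j_lock);
	while (journal_space < u64s)
		pthread_cond_wait(&j_wait, &j_lock);
	pthread_mutex_unlock(&j_lock);
}

static void btree_insert(int u64s)
{
	pthread_mutex_lock(&btree_lock);

	while (res_get_nonblock(u64s) == -EAGAIN) {
		pthread_mutex_unlock(&btree_lock);	/* drop before sleeping */
		res_get_check(u64s);
		pthread_mutex_lock(&btree_lock);	/* retake, then retry */
	}

	/* ... do the insert under btree_lock, then put the reservation ... */
	pthread_mutex_unlock(&btree_lock);
}

static void *open_new_entry(void *arg)
{
	sleep(1);			/* the journal write completes... */
	pthread_mutex_lock(&j_lock);
	journal_space = 512;		/* ...and space opens up */
	pthread_cond_broadcast(&j_wait);
	pthread_mutex_unlock(&j_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, open_new_entry, NULL);
	btree_insert(16);	/* sleeps, but never while holding btree_lock */
	pthread_join(t, NULL);
	printf("insert done\n");
	return 0;
}

The one simplification worth flagging: pthread_mutex_lock() always
succeeds, whereas bch2_btree_iter_relock() can fail, in which case the
patch restarts the whole transaction with -EINTR instead of retrying.
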
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index b4d037664628..5db0a469ac24 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -335,15 +335,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 }
 
 static int __journal_res_get(struct journal *j, struct journal_res *res,
-			     unsigned u64s_min, unsigned u64s_max)
+			     unsigned flags)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_buf *buf;
 	int ret;
 retry:
-	ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-	if (ret)
-		return ret;
+	if (journal_res_get_fast(j, res, flags))
+		return 0;
 
 	spin_lock(&j->lock);
 	/*
@@ -351,10 +350,9 @@ retry:
 	 * that just did journal_entry_open() and call journal_entry_close()
 	 * unnecessarily
 	 */
-	ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-	if (ret) {
+	if (journal_res_get_fast(j, res, flags)) {
 		spin_unlock(&j->lock);
-		return 1;
+		return 0;
 	}
 
 	/*
@@ -377,7 +375,12 @@ retry:
 		spin_unlock(&j->lock);
 		return -EROFS;
 	case JOURNAL_ENTRY_INUSE:
-		/* haven't finished writing out the previous one: */
+		/*
+		 * The current journal entry is still open, but we failed to get
+		 * a journal reservation because there's not enough space in it,
+		 * and we can't close it and start another because we haven't
+		 * finished writing out the previous entry:
+		 */
 		spin_unlock(&j->lock);
 		trace_journal_entry_full(c);
 		goto blocked;
@@ -408,7 +411,7 @@ retry:
 blocked:
 	if (!j->res_get_blocked_start)
 		j->res_get_blocked_start = local_clock() ?: 1;
-	return 0;
+	return -EAGAIN;
 }
 
 /*
@@ -422,14 +425,14 @@ blocked:
  * btree node write locks.
  */
 int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
-				  unsigned u64s_min, unsigned u64s_max)
+				  unsigned flags)
 {
 	int ret;
 
 	wait_event(j->wait,
-		   (ret = __journal_res_get(j, res, u64s_min,
-					    u64s_max)));
-	return ret < 0 ? ret : 0;
+		   (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
+		   (flags & JOURNAL_RES_GET_NONBLOCK));
+	return ret;
 }
 
 u64 bch2_journal_last_unwritten_seq(struct journal *j)
@@ -453,28 +456,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
  * btree root - every journal entry contains the roots of all the btrees, so it
  * doesn't need to bother with getting a journal reservation
  */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
-	int ret;
-
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	bool need_reclaim = false;
+retry:
 	spin_lock(&j->lock);
-	BUG_ON(seq > journal_cur_seq(j));
 
 	if (seq < journal_cur_seq(j) ||
 	    journal_entry_is_open(j)) {
 		spin_unlock(&j->lock);
-		return 1;
+		return 0;
+	}
+
+	if (journal_cur_seq(j) < seq) {
+		switch (journal_buf_switch(j, false)) {
+		case JOURNAL_ENTRY_ERROR:
+			spin_unlock(&j->lock);
+			return -EROFS;
+		case JOURNAL_ENTRY_INUSE:
+			/* haven't finished writing out the previous one: */
+			trace_journal_entry_full(c);
+			goto blocked;
+		case JOURNAL_ENTRY_CLOSED:
+			break;
+		case JOURNAL_UNLOCKED:
+			goto retry;
+		}
+	}
+
+	BUG_ON(journal_cur_seq(j) < seq);
+
+	if (!journal_entry_open(j)) {
+		need_reclaim = true;
+		goto blocked;
 	}
 
-	ret = journal_entry_open(j);
-	if (!ret)
-		closure_wait(&j->async_wait, parent);
 	spin_unlock(&j->lock);
 
-	if (!ret)
-		bch2_journal_reclaim_work(&j->reclaim_work.work);
+	return 0;
+blocked:
+	if (!j->res_get_blocked_start)
+		j->res_get_blocked_start = local_clock() ?: 1;
 
-	return ret;
+	closure_wait(&j->async_wait, cl);
+	spin_unlock(&j->lock);
+
+	if (need_reclaim)
+		bch2_journal_reclaim_work(&j->reclaim_work.work);
+	return -EAGAIN;
 }
 
 static int journal_seq_error(struct journal *j, u64 seq)
@@ -594,11 +624,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 {
 	struct journal_res res;
-	unsigned u64s = jset_u64s(0);
 
 	memset(&res, 0, sizeof(res));
 
-	bch2_journal_res_get(j, &res, u64s, u64s);
+	bch2_journal_res_get(j, &res, jset_u64s(0), 0);
 	bch2_journal_res_put(j, &res);
 
 	bch2_journal_flush_seq_async(j, res.seq, parent);
@@ -607,12 +636,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 int bch2_journal_meta(struct journal *j)
 {
 	struct journal_res res;
-	unsigned u64s = jset_u64s(0);
 	int ret;
 
 	memset(&res, 0, sizeof(res));
 
-	ret = bch2_journal_res_get(j, &res, u64s, u64s);
+	ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
 	if (ret)
 		return ret;
 
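
bch2_journal_open_seq_async() covers the callers that cannot sleep at all:
instead of blocking, it parks the caller's closure on j->async_wait while
still holding j->lock and returns -EAGAIN, and btree_update_wait_on_journal()
(first hunk of the patch) responds by re-arming itself with continue_at().
Here is a toy model of that continuation-parking protocol, with a hand-rolled
list and callback type rather than the kernel's closure machinery, and all
names invented:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct closure {
	void (*fn)(struct closure *);
	struct closure *next;
};

static pthread_mutex_t j_lock = PTHREAD_MUTEX_INITIALIZER;
static struct closure *async_wait;	/* stand-in for j->async_wait */
static int entry_open;			/* is a journal entry open? */

static int open_seq_async(struct closure *cl)
{
	int ret = 0;

	pthread_mutex_lock(&j_lock);
	if (!entry_open) {
		/*
		 * park the continuation *before* dropping the lock: a
		 * concurrent wakeup takes the same lock, so it either sees
		 * us on the list or we see the entry it opened
		 */
		cl->next = async_wait;
		async_wait = cl;
		ret = -EAGAIN;
	}
	pthread_mutex_unlock(&j_lock);
	return ret;
}

/* called when a new journal entry is opened: re-run the parked closures */
static void journal_entry_opened(void)
{
	struct closure *cl, *list;

	pthread_mutex_lock(&j_lock);
	entry_open = 1;
	list = async_wait;
	async_wait = NULL;
	pthread_mutex_unlock(&j_lock);

	while ((cl = list) != NULL) {
		list = cl->next;
		cl->fn(cl);	/* the real code queues these to a workqueue */
	}
}

static void wait_on_journal(struct closure *cl)
{
	if (open_seq_async(cl) == -EAGAIN)
		return;		/* parked; we'll be re-run on wakeup */
	printf("entry open, flushing\n");
}

int main(void)
{
	struct closure cl = { .fn = wait_on_journal };

	wait_on_journal(&cl);		/* no entry open yet: parks */
	journal_entry_opened();		/* wakes and re-runs the closure */
	return 0;
}

Registering under the same lock the waker takes is what makes the -EAGAIN
return race-free; the patch gets the same guarantee by calling
closure_wait() before dropping j->lock.
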
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 77cf39cc64ff..d9c094ba2ca0 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -272,12 +272,14 @@ static inline void bch2_journal_res_put(struct journal *j,
 }
 
 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
-				  unsigned, unsigned);
+				  unsigned);
+
+#define JOURNAL_RES_GET_NONBLOCK	(1 << 0)
+#define JOURNAL_RES_GET_CHECK		(1 << 1)
 
 static inline int journal_res_get_fast(struct journal *j,
 				       struct journal_res *res,
-				       unsigned u64s_min,
-				       unsigned u64s_max)
+				       unsigned flags)
 {
 	union journal_res_state old, new;
 	u64 v = atomic64_read(&j->reservations.counter);
@@ -289,42 +291,45 @@ static inline int journal_res_get_fast(struct journal *j,
 		 * Check if there is still room in the current journal
 		 * entry:
 		 */
-		if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+		if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
 			return 0;
 
-		res->offset	= old.cur_entry_offset;
-		res->u64s	= min(u64s_max, j->cur_entry_u64s -
-				      old.cur_entry_offset);
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
 
-		journal_state_inc(&new);
 		new.cur_entry_offset += res->u64s;
+		journal_state_inc(&new);
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
-	res->ref = true;
-	res->idx = new.idx;
-	res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+	res->ref	= true;
+	res->idx	= old.idx;
+	res->offset	= old.cur_entry_offset;
+	res->seq	= le64_to_cpu(j->buf[old.idx].data->seq);
 	return 1;
 }
 
 static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
-				       unsigned u64s_min, unsigned u64s_max)
+				       unsigned u64s, unsigned flags)
 {
 	int ret;
 
 	EBUG_ON(res->ref);
-	EBUG_ON(u64s_max < u64s_min);
 	EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
 
-	if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+	res->u64s = u64s;
+
+	if (journal_res_get_fast(j, res, flags))
 		goto out;
 
-	ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+	ret = bch2_journal_res_get_slowpath(j, res, flags);
 	if (ret)
 		return ret;
 out:
-	lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
-	EBUG_ON(!res->ref);
+	if (!(flags & JOURNAL_RES_GET_CHECK)) {
+		lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+		EBUG_ON(!res->ref);
+	}
 	return 0;
 }
 
--
cgit
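
What makes the NONBLOCK/CHECK split cheap is visible in
journal_res_get_fast() above: the whole reservation state is packed into one
64-bit word updated by atomic64_cmpxchg(), and JOURNAL_RES_GET_CHECK just
answers "would this fit?" and returns before the cmpxchg. A self-contained
sketch of that fast path using C11 atomics; the packed layout (offset in the
low bits, a reference count above it) and every name here are simplified
stand-ins for bcachefs's journal_res_state, not its real layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RES_GET_CHECK		(1 << 1)

/* low 32 bits: offset into the open entry; high 32 bits: reference count */
static _Atomic uint64_t reservations;
static const uint32_t cur_entry_u64s = 512;	/* capacity of the open entry */

struct res {
	uint32_t	offset;
	uint32_t	u64s;
};

static bool res_get_fast(struct res *res, unsigned flags)
{
	uint64_t old = atomic_load(&reservations), new;
	uint32_t offset;

	do {
		offset = (uint32_t)old;

		/* is there still room in the current entry? */
		if (offset + res->u64s > cur_entry_u64s)
			return false;

		/* CHECK: report that it would succeed, but take nothing */
		if (flags & RES_GET_CHECK)
			return true;

		new = old
			+ ((uint64_t)1 << 32)	/* take a reference */
			+ res->u64s;		/* bump the offset */
	} while (!atomic_compare_exchange_weak(&reservations, &old, new));

	/* only the cmpxchg winner fills out the reservation, from 'old': */
	res->offset = offset;
	return true;
}

int main(void)
{
	struct res res = { .u64s = 16 };

	while (res_get_fast(&res, 0))
		printf("reserved %u u64s at offset %u\n", res.u64s, res.offset);
	printf("entry full\n");
	return 0;
}

Note how the reservation is filled out only after the loop, from the value
the successful cmpxchg observed - which is why the patch moves the
res->offset and res->idx assignments below the loop and reads them from old
rather than new.
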