summaryrefslogtreecommitdiff
path: root/fs/bcachefs/journal_io.c
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-11-02 18:57:19 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2024-01-01 11:47:41 -0500
commit09caeabe1a5daa3b667b797c401d43206d583f23 (patch)
tree38285367be9d45f8ecbbe12f233744548d2ec455 /fs/bcachefs/journal_io.c
parentb05c0e9370bec71c62df690250f451e58e8ed2a4 (diff)
bcachefs: btree write buffer now slurps keys from journal
Previosuly, the transaction commit path would have to add keys to the btree write buffer as a separate operation, requiring additional global synchronization. This patch introduces a new journal entry type, which indicates that the keys need to be copied into the btree write buffer prior to being written out. We switch the journal entry type back to JSET_ENTRY_btree_keys prior to write, so this is not an on disk format change. Flushing the btree write buffer may require pulling keys out of journal entries yet to be written, and quiescing outstanding journal reservations; we previously added journal->buf_lock for synchronization with the journal write path. We also can't put strict bounds on the number of keys in the journal destined for the write buffer, which means we might overflow the size of the preallocated buffer and have to reallocate - this introduces a potentially fatal memory allocation failure. This is something we'll have to watch for, if it becomes an issue in practice we can do additional mitigation. The transaction commit path no longer has to explicitly check if the write buffer is full and wait on flushing; this is another performance optimization. Instead, when the btree write buffer is close to full we change the journal watermark, so that only reservations for journal reclaim are allowed. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs/journal_io.c')
-rw-r--r--fs/bcachefs/journal_io.c58
1 files changed, 53 insertions, 5 deletions
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index afcb2a435956..4d8e10c901a8 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -723,6 +724,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -1503,6 +1520,8 @@ done:
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@@ -1510,6 +1529,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1703,9 +1727,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
- bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@@ -1733,9 +1759,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
- if (i->type == BCH_JSET_ENTRY_btree_root) {
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ struct bkey_i *k;
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
}
/* Can we merge with previous entry? */
@@ -1758,6 +1803,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
+ if (wb.wb)
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ w->need_flush_to_write_buffer = false;
+
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
@@ -1765,8 +1814,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
- bch2_journal_super_entries_add_common(c, &end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1789,7 +1837,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = le64_to_cpu(jset->seq);
+ j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;